aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2009-09-08 20:55:21 -0400
committerDan Williams <dan.j.williams@intel.com>2009-09-08 20:55:21 -0400
commitbbb20089a3275a19e475dbc21320c3742e3ca423 (patch)
tree216fdc1cbef450ca688135c5b8969169482d9a48 /Documentation
parent3e48e656903e9fd8bc805c6a2c4264d7808d315b (diff)
parent657a77fa7284d8ae28dfa48f1dc5d919bf5b2843 (diff)
Merge branch 'dmaengine' into async-tx-next
Conflicts: crypto/async_tx/async_xor.c drivers/dma/ioat/dma_v2.h drivers/dma/ioat/pci.c drivers/md/raid5.c
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/testing/sysfs-block59
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci7
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci-devices-cciss33
-rw-r--r--Documentation/ABI/testing/sysfs-class-mtd125
-rw-r--r--Documentation/ABI/testing/sysfs-devices-cache_disable18
-rw-r--r--Documentation/ABI/testing/sysfs-fs-ext410
-rw-r--r--Documentation/ABI/testing/sysfs-pps73
-rw-r--r--Documentation/Changes26
-rw-r--r--Documentation/CodingStyle4
-rw-r--r--Documentation/DMA-API.txt16
-rw-r--r--Documentation/DocBook/Makefile3
-rw-r--r--Documentation/DocBook/debugobjects.tmpl2
-rw-r--r--Documentation/DocBook/mac80211.tmpl1
-rw-r--r--Documentation/DocBook/tracepoint.tmpl89
-rw-r--r--Documentation/PCI/pcieaer-howto.txt25
-rw-r--r--Documentation/RCU/rculist_nulls.txt2
-rw-r--r--Documentation/RCU/trace.txt102
-rw-r--r--Documentation/SM501.txt2
-rw-r--r--Documentation/Smack.txt20
-rw-r--r--Documentation/SubmitChecklist2
-rw-r--r--Documentation/SubmittingPatches82
-rw-r--r--Documentation/accounting/getdelays.c3
-rw-r--r--Documentation/arm/Samsung-S3C24XX/GPIO.txt10
-rw-r--r--Documentation/atomic_ops.txt4
-rw-r--r--Documentation/block/biodoc.txt2
-rw-r--r--Documentation/block/deadline-iosched.txt2
-rw-r--r--Documentation/braille-console.txt2
-rw-r--r--Documentation/cdrom/packet-writing.txt2
-rw-r--r--Documentation/cgroups/memory.txt16
-rw-r--r--Documentation/connector/cn_test.c7
-rw-r--r--Documentation/cpu-freq/cpu-drivers.txt2
-rw-r--r--Documentation/cpu-freq/governors.txt26
-rw-r--r--Documentation/cpu-freq/user-guide.txt1
-rw-r--r--Documentation/dell_rbu.txt4
-rw-r--r--Documentation/development-process/5.Posting31
-rw-r--r--Documentation/device-mapper/dm-log.txt54
-rw-r--r--Documentation/device-mapper/dm-queue-length.txt39
-rw-r--r--Documentation/device-mapper/dm-service-time.txt91
-rw-r--r--Documentation/driver-model/device.txt32
-rw-r--r--Documentation/driver-model/devres.txt2
-rw-r--r--Documentation/dvb/get_dvb_firmware8
-rw-r--r--Documentation/edac.txt8
-rw-r--r--Documentation/fault-injection/fault-injection.txt70
-rw-r--r--Documentation/fb/sh7760fb.txt2
-rw-r--r--Documentation/fb/vesafb.txt2
-rw-r--r--Documentation/feature-removal-schedule.txt41
-rw-r--r--Documentation/filesystems/00-INDEX4
-rw-r--r--Documentation/filesystems/Locking45
-rw-r--r--Documentation/filesystems/autofs4-mount-control.txt2
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt2
-rw-r--r--Documentation/filesystems/debugfs.txt158
-rw-r--r--Documentation/filesystems/ext2.txt2
-rw-r--r--Documentation/filesystems/ext4.txt10
-rw-r--r--Documentation/filesystems/fiemap.txt2
-rw-r--r--Documentation/filesystems/gfs2-glocks.txt2
-rw-r--r--Documentation/filesystems/gfs2.txt19
-rw-r--r--Documentation/filesystems/isofs.txt9
-rw-r--r--Documentation/filesystems/nfs-rdma.txt2
-rw-r--r--Documentation/filesystems/nilfs2.txt5
-rw-r--r--Documentation/filesystems/proc.txt287
-rw-r--r--Documentation/filesystems/sysfs-pci.txt2
-rw-r--r--Documentation/filesystems/vfat.txt13
-rw-r--r--Documentation/firmware_class/README3
-rw-r--r--Documentation/futex-requeue-pi.txt131
-rw-r--r--Documentation/gcov.txt246
-rw-r--r--Documentation/gpio.txt2
-rw-r--r--Documentation/hwmon/f71882fg12
-rw-r--r--Documentation/hwmon/ibmaem2
-rw-r--r--Documentation/hwmon/sysfs-interface19
-rw-r--r--Documentation/hwmon/tmp40142
-rw-r--r--Documentation/hwmon/w83627ehf11
-rw-r--r--Documentation/i2c/busses/i2c-ocores17
-rw-r--r--Documentation/i2c/busses/i2c-viapro4
-rw-r--r--Documentation/i2c/instantiating-devices44
-rw-r--r--Documentation/i2c/writing-clients16
-rw-r--r--Documentation/ide/ide.txt2
-rw-r--r--Documentation/input/input.txt2
-rw-r--r--Documentation/input/rotary-encoder.txt9
-rw-r--r--Documentation/ioctl/ioctl-number.txt2
-rw-r--r--Documentation/isdn/00-INDEX44
-rw-r--r--Documentation/isdn/INTERFACE.CAPI94
-rw-r--r--Documentation/isdn/README.gigaset42
-rw-r--r--Documentation/ja_JP/SubmitChecklist2
-rw-r--r--Documentation/kbuild/kconfig.txt116
-rw-r--r--Documentation/kbuild/modules.txt2
-rw-r--r--Documentation/kdump/kdump.txt4
-rw-r--r--Documentation/kernel-parameters.txt138
-rw-r--r--Documentation/kmemcheck.txt773
-rw-r--r--Documentation/kmemleak.txt142
-rw-r--r--Documentation/kobject.txt2
-rw-r--r--Documentation/kprobes.txt6
-rw-r--r--Documentation/laptops/acer-wmi.txt2
-rw-r--r--Documentation/laptops/sony-laptop.txt2
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt49
-rw-r--r--Documentation/lguest/Makefile3
-rw-r--r--Documentation/lguest/lguest.c1008
-rw-r--r--Documentation/lguest/lguest.txt1
-rw-r--r--Documentation/local_ops.txt2
-rw-r--r--Documentation/memory-barriers.txt129
-rw-r--r--Documentation/memory-hotplug.txt8
-rw-r--r--Documentation/mn10300/ABI.txt2
-rw-r--r--Documentation/mtd/nand_ecc.txt12
-rw-r--r--Documentation/networking/bonding.txt6
-rw-r--r--Documentation/networking/can.txt237
-rw-r--r--Documentation/networking/dm9000.txt2
-rw-r--r--Documentation/networking/ieee802154.txt76
-rw-r--r--Documentation/networking/ip-sysctl.txt18
-rw-r--r--Documentation/networking/ipv6.txt37
-rw-r--r--Documentation/networking/l2tp.txt2
-rw-r--r--Documentation/networking/mac80211-injection.txt28
-rw-r--r--Documentation/networking/netdevices.txt2
-rw-r--r--Documentation/networking/operstates.txt3
-rw-r--r--Documentation/networking/packet_mmap.txt140
-rw-r--r--Documentation/networking/phonet.txt2
-rw-r--r--Documentation/networking/regulatory.txt2
-rw-r--r--Documentation/power/devices.txt34
-rw-r--r--Documentation/power/regulator/consumer.txt2
-rw-r--r--Documentation/power/regulator/overview.txt2
-rw-r--r--Documentation/power/s2ram.txt2
-rw-r--r--Documentation/power/userland-swsusp.txt2
-rw-r--r--Documentation/powerpc/booting-without-of.txt4
-rw-r--r--Documentation/powerpc/dts-bindings/can/sja1000.txt53
-rw-r--r--Documentation/powerpc/dts-bindings/ecm.txt64
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/board.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt3
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/esdhc.txt7
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/mcm.txt64
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/msi-pic.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/pmc.txt4
-rw-r--r--Documentation/powerpc/qe_firmware.txt2
-rw-r--r--Documentation/pps/pps.txt172
-rw-r--r--Documentation/rbtree.txt10
-rw-r--r--Documentation/rfkill.txt640
-rw-r--r--Documentation/robust-futex-ABI.txt4
-rw-r--r--Documentation/s390/Debugging390.txt4
-rw-r--r--Documentation/scheduler/sched-nice-design.txt2
-rw-r--r--Documentation/scheduler/sched-rt-group.txt20
-rw-r--r--Documentation/scsi/aic79xx.txt2
-rw-r--r--Documentation/scsi/ncr53c8xx.txt4
-rw-r--r--Documentation/scsi/scsi_fc_transport.txt14
-rw-r--r--Documentation/scsi/scsi_mid_low_api.txt5
-rw-r--r--Documentation/scsi/sym53c8xx_2.txt2
-rw-r--r--Documentation/sound/alsa/ALSA-Configuration.txt38
-rw-r--r--Documentation/sound/alsa/HD-Audio-Models.txt19
-rw-r--r--Documentation/sound/alsa/HD-Audio.txt2
-rw-r--r--Documentation/sound/alsa/Procfile.txt36
-rw-r--r--Documentation/sound/alsa/README.maya44163
-rw-r--r--Documentation/sound/alsa/hda_codec.txt2
-rw-r--r--Documentation/sound/alsa/soc/dapm.txt1
-rw-r--r--Documentation/sysctl/kernel.txt11
-rw-r--r--Documentation/sysctl/vm.txt27
-rw-r--r--Documentation/timers/hpet.txt2
-rw-r--r--Documentation/timers/timer_stats.txt2
-rw-r--r--Documentation/trace/events.txt90
-rw-r--r--Documentation/trace/ftrace.txt252
-rw-r--r--Documentation/trace/kmemtrace.txt2
-rw-r--r--Documentation/trace/mmiotrace.txt26
-rw-r--r--Documentation/trace/power.txt17
-rw-r--r--Documentation/usb/WUSB-Design-overview.txt8
-rw-r--r--Documentation/usb/anchors.txt4
-rw-r--r--Documentation/usb/callbacks.txt2
-rw-r--r--Documentation/video4linux/CARDLIST.cx238855
-rw-r--r--Documentation/video4linux/CARDLIST.cx888
-rw-r--r--Documentation/video4linux/CARDLIST.em28xx7
-rw-r--r--Documentation/video4linux/CARDLIST.saa713422
-rw-r--r--Documentation/video4linux/CARDLIST.tuner2
-rw-r--r--Documentation/video4linux/cx18.txt2
-rw-r--r--Documentation/video4linux/gspca.txt12
-rw-r--r--Documentation/video4linux/pxa_camera.txt49
-rw-r--r--Documentation/video4linux/v4l2-framework.txt29
-rw-r--r--Documentation/vm/Makefile2
-rw-r--r--Documentation/vm/balance18
-rw-r--r--Documentation/vm/page-types.c698
-rw-r--r--Documentation/vm/pagemap.txt68
-rw-r--r--Documentation/watchdog/hpwdt.txt95
-rw-r--r--Documentation/x86/boot.txt122
-rw-r--r--Documentation/x86/x86_64/boot-options.txt49
-rw-r--r--Documentation/x86/x86_64/machinecheck8
-rw-r--r--Documentation/x86/x86_64/mm.txt9
181 files changed, 6485 insertions, 1982 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 44f52a4f5903..cbbd3e069945 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -60,3 +60,62 @@ Description:
60 Indicates whether the block layer should automatically 60 Indicates whether the block layer should automatically
61 generate checksums for write requests bound for 61 generate checksums for write requests bound for
62 devices that support receiving integrity metadata. 62 devices that support receiving integrity metadata.
63
64What: /sys/block/<disk>/alignment_offset
65Date: April 2009
66Contact: Martin K. Petersen <martin.petersen@oracle.com>
67Description:
68 Storage devices may report a physical block size that is
69 bigger than the logical block size (for instance a drive
70 with 4KB physical sectors exposing 512-byte logical
71 blocks to the operating system). This parameter
72 indicates how many bytes the beginning of the device is
73 offset from the disk's natural alignment.
74
75What: /sys/block/<disk>/<partition>/alignment_offset
76Date: April 2009
77Contact: Martin K. Petersen <martin.petersen@oracle.com>
78Description:
79 Storage devices may report a physical block size that is
80 bigger than the logical block size (for instance a drive
81 with 4KB physical sectors exposing 512-byte logical
82 blocks to the operating system). This parameter
83 indicates how many bytes the beginning of the partition
84 is offset from the disk's natural alignment.
85
86What: /sys/block/<disk>/queue/logical_block_size
87Date: May 2009
88Contact: Martin K. Petersen <martin.petersen@oracle.com>
89Description:
90 This is the smallest unit the storage device can
91 address. It is typically 512 bytes.
92
93What: /sys/block/<disk>/queue/physical_block_size
94Date: May 2009
95Contact: Martin K. Petersen <martin.petersen@oracle.com>
96Description:
97 This is the smallest unit the storage device can write
98 without resorting to read-modify-write operation. It is
99 usually the same as the logical block size but may be
100 bigger. One example is SATA drives with 4KB sectors
101 that expose a 512-byte logical block size to the
102 operating system.
103
104What: /sys/block/<disk>/queue/minimum_io_size
105Date: April 2009
106Contact: Martin K. Petersen <martin.petersen@oracle.com>
107Description:
108 Storage devices may report a preferred minimum I/O size,
109 which is the smallest request the device can perform
110 without incurring a read-modify-write penalty. For disk
111 drives this is often the physical block size. For RAID
112 arrays it is often the stripe chunk size.
113
114What: /sys/block/<disk>/queue/optimal_io_size
115Date: April 2009
116Contact: Martin K. Petersen <martin.petersen@oracle.com>
117Description:
118 Storage devices may report an optimal I/O size, which is
119 the device's preferred unit of receiving I/O. This is
120 rarely reported for disk drives. For RAID devices it is
121 usually the stripe width or the internal block size.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 97ad190e13af..6bf68053e4b8 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -122,3 +122,10 @@ Description:
122 This symbolic link appears when a device is a Virtual Function. 122 This symbolic link appears when a device is a Virtual Function.
123 The symbolic link points to the PCI device sysfs entry of the 123 The symbolic link points to the PCI device sysfs entry of the
124 Physical Function this device associates with. 124 Physical Function this device associates with.
125
126What: /sys/bus/pci/slots/.../module
127Date: June 2009
128Contact: linux-pci@vger.kernel.org
129Description:
130 This symbolic link points to the PCI hotplug controller driver
131 module that manages the hotplug slot.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
new file mode 100644
index 000000000000..0a92a7c93a62
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
@@ -0,0 +1,33 @@
1Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/model
2Date: March 2009
3Kernel Version: 2.6.30
4Contact: iss_storagedev@hp.com
5Description: Displays the SCSI INQUIRY page 0 model for logical drive
6 Y of controller X.
7
8Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/rev
9Date: March 2009
10Kernel Version: 2.6.30
11Contact: iss_storagedev@hp.com
12Description: Displays the SCSI INQUIRY page 0 revision for logical
13 drive Y of controller X.
14
15Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/unique_id
16Date: March 2009
17Kernel Version: 2.6.30
18Contact: iss_storagedev@hp.com
19Description: Displays the SCSI INQUIRY page 83 serial number for logical
20 drive Y of controller X.
21
22Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/vendor
23Date: March 2009
24Kernel Version: 2.6.30
25Contact: iss_storagedev@hp.com
26Description: Displays the SCSI INQUIRY page 0 vendor for logical drive
27 Y of controller X.
28
29Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY
30Date: March 2009
31Kernel Version: 2.6.30
32Contact: iss_storagedev@hp.com
33Description: A symbolic link to /sys/block/cciss!cXdY
diff --git a/Documentation/ABI/testing/sysfs-class-mtd b/Documentation/ABI/testing/sysfs-class-mtd
new file mode 100644
index 000000000000..4d55a1888981
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-mtd
@@ -0,0 +1,125 @@
1What: /sys/class/mtd/
2Date: April 2009
3KernelVersion: 2.6.29
4Contact: linux-mtd@lists.infradead.org
5Description:
6 The mtd/ class subdirectory belongs to the MTD subsystem
7 (MTD core).
8
9What: /sys/class/mtd/mtdX/
10Date: April 2009
11KernelVersion: 2.6.29
12Contact: linux-mtd@lists.infradead.org
13Description:
14 The /sys/class/mtd/mtd{0,1,2,3,...} directories correspond
15 to each /dev/mtdX character device. These may represent
16 physical/simulated flash devices, partitions on a flash
17 device, or concatenated flash devices. They exist regardless
18 of whether CONFIG_MTD_CHAR is actually enabled.
19
20What: /sys/class/mtd/mtdXro/
21Date: April 2009
22KernelVersion: 2.6.29
23Contact: linux-mtd@lists.infradead.org
24Description:
25 These directories provide the corresponding read-only device
26 nodes for /sys/class/mtd/mtdX/ . They are only created
27 (for the benefit of udev) if CONFIG_MTD_CHAR is enabled.
28
29What: /sys/class/mtd/mtdX/dev
30Date: April 2009
31KernelVersion: 2.6.29
32Contact: linux-mtd@lists.infradead.org
33Description:
34 Major and minor numbers of the character device corresponding
35 to this MTD device (in <major>:<minor> format). This is the
36 read-write device so <minor> will be even.
37
38What: /sys/class/mtd/mtdXro/dev
39Date: April 2009
40KernelVersion: 2.6.29
41Contact: linux-mtd@lists.infradead.org
42Description:
43 Major and minor numbers of the character device corresponding
44 to the read-only variant of this MTD device (in
45 <major>:<minor> format). In this case <minor> will be odd.
46
47What: /sys/class/mtd/mtdX/erasesize
48Date: April 2009
49KernelVersion: 2.6.29
50Contact: linux-mtd@lists.infradead.org
51Description:
52 "Major" erase size for the device. If numeraseregions is
53 zero, this is the eraseblock size for the entire device.
54 Otherwise, the MEMGETREGIONCOUNT/MEMGETREGIONINFO ioctls
55 can be used to determine the actual eraseblock layout.
56
57What: /sys/class/mtd/mtdX/flags
58Date: April 2009
59KernelVersion: 2.6.29
60Contact: linux-mtd@lists.infradead.org
61Description:
62 A hexadecimal value representing the device flags, ORed
63 together:
64
65 0x0400: MTD_WRITEABLE - device is writable
66 0x0800: MTD_BIT_WRITEABLE - single bits can be flipped
67 0x1000: MTD_NO_ERASE - no erase necessary
68 0x2000: MTD_POWERUP_LOCK - always locked after reset
69
70What: /sys/class/mtd/mtdX/name
71Date: April 2009
72KernelVersion: 2.6.29
73Contact: linux-mtd@lists.infradead.org
74Description:
75 A human-readable ASCII name for the device or partition.
76 This will match the name in /proc/mtd .
77
78What: /sys/class/mtd/mtdX/numeraseregions
79Date: April 2009
80KernelVersion: 2.6.29
81Contact: linux-mtd@lists.infradead.org
82Description:
83 For devices that have variable eraseblock sizes, this
84 provides the total number of erase regions. Otherwise,
85 it will read back as zero.
86
87What: /sys/class/mtd/mtdX/oobsize
88Date: April 2009
89KernelVersion: 2.6.29
90Contact: linux-mtd@lists.infradead.org
91Description:
92 Number of OOB bytes per page.
93
94What: /sys/class/mtd/mtdX/size
95Date: April 2009
96KernelVersion: 2.6.29
97Contact: linux-mtd@lists.infradead.org
98Description:
99 Total size of the device/partition, in bytes.
100
101What: /sys/class/mtd/mtdX/type
102Date: April 2009
103KernelVersion: 2.6.29
104Contact: linux-mtd@lists.infradead.org
105Description:
106 One of the following ASCII strings, representing the device
107 type:
108
109 absent, ram, rom, nor, nand, dataflash, ubi, unknown
110
111What: /sys/class/mtd/mtdX/writesize
112Date: April 2009
113KernelVersion: 2.6.29
114Contact: linux-mtd@lists.infradead.org
115Description:
116 Minimal writable flash unit size. This will always be
117 a positive integer.
118
119 In the case of NOR flash it is 1 (even though individual
120 bits can be cleared).
121
122 In the case of NAND flash it is one NAND page (or a
123 half page, or a quarter page).
124
125 In the case of ECC NOR, it is the ECC block size.
diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable
new file mode 100644
index 000000000000..175bb4f70512
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-cache_disable
@@ -0,0 +1,18 @@
1What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X
2Date: August 2008
3KernelVersion: 2.6.27
4Contact: mark.langsdorf@amd.com
5Description: These files exist in every cpu's cache index directories.
6 There are currently 2 cache_disable_# files in each
7 directory. Reading from these files on a supported
8 processor will return that cache disable index value
9 for that processor and node. Writing to one of these
10 files will cause the specified cache index to be disabled.
11
12 Currently, only AMD Family 10h Processors support cache index
13 disable, and only for their L3 caches. See the BIOS and
14 Kernel Developer's Guide at
15 http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf
16 for formatting information and other details on the
17 cache index disable.
18Users: joachim.deguara@amd.com
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index 4e79074de282..5fb709997d96 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -79,3 +79,13 @@ Description:
79 This file is read-only and shows the number of 79 This file is read-only and shows the number of
80 kilobytes of data that have been written to this 80 kilobytes of data that have been written to this
81 filesystem since it was mounted. 81 filesystem since it was mounted.
82
83What: /sys/fs/ext4/<disk>/inode_goal
84Date: June 2008
85Contact: "Theodore Ts'o" <tytso@mit.edu>
86Description:
87 Tuning parameter which (if non-zero) controls the goal
88 inode used by the inode allocator in preference to
89 all other allocation heuristics. This is intended for
90 debugging use only, and should be 0 on production
91 systems.
diff --git a/Documentation/ABI/testing/sysfs-pps b/Documentation/ABI/testing/sysfs-pps
new file mode 100644
index 000000000000..25028c7bc37d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-pps
@@ -0,0 +1,73 @@
1What: /sys/class/pps/
2Date: February 2008
3Contact: Rodolfo Giometti <giometti@linux.it>
4Description:
5 The /sys/class/pps/ directory will contain files and
6 directories that will provide a unified interface to
7 the PPS sources.
8
9What: /sys/class/pps/ppsX/
10Date: February 2008
11Contact: Rodolfo Giometti <giometti@linux.it>
12Description:
13 The /sys/class/pps/ppsX/ directory is related to X-th
14 PPS source into the system. Each directory will
15 contain files to manage and control its PPS source.
16
17What: /sys/class/pps/ppsX/assert
18Date: February 2008
19Contact: Rodolfo Giometti <giometti@linux.it>
20Description:
21 The /sys/class/pps/ppsX/assert file reports the assert events
22 and the assert sequence number of the X-th source in the form:
23
24 <secs>.<nsec>#<sequence>
25
26 If the source has no assert events the content of this file
27 is empty.
28
29What: /sys/class/pps/ppsX/clear
30Date: February 2008
31Contact: Rodolfo Giometti <giometti@linux.it>
32Description:
33 The /sys/class/pps/ppsX/clear file reports the clear events
34 and the clear sequence number of the X-th source in the form:
35
36 <secs>.<nsec>#<sequence>
37
38 If the source has no clear events the content of this file
39 is empty.
40
41What: /sys/class/pps/ppsX/mode
42Date: February 2008
43Contact: Rodolfo Giometti <giometti@linux.it>
44Description:
45 The /sys/class/pps/ppsX/mode file reports the functioning
46 mode of the X-th source in hexadecimal encoding.
47
48 Please, refer to linux/include/linux/pps.h for further
49 info.
50
51What: /sys/class/pps/ppsX/echo
52Date: February 2008
53Contact: Rodolfo Giometti <giometti@linux.it>
54Description:
55 The /sys/class/pps/ppsX/echo file reports if the X-th source does
56 or does not support an "echo" function.
57
58What: /sys/class/pps/ppsX/name
59Date: February 2008
60Contact: Rodolfo Giometti <giometti@linux.it>
61Description:
62 The /sys/class/pps/ppsX/name file reports the name of the
63 X-th source.
64
65What: /sys/class/pps/ppsX/path
66Date: February 2008
67Contact: Rodolfo Giometti <giometti@linux.it>
68Description:
69 The /sys/class/pps/ppsX/path file reports the path name of
70 the device connected with the X-th source.
71
72 If the source is not connected with any device the content
73 of this file is empty.
diff --git a/Documentation/Changes b/Documentation/Changes
index b95082be4d5e..6d0f1efc5bf6 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -29,7 +29,7 @@ hardware, for example, you probably needn't concern yourself with
29isdn4k-utils. 29isdn4k-utils.
30 30
31o Gnu C 3.2 # gcc --version 31o Gnu C 3.2 # gcc --version
32o Gnu make 3.79.1 # make --version 32o Gnu make 3.80 # make --version
33o binutils 2.12 # ld -v 33o binutils 2.12 # ld -v
34o util-linux 2.10o # fdformat --version 34o util-linux 2.10o # fdformat --version
35o module-init-tools 0.9.10 # depmod -V 35o module-init-tools 0.9.10 # depmod -V
@@ -48,6 +48,7 @@ o procps 3.2.0 # ps --version
48o oprofile 0.9 # oprofiled --version 48o oprofile 0.9 # oprofiled --version
49o udev 081 # udevinfo -V 49o udev 081 # udevinfo -V
50o grub 0.93 # grub --version 50o grub 0.93 # grub --version
51o mcelog 0.6
51 52
52Kernel compilation 53Kernel compilation
53================== 54==================
@@ -61,7 +62,7 @@ computer.
61Make 62Make
62---- 63----
63 64
64You will need Gnu make 3.79.1 or later to build the kernel. 65You will need Gnu make 3.80 or later to build the kernel.
65 66
66Binutils 67Binutils
67-------- 68--------
@@ -71,6 +72,13 @@ assembling the 16-bit boot code, removing the need for as86 to compile
71your kernel. This change does, however, mean that you need a recent 72your kernel. This change does, however, mean that you need a recent
72release of binutils. 73release of binutils.
73 74
75Perl
76----
77
78You will need perl 5 and the following modules: Getopt::Long, Getopt::Std,
79File::Basename, and File::Find to build the kernel.
80
81
74System utilities 82System utilities
75================ 83================
76 84
@@ -276,6 +284,16 @@ before running exportfs or mountd. It is recommended that all NFS
276services be protected from the internet-at-large by a firewall where 284services be protected from the internet-at-large by a firewall where
277that is possible. 285that is possible.
278 286
287mcelog
288------
289
290In Linux 2.6.31+ the i386 kernel needs to run the mcelog utility
291as a regular cronjob similar to the x86-64 kernel to process and log
292machine check events when CONFIG_X86_NEW_MCE is enabled. Machine check
293events are errors reported by the CPU. Processing them is strongly encouraged.
294All x86-64 kernels since 2.6.4 require the mcelog utility to
295process machine checks.
296
279Getting updated software 297Getting updated software
280======================== 298========================
281 299
@@ -365,6 +383,10 @@ FUSE
365---- 383----
366o <http://sourceforge.net/projects/fuse> 384o <http://sourceforge.net/projects/fuse>
367 385
386mcelog
387------
388o <ftp://ftp.kernel.org/pub/linux/utils/cpu/mce/mcelog/>
389
368Networking 390Networking
369********** 391**********
370 392
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index 72968cd5eaf3..8bb37237ebd2 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -698,8 +698,8 @@ very often is not. Abundant use of the inline keyword leads to a much bigger
698kernel, which in turn slows the system as a whole down, due to a bigger 698kernel, which in turn slows the system as a whole down, due to a bigger
699icache footprint for the CPU and simply because there is less memory 699icache footprint for the CPU and simply because there is less memory
700available for the pagecache. Just think about it; a pagecache miss causes a 700available for the pagecache. Just think about it; a pagecache miss causes a
701disk seek, which easily takes 5 miliseconds. There are a LOT of cpu cycles 701disk seek, which easily takes 5 milliseconds. There are a LOT of cpu cycles
702that can go into these 5 miliseconds. 702that can go into these 5 milliseconds.
703 703
704A reasonable rule of thumb is to not put inline at functions that have more 704A reasonable rule of thumb is to not put inline at functions that have more
705than 3 lines of code in them. An exception to this rule are the cases where 705than 3 lines of code in them. An exception to this rule are the cases where
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index d9aa43d78bcc..5aceb88b3f8b 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -676,8 +676,8 @@ this directory the following files can currently be found:
676 dma-api/all_errors This file contains a numeric value. If this 676 dma-api/all_errors This file contains a numeric value. If this
677 value is not equal to zero the debugging code 677 value is not equal to zero the debugging code
678 will print a warning for every error it finds 678 will print a warning for every error it finds
679 into the kernel log. Be carefull with this 679 into the kernel log. Be careful with this
680 option. It can easily flood your logs. 680 option, as it can easily flood your logs.
681 681
682 dma-api/disabled This read-only file contains the character 'Y' 682 dma-api/disabled This read-only file contains the character 'Y'
683 if the debugging code is disabled. This can 683 if the debugging code is disabled. This can
@@ -704,12 +704,24 @@ this directory the following files can currently be found:
704 The current number of free dma_debug_entries 704 The current number of free dma_debug_entries
705 in the allocator. 705 in the allocator.
706 706
707 dma-api/driver-filter
708 You can write a name of a driver into this file
709 to limit the debug output to requests from that
710 particular driver. Write an empty string to
711 that file to disable the filter and see
712 all errors again.
713
707If you have this code compiled into your kernel it will be enabled by default. 714If you have this code compiled into your kernel it will be enabled by default.
708If you want to boot without the bookkeeping anyway you can provide 715If you want to boot without the bookkeeping anyway you can provide
709'dma_debug=off' as a boot parameter. This will disable DMA-API debugging. 716'dma_debug=off' as a boot parameter. This will disable DMA-API debugging.
710Notice that you can not enable it again at runtime. You have to reboot to do 717Notice that you can not enable it again at runtime. You have to reboot to do
711so. 718so.
712 719
720If you want to see debug messages only for a special device driver you can
721specify the dma_debug_driver=<drivername> parameter. This will enable the
722driver filter at boot time. The debug code will only print errors for that
723driver afterwards. This filter can be disabled or changed later using debugfs.
724
713When the code disables itself at runtime this is most likely because it ran 725When the code disables itself at runtime this is most likely because it ran
714out of dma_debug_entries. These entries are preallocated at boot. The number 726out of dma_debug_entries. These entries are preallocated at boot. The number
715of preallocated entries is defined per architecture. If it is too low for you 727of preallocated entries is defined per architecture. If it is too low for you
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index b1eb661e6302..9632444f6c62 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
13 gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ 13 gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
14 genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ 14 genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
15 mac80211.xml debugobjects.xml sh.xml regulator.xml \ 15 mac80211.xml debugobjects.xml sh.xml regulator.xml \
16 alsa-driver-api.xml writing-an-alsa-driver.xml 16 alsa-driver-api.xml writing-an-alsa-driver.xml \
17 tracepoint.xml
17 18
18### 19###
19# The build process is as follows (targets): 20# The build process is as follows (targets):
diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl
index 7f5f218015fe..08ff908aa7a2 100644
--- a/Documentation/DocBook/debugobjects.tmpl
+++ b/Documentation/DocBook/debugobjects.tmpl
@@ -106,7 +106,7 @@
106 number of errors are printk'ed including a full stack trace. 106 number of errors are printk'ed including a full stack trace.
107 </para> 107 </para>
108 <para> 108 <para>
109 The statistics are available via debugfs/debug_objects/stats. 109 The statistics are available via /sys/kernel/debug/debug_objects/stats.
110 They provide information about the number of warnings and the 110 They provide information about the number of warnings and the
111 number of successful fixups along with information about the 111 number of successful fixups along with information about the
112 usage of the internal tracking objects and the state of the 112 usage of the internal tracking objects and the state of the
diff --git a/Documentation/DocBook/mac80211.tmpl b/Documentation/DocBook/mac80211.tmpl
index fbeaffc1dcc3..e36986663570 100644
--- a/Documentation/DocBook/mac80211.tmpl
+++ b/Documentation/DocBook/mac80211.tmpl
@@ -145,7 +145,6 @@ usage should require reading the full document.
145 interface in STA mode at first! 145 interface in STA mode at first!
146 </para> 146 </para>
147!Finclude/net/mac80211.h ieee80211_if_init_conf 147!Finclude/net/mac80211.h ieee80211_if_init_conf
148!Finclude/net/mac80211.h ieee80211_if_conf
149 </chapter> 148 </chapter>
150 149
151 <chapter id="rx-tx"> 150 <chapter id="rx-tx">
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
new file mode 100644
index 000000000000..b0756d0fd579
--- /dev/null
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -0,0 +1,89 @@
1<?xml version="1.0" encoding="UTF-8"?>
2<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
3 "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
4
5<book id="Tracepoints">
6 <bookinfo>
7 <title>The Linux Kernel Tracepoint API</title>
8
9 <authorgroup>
10 <author>
11 <firstname>Jason</firstname>
12 <surname>Baron</surname>
13 <affiliation>
14 <address>
15 <email>jbaron@redhat.com</email>
16 </address>
17 </affiliation>
18 </author>
19 </authorgroup>
20
21 <legalnotice>
22 <para>
23 This documentation is free software; you can redistribute
24 it and/or modify it under the terms of the GNU General Public
25 License as published by the Free Software Foundation; either
26 version 2 of the License, or (at your option) any later
27 version.
28 </para>
29
30 <para>
31 This program is distributed in the hope that it will be
32 useful, but WITHOUT ANY WARRANTY; without even the implied
33 warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
34 See the GNU General Public License for more details.
35 </para>
36
37 <para>
38 You should have received a copy of the GNU General Public
39 License along with this program; if not, write to the Free
40 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
41 MA 02111-1307 USA
42 </para>
43
44 <para>
45 For more details see the file COPYING in the source
46 distribution of Linux.
47 </para>
48 </legalnotice>
49 </bookinfo>
50
51 <toc></toc>
52 <chapter id="intro">
53 <title>Introduction</title>
54 <para>
55 Tracepoints are static probe points that are located in strategic points
56 throughout the kernel. 'Probes' register/unregister with tracepoints
57 via a callback mechanism. The 'probes' are strictly typed functions that
58 are passed a unique set of parameters defined by each tracepoint.
59 </para>
60
61 <para>
62 From this simple callback mechanism, 'probes' can be used to profile, debug,
63 and understand kernel behavior. There are a number of tools that provide a
64 framework for using 'probes'. These tools include Systemtap, ftrace, and
65 LTTng.
66 </para>
67
68 <para>
69 Tracepoints are defined in a number of header files via various macros. Thus,
70 the purpose of this document is to provide a clear accounting of the available
71 tracepoints. The intention is to understand not only what tracepoints are
72 available but also to understand where future tracepoints might be added.
73 </para>
74
75 <para>
76 The API presented has functions of the form:
77 <function>trace_tracepointname(function parameters)</function>. These are the
78 tracepoint callbacks that are found throughout the code. Registering and
79 unregistering probes with these callback sites is covered in the
80 <filename>Documentation/trace/*</filename> directory.
81 </para>
82 </chapter>
83
84 <chapter id="irq">
85 <title>IRQ</title>
86!Iinclude/trace/events/irq.h
87 </chapter>
88
89</book>
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
index ddeb14beacc8..be21001ab144 100644
--- a/Documentation/PCI/pcieaer-howto.txt
+++ b/Documentation/PCI/pcieaer-howto.txt
@@ -61,6 +61,10 @@ be initiated although firmwares have no _OSC support. To enable the
61walkaround, pls. add aerdriver.forceload=y to kernel boot parameter line 61walkaround, pls. add aerdriver.forceload=y to kernel boot parameter line
62when booting kernel. Note that forceload=n by default. 62when booting kernel. Note that forceload=n by default.
63 63
64nosourceid, another parameter of type bool, can be used when broken
65hardware (mostly chipsets) has root ports that cannot obtain the reporting
66source ID. nosourceid=n by default.
67
642.3 AER error output 682.3 AER error output
65When a PCI-E AER error is captured, an error message will be outputed to 69When a PCI-E AER error is captured, an error message will be outputed to
66console. If it's a correctable error, it is outputed as a warning. 70console. If it's a correctable error, it is outputed as a warning.
@@ -246,3 +250,24 @@ with the PCI Express AER Root driver?
246A: It could call the helper functions to enable AER in devices and 250A: It could call the helper functions to enable AER in devices and
247cleanup uncorrectable status register. Pls. refer to section 3.3. 251cleanup uncorrectable status register. Pls. refer to section 3.3.
248 252
253
2544. Software error injection
255
256Debugging PCIE AER error recovery code is quite difficult because it
257is hard to trigger real hardware errors. Software based error
258injection can be used to fake various kinds of PCIE errors.
259
260First you should enable PCIE AER software error injection in kernel
261configuration, that is, the following item should be in your .config.
262
263CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m
264
265After rebooting with the new kernel or inserting the module, a device file named
266/dev/aer_inject should be created.
267
268Then, you need a user space tool named aer-inject, which can be gotten
269from:
270 http://www.kernel.org/pub/linux/utils/pci/aer-inject/
271
272More information about aer-inject can be found in the document that comes
273with its source code.
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
index 6389dec33459..93cb28d05dcd 100644
--- a/Documentation/RCU/rculist_nulls.txt
+++ b/Documentation/RCU/rculist_nulls.txt
@@ -118,7 +118,7 @@ to another chain) checking the final 'nulls' value if
118the lookup met the end of chain. If final 'nulls' value 118the lookup met the end of chain. If final 'nulls' value
119is not the slot number, then we must restart the lookup at 119is not the slot number, then we must restart the lookup at
120the beginning. If the object was moved to the same chain, 120the beginning. If the object was moved to the same chain,
121then the reader doesnt care : It might eventually 121then the reader doesn't care : It might eventually
122scan the list again without harm. 122scan the list again without harm.
123 123
124 124
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 068848240a8b..02cced183b2d 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -192,23 +192,24 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
192The output of "cat rcu/rcudata" looks as follows: 192The output of "cat rcu/rcudata" looks as follows:
193 193
194rcu: 194rcu:
195 0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10 195rcu:
196 1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10 196 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
197 2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10 197 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
198 3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10 198 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
199 4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10 199 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10
200 5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10 200 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10
201 6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10 201 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10
202 7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10 202 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10
203 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10
203rcu_bh: 204rcu_bh:
204 0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10 205 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10
205 1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10 206 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10
206 2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10 207 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
207 3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10 208 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10
208 4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 209 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
209 5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 210 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
210 6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 211 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
211 7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 212 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
212 213
213The first section lists the rcu_data structures for rcu, the second for 214The first section lists the rcu_data structures for rcu, the second for
214rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. 215rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system.
@@ -253,12 +254,6 @@ o "pqc" indicates which grace period the last-observed quiescent
253o "qp" indicates that RCU still expects a quiescent state from 254o "qp" indicates that RCU still expects a quiescent state from
254 this CPU. 255 this CPU.
255 256
256o "rpfq" is the number of rcu_pending() calls on this CPU required
257 to induce this CPU to invoke force_quiescent_state().
258
259o "rp" is low-order four hex digits of the count of how many times
260 rcu_pending() has been invoked on this CPU.
261
262o "dt" is the current value of the dyntick counter that is incremented 257o "dt" is the current value of the dyntick counter that is incremented
263 when entering or leaving dynticks idle state, either by the 258 when entering or leaving dynticks idle state, either by the
264 scheduler or by irq. The number after the "/" is the interrupt 259 scheduler or by irq. The number after the "/" is the interrupt
@@ -305,6 +300,9 @@ o "b" is the batch limit for this CPU. If more than this number
305 of RCU callbacks is ready to invoke, then the remainder will 300 of RCU callbacks is ready to invoke, then the remainder will
306 be deferred. 301 be deferred.
307 302
303There is also an rcu/rcudata.csv file with the same information in
304comma-separated-variable spreadsheet format.
305
308 306
309The output of "cat rcu/rcugp" looks as follows: 307The output of "cat rcu/rcugp" looks as follows:
310 308
@@ -411,3 +409,63 @@ o Each element of the form "1/1 0:127 ^0" represents one struct
411 For example, the first entry at the lowest level shows 409 For example, the first entry at the lowest level shows
412 "^0", indicating that it corresponds to bit zero in 410 "^0", indicating that it corresponds to bit zero in
413 the first entry at the middle level. 411 the first entry at the middle level.
412
413
414The output of "cat rcu/rcu_pending" looks as follows:
415
416rcu:
417 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
418 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
419 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
420 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
421 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
422 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
423 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
424 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
425rcu_bh:
426 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
427 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
428 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
429 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
430 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
431 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
432 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
433 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
434
435As always, this is once again split into "rcu" and "rcu_bh" portions.
436The fields are as follows:
437
438o "np" is the number of times that __rcu_pending() has been invoked
439 for the corresponding flavor of RCU.
440
441o "qsp" is the number of times that the RCU was waiting for a
442 quiescent state from this CPU.
443
444o "cbr" is the number of times that this CPU had RCU callbacks
445 that had passed through a grace period, and were thus ready
446 to be invoked.
447
448o "cng" is the number of times that this CPU needed another
449 grace period while RCU was idle.
450
451o "gpc" is the number of times that an old grace period had
452 completed, but this CPU was not yet aware of it.
453
454o "gps" is the number of times that a new grace period had started,
455 but this CPU was not yet aware of it.
456
457o "nf" is the number of times that this CPU suspected that the
458 current grace period had run for too long, and thus needed to
459 be forced.
460
461 Please note that "forcing" consists of sending resched IPIs
462 to holdout CPUs. If that CPU really still is in an old RCU
463 read-side critical section, then we really do have to wait for it.
464 The assumption behind "forcing" is that the CPU is not still in
465 an old RCU read-side critical section, but has not yet responded
466 for some other reason.
467
468o "nn" is the number of times that this CPU needed nothing. Alert
469 readers will note that the rcu "nn" number for a given CPU very
470 closely matches the rcu_bh "np" number for that same CPU. This
471 is due to short-circuit evaluation in rcu_pending().
diff --git a/Documentation/SM501.txt b/Documentation/SM501.txt
index 6fc656035925..561826f82093 100644
--- a/Documentation/SM501.txt
+++ b/Documentation/SM501.txt
@@ -5,7 +5,7 @@ Copyright 2006, 2007 Simtec Electronics
5 5
6The Silicon Motion SM501 multimedia companion chip is a multifunction device 6The Silicon Motion SM501 multimedia companion chip is a multifunction device
7which may provide numerous interfaces including USB host controller USB gadget, 7which may provide numerous interfaces including USB host controller USB gadget,
8Asyncronous Serial ports, Audio functions and a dual display video interface. 8asynchronous serial ports, audio functions, and a dual display video interface.
9The device may be connected by PCI or local bus with varying functions enabled. 9The device may be connected by PCI or local bus with varying functions enabled.
10 10
11Core 11Core
diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
index 629c92e99783..34614b4c708e 100644
--- a/Documentation/Smack.txt
+++ b/Documentation/Smack.txt
@@ -184,8 +184,9 @@ length. Single character labels using special characters, that being anything
184other than a letter or digit, are reserved for use by the Smack development 184other than a letter or digit, are reserved for use by the Smack development
185team. Smack labels are unstructured, case sensitive, and the only operation 185team. Smack labels are unstructured, case sensitive, and the only operation
186ever performed on them is comparison for equality. Smack labels cannot 186ever performed on them is comparison for equality. Smack labels cannot
187contain unprintable characters or the "/" (slash) character. Smack labels 187contain unprintable characters, the "/" (slash), the "\" (backslash), the "'"
188cannot begin with a '-', which is reserved for special options. 188(quote) and '"' (double-quote) characters.
189Smack labels cannot begin with a '-', which is reserved for special options.
189 190
190There are some predefined labels: 191There are some predefined labels:
191 192
@@ -523,3 +524,18 @@ Smack supports some mount options:
523 524
524These mount options apply to all file system types. 525These mount options apply to all file system types.
525 526
527Smack auditing
528
529If you want Smack auditing of security events, you need to set CONFIG_AUDIT
530in your kernel configuration.
531By default, all denied events will be audited. You can change this behavior by
532writing a single character to the /smack/logging file :
5330 : no logging
5341 : log denied (default)
5352 : log accepted
5363 : log denied & accepted
537
538Events are logged as 'key=value' pairs, for each event you at least will get
539the subject, the object, the rights requested, the action, the kernel function
540that triggered the event, plus other pairs depending on the type of event
541audited.
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index ac5e0b2f1097..78a9168ff377 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -54,7 +54,7 @@ kernel patches.
54 CONFIG_PREEMPT. 54 CONFIG_PREEMPT.
55 55
5614: If the patch affects IO/Disk, etc: has been tested with and without 5614: If the patch affects IO/Disk, etc: has been tested with and without
57 CONFIG_LBD. 57 CONFIG_LBDAF.
58 58
5915: All codepaths have been exercised with all lockdep features enabled. 5915: All codepaths have been exercised with all lockdep features enabled.
60 60
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index f309d3c6221c..5c555a8b39e5 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -91,6 +91,10 @@ Be as specific as possible. The WORST descriptions possible include
91things like "update driver X", "bug fix for driver X", or "this patch 91things like "update driver X", "bug fix for driver X", or "this patch
92includes updates for subsystem X. Please apply." 92includes updates for subsystem X. Please apply."
93 93
94The maintainer will thank you if you write your patch description in a
95form which can be easily pulled into Linux's source code management
96system, git, as a "commit log". See #15, below.
97
94If your description starts to get long, that's a sign that you probably 98If your description starts to get long, that's a sign that you probably
95need to split up your patch. See #3, next. 99need to split up your patch. See #3, next.
96 100
@@ -183,8 +187,9 @@ Even if the maintainer did not respond in step #4, make sure to ALWAYS
183copy the maintainer when you change their code. 187copy the maintainer when you change their code.
184 188
185For small patches you may want to CC the Trivial Patch Monkey 189For small patches you may want to CC the Trivial Patch Monkey
186trivial@kernel.org managed by Jesper Juhl; which collects "trivial" 190trivial@kernel.org which collects "trivial" patches. Have a look
187patches. Trivial patches must qualify for one of the following rules: 191into the MAINTAINERS file for its current manager.
192Trivial patches must qualify for one of the following rules:
188 Spelling fixes in documentation 193 Spelling fixes in documentation
189 Spelling fixes which could break grep(1) 194 Spelling fixes which could break grep(1)
190 Warning fixes (cluttering with useless warnings is bad) 195 Warning fixes (cluttering with useless warnings is bad)
@@ -196,7 +201,6 @@ patches. Trivial patches must qualify for one of the following rules:
196 since people copy, as long as it's trivial) 201 since people copy, as long as it's trivial)
197 Any fix by the author/maintainer of the file (ie. patch monkey 202 Any fix by the author/maintainer of the file (ie. patch monkey
198 in re-transmission mode) 203 in re-transmission mode)
199URL: <http://www.kernel.org/pub/linux/kernel/people/juhl/trivial/>
200 204
201 205
202 206
@@ -405,7 +409,14 @@ person it names. This tag documents that potentially interested parties
405have been included in the discussion 409have been included in the discussion
406 410
407 411
40814) Using Tested-by: and Reviewed-by: 41214) Using Reported-by:, Tested-by: and Reviewed-by:
413
414If this patch fixes a problem reported by somebody else, consider adding a
415Reported-by: tag to credit the reporter for their contribution. Please
416note that this tag should not be added without the reporter's permission,
417especially if the problem was not reported in a public forum. That said,
418if we diligently credit our bug reporters, they will, hopefully, be
419inspired to help us again in the future.
409 420
410A Tested-by: tag indicates that the patch has been successfully tested (in 421A Tested-by: tag indicates that the patch has been successfully tested (in
411some environment) by the person named. This tag informs maintainers that 422some environment) by the person named. This tag informs maintainers that
@@ -444,7 +455,7 @@ offer a Reviewed-by tag for a patch. This tag serves to give credit to
444reviewers and to inform maintainers of the degree of review which has been 455reviewers and to inform maintainers of the degree of review which has been
445done on the patch. Reviewed-by: tags, when supplied by reviewers known to 456done on the patch. Reviewed-by: tags, when supplied by reviewers known to
446understand the subject area and to perform thorough reviews, will normally 457understand the subject area and to perform thorough reviews, will normally
447increase the liklihood of your patch getting into the kernel. 458increase the likelihood of your patch getting into the kernel.
448 459
449 460
45015) The canonical patch format 46115) The canonical patch format
@@ -485,12 +496,33 @@ phrase" should not be a filename. Do not use the same "summary
485phrase" for every patch in a whole patch series (where a "patch 496phrase" for every patch in a whole patch series (where a "patch
486series" is an ordered sequence of multiple, related patches). 497series" is an ordered sequence of multiple, related patches).
487 498
488Bear in mind that the "summary phrase" of your email becomes 499Bear in mind that the "summary phrase" of your email becomes a
489a globally-unique identifier for that patch. It propagates 500globally-unique identifier for that patch. It propagates all the way
490all the way into the git changelog. The "summary phrase" may 501into the git changelog. The "summary phrase" may later be used in
491later be used in developer discussions which refer to the patch. 502developer discussions which refer to the patch. People will want to
492People will want to google for the "summary phrase" to read 503google for the "summary phrase" to read discussion regarding that
493discussion regarding that patch. 504patch. It will also be the only thing that people may quickly see
505when, two or three months later, they are going through perhaps
506thousands of patches using tools such as "gitk" or "git log
507--oneline".
508
509For these reasons, the "summary" must be no more than 70-75
510characters, and it must describe both what the patch changes, as well
511as why the patch might be necessary. It is challenging to be both
512succinct and descriptive, but that is what a well-written summary
513should do.
514
515The "summary phrase" may be prefixed by tags enclosed in square
516brackets: "Subject: [PATCH tag] <summary phrase>". The tags are not
517considered part of the summary phrase, but describe how the patch
518should be treated. Common tags might include a version descriptor if
519multiple versions of the patch have been sent out in response to
520comments (i.e., "v1, v2, v3"), or "RFC" to indicate a request for
521comments. If there are four patches in a patch series the individual
522patches may be numbered like this: 1/4, 2/4, 3/4, 4/4. This assures
523that developers understand the order in which the patches should be
524applied and that they have reviewed or applied all of the patches in
525the patch series.
494 526
495A couple of example Subjects: 527A couple of example Subjects:
496 528
@@ -510,19 +542,31 @@ the patch author in the changelog.
510The explanation body will be committed to the permanent source 542The explanation body will be committed to the permanent source
511changelog, so should make sense to a competent reader who has long 543changelog, so should make sense to a competent reader who has long
512since forgotten the immediate details of the discussion that might 544since forgotten the immediate details of the discussion that might
513have led to this patch. 545have led to this patch. Including symptoms of the failure which the
546patch addresses (kernel log messages, oops messages, etc.) is
547especially useful for people who might be searching the commit logs
548looking for the applicable patch. If a patch fixes a compile failure,
549it may not be necessary to include _all_ of the compile failures; just
550enough that it is likely that someone searching for the patch can find
551it. As in the "summary phrase", it is important to be both succinct and
552descriptive.
514 553
515The "---" marker line serves the essential purpose of marking for patch 554The "---" marker line serves the essential purpose of marking for patch
516handling tools where the changelog message ends. 555handling tools where the changelog message ends.
517 556
518One good use for the additional comments after the "---" marker is for 557One good use for the additional comments after the "---" marker is for
519a diffstat, to show what files have changed, and the number of inserted 558a diffstat, to show what files have changed, and the number of
520and deleted lines per file. A diffstat is especially useful on bigger 559inserted and deleted lines per file. A diffstat is especially useful
521patches. Other comments relevant only to the moment or the maintainer, 560on bigger patches. Other comments relevant only to the moment or the
522not suitable for the permanent changelog, should also go here. 561maintainer, not suitable for the permanent changelog, should also go
523Use diffstat options "-p 1 -w 70" so that filenames are listed from the 562here. A good example of such comments might be "patch changelogs"
524top of the kernel source tree and don't use too much horizontal space 563which describe what has changed between the v1 and v2 version of the
525(easily fit in 80 columns, maybe with some indentation). 564patch.
565
566If you are going to include a diffstat after the "---" marker, please
567use diffstat options "-p 1 -w 70" so that filenames are listed from
568the top of the kernel source tree and don't use too much horizontal
569space (easily fit in 80 columns, maybe with some indentation).
526 570
527See more details on the proper patch format in the following 571See more details on the proper patch format in the following
528references. 572references.
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c
index 7ea231172c85..aa73e72fd793 100644
--- a/Documentation/accounting/getdelays.c
+++ b/Documentation/accounting/getdelays.c
@@ -246,7 +246,8 @@ void print_ioacct(struct taskstats *t)
246 246
247int main(int argc, char *argv[]) 247int main(int argc, char *argv[])
248{ 248{
249 int c, rc, rep_len, aggr_len, len2, cmd_type; 249 int c, rc, rep_len, aggr_len, len2;
250 int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC;
250 __u16 id; 251 __u16 id;
251 __u32 mypid; 252 __u32 mypid;
252 253
diff --git a/Documentation/arm/Samsung-S3C24XX/GPIO.txt b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
index ea7ccfc4b274..948c8718d967 100644
--- a/Documentation/arm/Samsung-S3C24XX/GPIO.txt
+++ b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
@@ -51,7 +51,7 @@ PIN Numbers
51----------- 51-----------
52 52
53 Each pin has an unique number associated with it in regs-gpio.h, 53 Each pin has an unique number associated with it in regs-gpio.h,
54 eg S3C2410_GPA0 or S3C2410_GPF1. These defines are used to tell 54 eg S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell
55 the GPIO functions which pin is to be used. 55 the GPIO functions which pin is to be used.
56 56
57 57
@@ -65,11 +65,11 @@ Configuring a pin
65 65
66 Eg: 66 Eg:
67 67
68 s3c2410_gpio_cfgpin(S3C2410_GPA0, S3C2410_GPA0_ADDR0); 68 s3c2410_gpio_cfgpin(S3C2410_GPA(0), S3C2410_GPA0_ADDR0);
69 s3c2410_gpio_cfgpin(S3C2410_GPE8, S3C2410_GPE8_SDDAT1); 69 s3c2410_gpio_cfgpin(S3C2410_GPE(8), S3C2410_GPE8_SDDAT1);
70 70
71 which would turn GPA0 into the lowest Address line A0, and set 71 which would turn GPA(0) into the lowest Address line A0, and set
72 GPE8 to be connected to the SDIO/MMC controller's SDDAT1 line. 72 GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line.
73 73
74 74
75Reading the current configuration 75Reading the current configuration
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 4ef245010457..396bec3b74ed 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -229,10 +229,10 @@ kernel. It is the use of atomic counters to implement reference
229counting, and it works such that once the counter falls to zero it can 229counting, and it works such that once the counter falls to zero it can
230be guaranteed that no other entity can be accessing the object: 230be guaranteed that no other entity can be accessing the object:
231 231
232static void obj_list_add(struct obj *obj) 232static void obj_list_add(struct obj *obj, struct list_head *head)
233{ 233{
234 obj->active = 1; 234 obj->active = 1;
235 list_add(&obj->list); 235 list_add(&obj->list, head);
236} 236}
237 237
238static void obj_list_del(struct obj *obj) 238static void obj_list_del(struct obj *obj)
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 6fab97ea7e6b..8d2158a1c6aa 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -186,7 +186,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address
186do not have a corresponding kernel virtual address space mapping) and 186do not have a corresponding kernel virtual address space mapping) and
187low-memory pages. 187low-memory pages.
188 188
189Note: Please refer to Documentation/PCI/PCI-DMA-mapping.txt for a discussion 189Note: Please refer to Documentation/DMA-mapping.txt for a discussion
190on PCI high mem DMA aspects and mapping of scatter gather lists, and support 190on PCI high mem DMA aspects and mapping of scatter gather lists, and support
191for 64 bit PCI. 191for 64 bit PCI.
192 192
diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.txt
index 72576769e0f4..2d82c80322cb 100644
--- a/Documentation/block/deadline-iosched.txt
+++ b/Documentation/block/deadline-iosched.txt
@@ -58,7 +58,7 @@ same criteria as reads.
58front_merges (bool) 58front_merges (bool)
59------------ 59------------
60 60
61Sometimes it happens that a request enters the io scheduler that is contigious 61Sometimes it happens that a request enters the io scheduler that is contiguous
62with a request that is already on the queue. Either it fits in the back of that 62with a request that is already on the queue. Either it fits in the back of that
63request, or it fits at the front. That is called either a back merge candidate 63request, or it fits at the front. That is called either a back merge candidate
64or a front merge candidate. Due to the way files are typically laid out, 64or a front merge candidate. Due to the way files are typically laid out,
diff --git a/Documentation/braille-console.txt b/Documentation/braille-console.txt
index 000b0fbdc105..d0d042c2fd5e 100644
--- a/Documentation/braille-console.txt
+++ b/Documentation/braille-console.txt
@@ -27,7 +27,7 @@ parameter.
27 27
28For simplicity, only one braille console can be enabled, other uses of 28For simplicity, only one braille console can be enabled, other uses of
29console=brl,... will be discarded. Also note that it does not interfere with 29console=brl,... will be discarded. Also note that it does not interfere with
30the console selection mecanism described in serial-console.txt 30the console selection mechanism described in serial-console.txt
31 31
32For now, only the VisioBraille device is supported. 32For now, only the VisioBraille device is supported.
33 33
diff --git a/Documentation/cdrom/packet-writing.txt b/Documentation/cdrom/packet-writing.txt
index cf1f8126991c..1c407778c8b2 100644
--- a/Documentation/cdrom/packet-writing.txt
+++ b/Documentation/cdrom/packet-writing.txt
@@ -117,7 +117,7 @@ Using the pktcdvd debugfs interface
117 117
118To read pktcdvd device infos in human readable form, do: 118To read pktcdvd device infos in human readable form, do:
119 119
120 # cat /debug/pktcdvd/pktcdvd[0-7]/info 120 # cat /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/info
121 121
122For a description of the debugfs interface look into the file: 122For a description of the debugfs interface look into the file:
123 123
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 1a608877b14e..23d1262c0775 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -152,14 +152,19 @@ When swap is accounted, following files are added.
152 152
153usage of mem+swap is limited by memsw.limit_in_bytes. 153usage of mem+swap is limited by memsw.limit_in_bytes.
154 154
155Note: why 'mem+swap' rather than swap. 155* why 'mem+swap' rather than swap.
156The global LRU(kswapd) can swap out arbitrary pages. Swap-out means 156The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
157to move account from memory to swap...there is no change in usage of 157to move account from memory to swap...there is no change in usage of
158mem+swap. 158mem+swap. In other words, when we want to limit the usage of swap without
159affecting global LRU, mem+swap limit is better than just limiting swap from
160OS point of view.
159 161
160In other words, when we want to limit the usage of swap without affecting 162* What happens when a cgroup hits memory.memsw.limit_in_bytes
161global LRU, mem+swap limit is better than just limiting swap from OS point 163When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
162of view. 164in this cgroup. Then, swap-out will not be done by cgroup routine and file
165caches are dropped. But as mentioned above, global LRU can still swap out
166memory from it to keep the system's memory management state sane. You can't
167forbid that from the cgroup.
163 168
1642.5 Reclaim 1692.5 Reclaim
165 170
@@ -204,6 +209,7 @@ We can alter the memory limit:
204 209
205NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, 210NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
206mega or gigabytes. 211mega or gigabytes.
212NOTE: We can write "-1" to reset the *.limit_in_bytes (i.e. make it unlimited).
207 213
208# cat /cgroups/0/memory.limit_in_bytes 214# cat /cgroups/0/memory.limit_in_bytes
2094194304 2154194304
diff --git a/Documentation/connector/cn_test.c b/Documentation/connector/cn_test.c
index 6977c178729a..f688eba87704 100644
--- a/Documentation/connector/cn_test.c
+++ b/Documentation/connector/cn_test.c
@@ -41,6 +41,12 @@ void cn_test_callback(void *data)
41 msg->seq, msg->ack, msg->len, (char *)msg->data); 41 msg->seq, msg->ack, msg->len, (char *)msg->data);
42} 42}
43 43
44/*
45 * Do not remove this function even if no one is using it as
46 * this is an example of how to get notifications about new
47 * connector user registration
48 */
49#if 0
44static int cn_test_want_notify(void) 50static int cn_test_want_notify(void)
45{ 51{
46 struct cn_ctl_msg *ctl; 52 struct cn_ctl_msg *ctl;
@@ -117,6 +123,7 @@ nlmsg_failure:
117 kfree_skb(skb); 123 kfree_skb(skb);
118 return -EINVAL; 124 return -EINVAL;
119} 125}
126#endif
120 127
121static u32 cn_test_timer_counter; 128static u32 cn_test_timer_counter;
122static void cn_test_timer_func(unsigned long __data) 129static void cn_test_timer_func(unsigned long __data)
diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt
index 43c743903dd7..75a58d14d3cf 100644
--- a/Documentation/cpu-freq/cpu-drivers.txt
+++ b/Documentation/cpu-freq/cpu-drivers.txt
@@ -155,7 +155,7 @@ actual frequency must be determined using the following rules:
155- if relation==CPUFREQ_REL_H, try to select a new_freq lower than or equal 155- if relation==CPUFREQ_REL_H, try to select a new_freq lower than or equal
156 target_freq. ("H for highest, but no higher than") 156 target_freq. ("H for highest, but no higher than")
157 157
158Here again the frequency table helper might assist you - see section 3 158Here again the frequency table helper might assist you - see section 2
159for details. 159for details.
160 160
161 161
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
index ce73f3eb5ddb..aed082f49d09 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -119,10 +119,6 @@ want the kernel to look at the CPU usage and to make decisions on
119what to do about the frequency. Typically this is set to values of 119what to do about the frequency. Typically this is set to values of
120around '10000' or more. It's default value is (cmp. with users-guide.txt): 120around '10000' or more. It's default value is (cmp. with users-guide.txt):
121transition_latency * 1000 121transition_latency * 1000
122The lowest value you can set is:
123transition_latency * 100 or it may get restricted to a value where it
124makes not sense for the kernel anymore to poll that often which depends
125on your HZ config variable (HZ=1000: max=20000us, HZ=250: max=5000).
126Be aware that transition latency is in ns and sampling_rate is in us, so you 122Be aware that transition latency is in ns and sampling_rate is in us, so you
127get the same sysfs value by default. 123get the same sysfs value by default.
128Sampling rate should always get adjusted considering the transition latency 124Sampling rate should always get adjusted considering the transition latency
@@ -131,14 +127,20 @@ in the bash (as said, 1000 is default), do:
131echo `$(($(cat cpuinfo_transition_latency) * 750 / 1000)) \ 127echo `$(($(cat cpuinfo_transition_latency) * 750 / 1000)) \
132 >ondemand/sampling_rate 128 >ondemand/sampling_rate
133 129
134show_sampling_rate_(min|max): THIS INTERFACE IS DEPRECATED, DON'T USE IT. 130show_sampling_rate_min:
135You can use wider ranges now and the general 131The sampling rate is limited by the HW transition latency:
136cpuinfo_transition_latency variable (cmp. with user-guide.txt) can be 132transition_latency * 100
137used to obtain exactly the same info: 133Or by kernel restrictions:
138show_sampling_rate_min = transtition_latency * 500 / 1000 134If CONFIG_NO_HZ is set, the limit is 10ms fixed.
139show_sampling_rate_max = transtition_latency * 500000 / 1000 135If CONFIG_NO_HZ is not set or no_hz=off boot parameter is used, the
140(divided by 1000 is to illustrate that sampling rate is in us and 136limits depend on the CONFIG_HZ option:
141transition latency is exported ns). 137HZ=1000: min=20000us (20ms)
138HZ=250: min=80000us (80ms)
139HZ=100: min=200000us (200ms)
140The highest value of kernel and HW latency restrictions is shown and
141used as the minimum sampling rate.
142
143show_sampling_rate_max: THIS INTERFACE IS DEPRECATED, DON'T USE IT.
142 144
143up_threshold: defines what the average CPU usage between the samplings 145up_threshold: defines what the average CPU usage between the samplings
144of 'sampling_rate' needs to be for the kernel to make a decision on 146of 'sampling_rate' needs to be for the kernel to make a decision on
diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt
index 75f41193f3e1..5d5f5fadd1c2 100644
--- a/Documentation/cpu-freq/user-guide.txt
+++ b/Documentation/cpu-freq/user-guide.txt
@@ -31,7 +31,6 @@ Contents:
31 31
323. How to change the CPU cpufreq policy and/or speed 323. How to change the CPU cpufreq policy and/or speed
333.1 Preferred interface: sysfs 333.1 Preferred interface: sysfs
343.2 Deprecated interfaces
35 34
36 35
37 36
diff --git a/Documentation/dell_rbu.txt b/Documentation/dell_rbu.txt
index c11b931f8f98..15174985ad08 100644
--- a/Documentation/dell_rbu.txt
+++ b/Documentation/dell_rbu.txt
@@ -76,9 +76,9 @@ Do the steps below to download the BIOS image.
76 76
77The /sys/class/firmware/dell_rbu/ entries will remain till the following is 77The /sys/class/firmware/dell_rbu/ entries will remain till the following is
78done. 78done.
79echo -1 > /sys/class/firmware/dell_rbu/loading. 79echo -1 > /sys/class/firmware/dell_rbu/loading
80Until this step is completed the driver cannot be unloaded. 80Until this step is completed the driver cannot be unloaded.
81Also echoing either mono ,packet or init in to image_type will free up the 81Also echoing either mono, packet or init in to image_type will free up the
82memory allocated by the driver. 82memory allocated by the driver.
83 83
84If a user by accident executes steps 1 and 3 above without executing step 2; 84If a user by accident executes steps 1 and 3 above without executing step 2;
diff --git a/Documentation/development-process/5.Posting b/Documentation/development-process/5.Posting
index dd48132a74dd..f622c1e9f0f9 100644
--- a/Documentation/development-process/5.Posting
+++ b/Documentation/development-process/5.Posting
@@ -119,7 +119,7 @@ which takes quite a bit of time and thought after the "real work" has been
119done. When done properly, though, it is time well spent. 119done. When done properly, though, it is time well spent.
120 120
121 121
1225.4: PATCH FORMATTING 1225.4: PATCH FORMATTING AND CHANGELOGS
123 123
124So now you have a perfect series of patches for posting, but the work is 124So now you have a perfect series of patches for posting, but the work is
125not done quite yet. Each patch needs to be formatted into a message which 125not done quite yet. Each patch needs to be formatted into a message which
@@ -146,8 +146,33 @@ that end, each patch will be composed of the following:
146 - One or more tag lines, with, at a minimum, one Signed-off-by: line from 146 - One or more tag lines, with, at a minimum, one Signed-off-by: line from
147 the author of the patch. Tags will be described in more detail below. 147 the author of the patch. Tags will be described in more detail below.
148 148
149The above three items should, normally, be the text used when committing 149The items above, together, form the changelog for the patch. Writing good
150the change to a revision control system. They are followed by: 150changelogs is a crucial but often-neglected art; it's worth spending
151another moment discussing this issue. When writing a changelog, you should
152bear in mind that a number of different people will be reading your words.
153These include subsystem maintainers and reviewers who need to decide
154whether the patch should be included, distributors and other maintainers
155trying to decide whether a patch should be backported to other kernels, bug
156hunters wondering whether the patch is responsible for a problem they are
157chasing, users who want to know how the kernel has changed, and more. A
158good changelog conveys the needed information to all of these people in the
159most direct and concise way possible.
160
161To that end, the summary line should describe the effects of and motivation
162for the change as well as possible given the one-line constraint. The
163detailed description can then amplify on those topics and provide any
164needed additional information. If the patch fixes a bug, cite the commit
165which introduced the bug if possible. If a problem is associated with
166specific log or compiler output, include that output to help others
167searching for a solution to the same problem. If the change is meant to
168support other changes coming in a later patch, say so. If internal APIs are
169changed, detail those changes and how other developers should respond. In
170general, the more you can put yourself into the shoes of everybody who will
171be reading your changelog, the better that changelog (and the kernel as a
172whole) will be.
173
174Needless to say, the changelog should be the text used when committing the
175change to a revision control system. It will be followed by:
151 176
152 - The patch itself, in the unified ("-u") patch format. Using the "-p" 177 - The patch itself, in the unified ("-u") patch format. Using the "-p"
153 option to diff will associate function names with changes, making the 178 option to diff will associate function names with changes, making the
diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt
new file mode 100644
index 000000000000..994dd75475a6
--- /dev/null
+++ b/Documentation/device-mapper/dm-log.txt
@@ -0,0 +1,54 @@
1Device-Mapper Logging
2=====================
3The device-mapper logging code is used by some of the device-mapper
4RAID targets to track regions of the disk that are not consistent.
5A region (or portion of the address space) of the disk may be
6inconsistent because a RAID stripe is currently being operated on or
7a machine died while the region was being altered. In the case of
8mirrors, a region would be considered dirty/inconsistent while you
9are writing to it because the writes need to be replicated for all
10the legs of the mirror and may not reach the legs at the same time.
11Once all writes are complete, the region is considered clean again.
12
13There is a generic logging interface that the device-mapper RAID
14implementations use to perform logging operations (see
15dm_dirty_log_type in include/linux/dm-dirty-log.h). Various different
16logging implementations are available and provide different
17capabilities. The list includes:
18
19Type Files
20==== =====
21disk drivers/md/dm-log.c
22core drivers/md/dm-log.c
23userspace drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h
24
25The "disk" log type
26-------------------
27This log implementation commits the log state to disk. This way, the
28logging state survives reboots/crashes.
29
30The "core" log type
31-------------------
32This log implementation keeps the log state in memory. The log state
33will not survive a reboot or crash, but there may be a small boost in
34performance. This method can also be used if no storage device is
35available for storing log state.
36
37The "userspace" log type
38------------------------
39This log type simply provides a way to export the log API to userspace,
40so log implementations can be done there. This is done by forwarding most
41logging requests to userspace, where a daemon receives and processes the
42request.
43
44The structures used for communication between kernel and userspace are
45located in include/linux/dm-log-userspace.h. Due to the frequency,
46diversity, and 2-way communication nature of the exchanges between
47kernel and userspace, 'connector' is used as the interface for
48communication.
49
50There are currently two userspace log implementations that leverage this
51framework - "clustered_disk" and "clustered_core". These implementations
52provide a cluster-coherent log for shared-storage. Device-mapper mirroring
53can be used in a shared-storage environment when the cluster log implementations
54are employed.
diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.txt
new file mode 100644
index 000000000000..f4db2562175c
--- /dev/null
+++ b/Documentation/device-mapper/dm-queue-length.txt
@@ -0,0 +1,39 @@
1dm-queue-length
2===============
3
4dm-queue-length is a path selector module for device-mapper targets,
5which selects a path with the least number of in-flight I/Os.
6The path selector name is 'queue-length'.
7
8Table parameters for each path: [<repeat_count>]
9 <repeat_count>: The number of I/Os to dispatch using the selected
10 path before switching to the next path.
11 If not given, internal default is used. To check
12 the default value, see the activated table.
13
14Status for each path: <status> <fail-count> <in-flight>
15 <status>: 'A' if the path is active, 'F' if the path is failed.
16 <fail-count>: The number of path failures.
17 <in-flight>: The number of in-flight I/Os on the path.
18
19
20Algorithm
21=========
22
23dm-queue-length increments/decrements 'in-flight' when an I/O is
24dispatched/completed respectively.
25dm-queue-length selects a path with the minimum 'in-flight'.
26
27
28Examples
29========
30In case that 2 paths (sda and sdb) are used with repeat_count == 128.
31
32# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
33 | dmsetup create test
34#
35# dmsetup table
36test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
37#
38# dmsetup status
39test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt
new file mode 100644
index 000000000000..7d00668e97bb
--- /dev/null
+++ b/Documentation/device-mapper/dm-service-time.txt
@@ -0,0 +1,91 @@
1dm-service-time
2===============
3
4dm-service-time is a path selector module for device-mapper targets,
5which selects a path with the shortest estimated service time for
6the incoming I/O.
7
8The service time for each path is estimated by dividing the total size
9of in-flight I/Os on a path with the performance value of the path.
10The performance value is a relative throughput value among all paths
11in a path-group, and it can be specified as a table argument.
12
13The path selector name is 'service-time'.
14
15Table parameters for each path: [<repeat_count> [<relative_throughput>]]
16 <repeat_count>: The number of I/Os to dispatch using the selected
17 path before switching to the next path.
18 If not given, internal default is used. To check
19 the default value, see the activated table.
20 <relative_throughput>: The relative throughput value of the path
21 among all paths in the path-group.
22 The valid range is 0-100.
23 If not given, minimum value '1' is used.
24 If '0' is given, the path isn't selected while
25 other paths having a positive value are available.
26
27Status for each path: <status> <fail-count> <in-flight-size> \
28 <relative_throughput>
29 <status>: 'A' if the path is active, 'F' if the path is failed.
30 <fail-count>: The number of path failures.
31 <in-flight-size>: The size of in-flight I/Os on the path.
32 <relative_throughput>: The relative throughput value of the path
33 among all paths in the path-group.
34
35
36Algorithm
37=========
38
39dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
40dispatched and subtracts it when completed.
41Basically, dm-service-time selects a path having minimum service time
42which is calculated by:
43
44 ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
45
46However, some optimizations below are used to reduce the calculation
47as much as possible.
48
49 1. If the paths have the same 'relative_throughput', skip
50 the division and just compare the 'in-flight-size'.
51
52 2. If the paths have the same 'in-flight-size', skip the division
53 and just compare the 'relative_throughput'.
54
55 3. If some paths have non-zero 'relative_throughput' and others
56 have zero 'relative_throughput', ignore those paths with zero
57 'relative_throughput'.
58
59If such optimizations can't be applied, calculate service time, and
60compare service time.
61If calculated service time is equal, the path having maximum
62'relative_throughput' may be better. So compare 'relative_throughput'
63then.
64
65
66Examples
67========
68In case that 2 paths (sda and sdb) are used with repeat_count == 128
69and sda has an average throughput 1GB/s and sdb has 4GB/s,
70'relative_throughput' value may be '1' for sda and '4' for sdb.
71
72# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
73 | dmsetup create test
74#
75# dmsetup table
76test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
77#
78# dmsetup status
79test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
80
81
82Or '2' for sda and '8' for sdb would also be valid.
83
84# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
85 | dmsetup create test
86#
87# dmsetup table
88test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
89#
90# dmsetup status
91test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
diff --git a/Documentation/driver-model/device.txt b/Documentation/driver-model/device.txt
index a7cbfff40d07..a124f3126b0d 100644
--- a/Documentation/driver-model/device.txt
+++ b/Documentation/driver-model/device.txt
@@ -162,3 +162,35 @@ device_remove_file(dev,&dev_attr_power);
162 162
163The file name will be 'power' with a mode of 0644 (-rw-r--r--). 163The file name will be 'power' with a mode of 0644 (-rw-r--r--).
164 164
165Word of warning: While the kernel allows device_create_file() and
166device_remove_file() to be called on a device at any time, userspace has
167strict expectations on when attributes get created. When a new device is
168registered in the kernel, a uevent is generated to notify userspace (like
169udev) that a new device is available. If attributes are added after the
170device is registered, then userspace won't get notified and userspace will
171not know about the new attributes.
172
173This is important for device drivers that need to publish additional
174attributes for a device at driver probe time. If the device driver simply
175calls device_create_file() on the device structure passed to it, then
176userspace will never be notified of the new attributes. Instead, it should
177probably use class_create() and class->dev_attrs to set up a list of
178desired attributes in the module init function, and then, in the .probe()
179hook, use device_create() to create a new device as a child
180of the probed device. The new device will generate a new uevent and
181properly advertise the new attributes to userspace.
182
183For example, if a driver wanted to add the following attributes:
184struct device_attribute mydriver_attribs[] = {
185 __ATTR(port_count, 0444, port_count_show),
186 __ATTR(serial_number, 0444, serial_number_show),
187 NULL
188};
189
190Then in the module init function it would do:
191 mydriver_class = class_create(THIS_MODULE, "my_attrs");
192 mydriver_class->dev_attrs = mydriver_attribs;
193
194And assuming 'dev' is the struct device passed into the probe hook, the driver
195probe function would do something like:
196 device_create(mydriver_class, dev, chrdev, &private_data, "my_name");
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 387b8a720f4a..d79aead9418b 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -188,7 +188,7 @@ For example, you can do something like the following.
188 188
189 void my_midlayer_destroy_something() 189 void my_midlayer_destroy_something()
190 { 190 {
191 devres_release_group(dev, my_midlayer_create_soemthing); 191 devres_release_group(dev, my_midlayer_create_something);
192 } 192 }
193 193
194 194
diff --git a/Documentation/dvb/get_dvb_firmware b/Documentation/dvb/get_dvb_firmware
index 2f21ecd4c205..a52adfc9a57f 100644
--- a/Documentation/dvb/get_dvb_firmware
+++ b/Documentation/dvb/get_dvb_firmware
@@ -112,7 +112,7 @@ sub tda10045 {
112 112
113sub tda10046 { 113sub tda10046 {
114 my $sourcefile = "TT_PCI_2.19h_28_11_2006.zip"; 114 my $sourcefile = "TT_PCI_2.19h_28_11_2006.zip";
115 my $url = "http://technotrend-online.com/download/software/219/$sourcefile"; 115 my $url = "http://www.tt-download.com/download/updates/219/$sourcefile";
116 my $hash = "6a7e1e2f2644b162ff0502367553c72d"; 116 my $hash = "6a7e1e2f2644b162ff0502367553c72d";
117 my $outfile = "dvb-fe-tda10046.fw"; 117 my $outfile = "dvb-fe-tda10046.fw";
118 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); 118 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
@@ -129,8 +129,8 @@ sub tda10046 {
129} 129}
130 130
131sub tda10046lifeview { 131sub tda10046lifeview {
132 my $sourcefile = "Drv_2.11.02.zip"; 132 my $sourcefile = "7%5Cdrv_2.11.02.zip";
133 my $url = "http://www.lifeview.com.tw/drivers/pci_card/FlyDVB-T/$sourcefile"; 133 my $url = "http://www.lifeview.hk/dbimages/document/$sourcefile";
134 my $hash = "1ea24dee4eea8fe971686981f34fd2e0"; 134 my $hash = "1ea24dee4eea8fe971686981f34fd2e0";
135 my $outfile = "dvb-fe-tda10046.fw"; 135 my $outfile = "dvb-fe-tda10046.fw";
136 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); 136 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
@@ -317,7 +317,7 @@ sub nxt2002 {
317 317
318sub nxt2004 { 318sub nxt2004 {
319 my $sourcefile = "AVerTVHD_MCE_A180_Drv_v1.2.2.16.zip"; 319 my $sourcefile = "AVerTVHD_MCE_A180_Drv_v1.2.2.16.zip";
320 my $url = "http://www.aver.com/support/Drivers/$sourcefile"; 320 my $url = "http://www.avermedia-usa.com/support/Drivers/$sourcefile";
321 my $hash = "111cb885b1e009188346d72acfed024c"; 321 my $hash = "111cb885b1e009188346d72acfed024c";
322 my $outfile = "dvb-fe-nxt2004.fw"; 322 my $outfile = "dvb-fe-nxt2004.fw";
323 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); 323 my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
diff --git a/Documentation/edac.txt b/Documentation/edac.txt
index 8eda3fb66416..06f8f46692dc 100644
--- a/Documentation/edac.txt
+++ b/Documentation/edac.txt
@@ -23,8 +23,8 @@ first time, it was renamed to 'EDAC'.
23The bluesmoke project at sourceforge.net is now utilized as a 'staging area' 23The bluesmoke project at sourceforge.net is now utilized as a 'staging area'
24for EDAC development, before it is sent upstream to kernel.org 24for EDAC development, before it is sent upstream to kernel.org
25 25
26At the bluesmoke/EDAC project site, is a series of quilt patches against 26At the bluesmoke/EDAC project site is a series of quilt patches against
27recent kernels, stored in a SVN respository. For easier downloading, there 27recent kernels, stored in a SVN repository. For easier downloading, there
28is also a tarball snapshot available. 28is also a tarball snapshot available.
29 29
30============================================================================ 30============================================================================
@@ -73,9 +73,9 @@ the vendor should tie the parity status bits to 0 if they do not intend
73to generate parity. Some vendors do not do this, and thus the parity bit 73to generate parity. Some vendors do not do this, and thus the parity bit
74can "float" giving false positives. 74can "float" giving false positives.
75 75
76In the kernel there is a pci device attribute located in sysfs that is 76In the kernel there is a PCI device attribute located in sysfs that is
77checked by the EDAC PCI scanning code. If that attribute is set, 77checked by the EDAC PCI scanning code. If that attribute is set,
78PCI parity/error scannining is skipped for that device. The attribute 78PCI parity/error scanning is skipped for that device. The attribute
79is: 79is:
80 80
81 broken_parity_status 81 broken_parity_status
diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt
index 4bc374a14345..079305640790 100644
--- a/Documentation/fault-injection/fault-injection.txt
+++ b/Documentation/fault-injection/fault-injection.txt
@@ -29,16 +29,16 @@ o debugfs entries
29fault-inject-debugfs kernel module provides some debugfs entries for runtime 29fault-inject-debugfs kernel module provides some debugfs entries for runtime
30configuration of fault-injection capabilities. 30configuration of fault-injection capabilities.
31 31
32- /debug/fail*/probability: 32- /sys/kernel/debug/fail*/probability:
33 33
34 likelihood of failure injection, in percent. 34 likelihood of failure injection, in percent.
35 Format: <percent> 35 Format: <percent>
36 36
37 Note that one-failure-per-hundred is a very high error rate 37 Note that one-failure-per-hundred is a very high error rate
38 for some testcases. Consider setting probability=100 and configure 38 for some testcases. Consider setting probability=100 and configure
39 /debug/fail*/interval for such testcases. 39 /sys/kernel/debug/fail*/interval for such testcases.
40 40
41- /debug/fail*/interval: 41- /sys/kernel/debug/fail*/interval:
42 42
43 specifies the interval between failures, for calls to 43 specifies the interval between failures, for calls to
44 should_fail() that pass all the other tests. 44 should_fail() that pass all the other tests.
@@ -46,18 +46,18 @@ configuration of fault-injection capabilities.
46 Note that if you enable this, by setting interval>1, you will 46 Note that if you enable this, by setting interval>1, you will
47 probably want to set probability=100. 47 probably want to set probability=100.
48 48
49- /debug/fail*/times: 49- /sys/kernel/debug/fail*/times:
50 50
51 specifies how many times failures may happen at most. 51 specifies how many times failures may happen at most.
52 A value of -1 means "no limit". 52 A value of -1 means "no limit".
53 53
54- /debug/fail*/space: 54- /sys/kernel/debug/fail*/space:
55 55
56 specifies an initial resource "budget", decremented by "size" 56 specifies an initial resource "budget", decremented by "size"
57 on each call to should_fail(,size). Failure injection is 57 on each call to should_fail(,size). Failure injection is
58 suppressed until "space" reaches zero. 58 suppressed until "space" reaches zero.
59 59
60- /debug/fail*/verbose 60- /sys/kernel/debug/fail*/verbose
61 61
62 Format: { 0 | 1 | 2 } 62 Format: { 0 | 1 | 2 }
63 specifies the verbosity of the messages when failure is 63 specifies the verbosity of the messages when failure is
@@ -65,17 +65,17 @@ configuration of fault-injection capabilities.
65 log line per failure; '2' will print a call trace too -- useful 65 log line per failure; '2' will print a call trace too -- useful
66 to debug the problems revealed by fault injection. 66 to debug the problems revealed by fault injection.
67 67
68- /debug/fail*/task-filter: 68- /sys/kernel/debug/fail*/task-filter:
69 69
70 Format: { 'Y' | 'N' } 70 Format: { 'Y' | 'N' }
71 A value of 'N' disables filtering by process (default). 71 A value of 'N' disables filtering by process (default).
72 Any positive value limits failures to only processes indicated by 72 Any positive value limits failures to only processes indicated by
73 /proc/<pid>/make-it-fail==1. 73 /proc/<pid>/make-it-fail==1.
74 74
75- /debug/fail*/require-start: 75- /sys/kernel/debug/fail*/require-start:
76- /debug/fail*/require-end: 76- /sys/kernel/debug/fail*/require-end:
77- /debug/fail*/reject-start: 77- /sys/kernel/debug/fail*/reject-start:
78- /debug/fail*/reject-end: 78- /sys/kernel/debug/fail*/reject-end:
79 79
80 specifies the range of virtual addresses tested during 80 specifies the range of virtual addresses tested during
81 stacktrace walking. Failure is injected only if some caller 81 stacktrace walking. Failure is injected only if some caller
@@ -84,26 +84,26 @@ configuration of fault-injection capabilities.
84 Default required range is [0,ULONG_MAX) (whole of virtual address space). 84 Default required range is [0,ULONG_MAX) (whole of virtual address space).
85 Default rejected range is [0,0). 85 Default rejected range is [0,0).
86 86
87- /debug/fail*/stacktrace-depth: 87- /sys/kernel/debug/fail*/stacktrace-depth:
88 88
89 specifies the maximum stacktrace depth walked during search 89 specifies the maximum stacktrace depth walked during search
90 for a caller within [require-start,require-end) OR 90 for a caller within [require-start,require-end) OR
91 [reject-start,reject-end). 91 [reject-start,reject-end).
92 92
93- /debug/fail_page_alloc/ignore-gfp-highmem: 93- /sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem:
94 94
95 Format: { 'Y' | 'N' } 95 Format: { 'Y' | 'N' }
96 default is 'N', setting it to 'Y' won't inject failures into 96 default is 'N', setting it to 'Y' won't inject failures into
97 highmem/user allocations. 97 highmem/user allocations.
98 98
99- /debug/failslab/ignore-gfp-wait: 99- /sys/kernel/debug/failslab/ignore-gfp-wait:
100- /debug/fail_page_alloc/ignore-gfp-wait: 100- /sys/kernel/debug/fail_page_alloc/ignore-gfp-wait:
101 101
102 Format: { 'Y' | 'N' } 102 Format: { 'Y' | 'N' }
103 default is 'N', setting it to 'Y' will inject failures 103 default is 'N', setting it to 'Y' will inject failures
104 only into non-sleep allocations (GFP_ATOMIC allocations). 104 only into non-sleep allocations (GFP_ATOMIC allocations).
105 105
106- /debug/fail_page_alloc/min-order: 106- /sys/kernel/debug/fail_page_alloc/min-order:
107 107
108 specifies the minimum page allocation order for which failures 108 specifies the minimum page allocation order for which failures
109 are injected. 109 are injected.
@@ -166,13 +166,13 @@ o Inject slab allocation failures into module init/exit code
166#!/bin/bash 166#!/bin/bash
167 167
168FAILTYPE=failslab 168FAILTYPE=failslab
169echo Y > /debug/$FAILTYPE/task-filter 169echo Y > /sys/kernel/debug/$FAILTYPE/task-filter
170echo 10 > /debug/$FAILTYPE/probability 170echo 10 > /sys/kernel/debug/$FAILTYPE/probability
171echo 100 > /debug/$FAILTYPE/interval 171echo 100 > /sys/kernel/debug/$FAILTYPE/interval
172echo -1 > /debug/$FAILTYPE/times 172echo -1 > /sys/kernel/debug/$FAILTYPE/times
173echo 0 > /debug/$FAILTYPE/space 173echo 0 > /sys/kernel/debug/$FAILTYPE/space
174echo 2 > /debug/$FAILTYPE/verbose 174echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
175echo 1 > /debug/$FAILTYPE/ignore-gfp-wait 175echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
176 176
177faulty_system() 177faulty_system()
178{ 178{
@@ -217,20 +217,20 @@ then
217 exit 1 217 exit 1
218fi 218fi
219 219
220cat /sys/module/$module/sections/.text > /debug/$FAILTYPE/require-start 220cat /sys/module/$module/sections/.text > /sys/kernel/debug/$FAILTYPE/require-start
221cat /sys/module/$module/sections/.data > /debug/$FAILTYPE/require-end 221cat /sys/module/$module/sections/.data > /sys/kernel/debug/$FAILTYPE/require-end
222 222
223echo N > /debug/$FAILTYPE/task-filter 223echo N > /sys/kernel/debug/$FAILTYPE/task-filter
224echo 10 > /debug/$FAILTYPE/probability 224echo 10 > /sys/kernel/debug/$FAILTYPE/probability
225echo 100 > /debug/$FAILTYPE/interval 225echo 100 > /sys/kernel/debug/$FAILTYPE/interval
226echo -1 > /debug/$FAILTYPE/times 226echo -1 > /sys/kernel/debug/$FAILTYPE/times
227echo 0 > /debug/$FAILTYPE/space 227echo 0 > /sys/kernel/debug/$FAILTYPE/space
228echo 2 > /debug/$FAILTYPE/verbose 228echo 2 > /sys/kernel/debug/$FAILTYPE/verbose
229echo 1 > /debug/$FAILTYPE/ignore-gfp-wait 229echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-wait
230echo 1 > /debug/$FAILTYPE/ignore-gfp-highmem 230echo 1 > /sys/kernel/debug/$FAILTYPE/ignore-gfp-highmem
231echo 10 > /debug/$FAILTYPE/stacktrace-depth 231echo 10 > /sys/kernel/debug/$FAILTYPE/stacktrace-depth
232 232
233trap "echo 0 > /debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT 233trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT
234 234
235echo "Injecting errors into the module $module... (interrupt to stop)" 235echo "Injecting errors into the module $module... (interrupt to stop)"
236sleep 1000000 236sleep 1000000
diff --git a/Documentation/fb/sh7760fb.txt b/Documentation/fb/sh7760fb.txt
index c87bfe5c630a..b994c3b10549 100644
--- a/Documentation/fb/sh7760fb.txt
+++ b/Documentation/fb/sh7760fb.txt
@@ -1,7 +1,7 @@
1SH7760/SH7763 integrated LCDC Framebuffer driver 1SH7760/SH7763 integrated LCDC Framebuffer driver
2================================================ 2================================================
3 3
40. Overwiew 40. Overview
5----------- 5-----------
6The SH7760/SH7763 have an integrated LCD Display controller (LCDC) which 6The SH7760/SH7763 have an integrated LCD Display controller (LCDC) which
7supports (in theory) resolutions ranging from 1x1 to 1024x1024, 7supports (in theory) resolutions ranging from 1x1 to 1024x1024,
diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt
index ee277dd204b0..950d5a658cb3 100644
--- a/Documentation/fb/vesafb.txt
+++ b/Documentation/fb/vesafb.txt
@@ -95,7 +95,7 @@ There is no way to change the vesafb video mode and/or timings after
95booting linux. If you are not happy with the 60 Hz refresh rate, you 95booting linux. If you are not happy with the 60 Hz refresh rate, you
96have these options: 96have these options:
97 97
98 * configure and load the DOS-Tools for your the graphics board (if 98 * configure and load the DOS-Tools for the graphics board (if
99 available) and boot linux with loadlin. 99 available) and boot linux with loadlin.
100 * use a native driver (matroxfb/atyfb) instead of vesafb. If none 100 * use a native driver (matroxfb/atyfb) instead of vesafb. If none
101 is available, write a new one! 101 is available, write a new one!
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index de491a3e2313..f8cd450be9aa 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -6,6 +6,20 @@ be removed from this file.
6 6
7--------------------------- 7---------------------------
8 8
9What: IRQF_SAMPLE_RANDOM
10Check: IRQF_SAMPLE_RANDOM
11When: July 2009
12
13Why: Many IRQF_SAMPLE_RANDOM users are technically bogus as entropy
14 sources in the kernel's current entropy model. To resolve this, every
15 input point to the kernel's entropy pool needs to better document the
16 type of entropy source it actually is. This will be replaced with
17 additional add_*_randomness functions in drivers/char/random.c
18
19Who: Robin Getz <rgetz@blackfin.uclinux.org> & Matt Mackall <mpm@selenic.com>
20
21---------------------------
22
9What: The ieee80211_regdom module parameter 23What: The ieee80211_regdom module parameter
10When: March 2010 / desktop catchup 24When: March 2010 / desktop catchup
11 25
@@ -354,16 +368,6 @@ Who: Krzysztof Piotr Oledzki <ole@ans.pl>
354 368
355--------------------------- 369---------------------------
356 370
357What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client(),
358 i2c_adapter->client_register(), i2c_adapter->client_unregister
359When: 2.6.30
360Check: i2c_attach_client i2c_detach_client
361Why: Deprecated by the new (standard) device driver binding model. Use
362 i2c_driver->probe() and ->remove() instead.
363Who: Jean Delvare <khali@linux-fr.org>
364
365---------------------------
366
367What: fscher and fscpos drivers 371What: fscher and fscpos drivers
368When: June 2009 372When: June 2009
369Why: Deprecated by the new fschmd driver. 373Why: Deprecated by the new fschmd driver.
@@ -437,3 +441,20 @@ Why: Superseded by tdfxfb. I2C/DDC support used to live in a separate
437 driver but this caused driver conflicts. 441 driver but this caused driver conflicts.
438Who: Jean Delvare <khali@linux-fr.org> 442Who: Jean Delvare <khali@linux-fr.org>
439 Krzysztof Helt <krzysztof.h1@wp.pl> 443 Krzysztof Helt <krzysztof.h1@wp.pl>
444
445---------------------------
446
447What: CONFIG_RFKILL_INPUT
448When: 2.6.33
449Why: Should be implemented in userspace, policy daemon.
450Who: Johannes Berg <johannes@sipsolutions.net>
451
452----------------------------
453
454What: CONFIG_X86_OLD_MCE
455When: 2.6.32
456Why: Remove the old legacy 32bit machine check code. This has been
457 superseded by the newer machine check code from the 64bit port,
458 but the old version has been kept around for easier testing. Note this
459 doesn't impact the old P5 and WinChip machine check handlers.
460Who: Andi Kleen <andi@firstfloor.org>
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 8dd6db76171d..f15621ee5599 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -66,6 +66,10 @@ mandatory-locking.txt
66 - info on the Linux implementation of Sys V mandatory file locking. 66 - info on the Linux implementation of Sys V mandatory file locking.
67ncpfs.txt 67ncpfs.txt
68 - info on Novell Netware(tm) filesystem using NCP protocol. 68 - info on Novell Netware(tm) filesystem using NCP protocol.
69nfs41-server.txt
70 - info on the Linux server implementation of NFSv4 minor version 1.
71nfs-rdma.txt
72 - how to install and set up the Linux NFS/RDMA client and server software.
69nfsroot.txt 73nfsroot.txt
70 - short guide on setting up a diskless box with NFS root filesystem. 74 - short guide on setting up a diskless box with NFS root filesystem.
71nilfs2.txt 75nilfs2.txt
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 3120f8dd2c31..18b9d0ca0630 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -109,27 +109,28 @@ prototypes:
109 109
110locking rules: 110locking rules:
111 All may block. 111 All may block.
112 BKL s_lock s_umount 112 None have BKL
113alloc_inode: no no no 113 s_umount
114destroy_inode: no 114alloc_inode:
115dirty_inode: no (must not sleep) 115destroy_inode:
116write_inode: no 116dirty_inode: (must not sleep)
117drop_inode: no !!!inode_lock!!! 117write_inode:
118delete_inode: no 118drop_inode: !!!inode_lock!!!
119put_super: yes yes no 119delete_inode:
120write_super: no yes read 120put_super: write
121sync_fs: no no read 121write_super: read
122freeze_fs: ? 122sync_fs: read
123unfreeze_fs: ? 123freeze_fs: read
124statfs: no no no 124unfreeze_fs: read
125remount_fs: yes yes maybe (see below) 125statfs: no
126clear_inode: no 126remount_fs: maybe (see below)
127umount_begin: yes no no 127clear_inode:
128show_options: no (vfsmount->sem) 128umount_begin: no
129quota_read: no no no (see below) 129show_options: no (namespace_sem)
130quota_write: no no no (see below) 130quota_read: no (see below)
131 131quota_write: no (see below)
132->remount_fs() will have the s_umount lock if it's already mounted. 132
133->remount_fs() will have the s_umount exclusive lock if it's already mounted.
133When called from get_sb_single, it does NOT have the s_umount lock. 134When called from get_sb_single, it does NOT have the s_umount lock.
134->quota_read() and ->quota_write() functions are both guaranteed to 135->quota_read() and ->quota_write() functions are both guaranteed to
135be the only ones operating on the quota file by the quota code (via 136be the only ones operating on the quota file by the quota code (via
@@ -187,7 +188,7 @@ readpages: no
187write_begin: no locks the page yes 188write_begin: no locks the page yes
188write_end: no yes, unlocks yes 189write_end: no yes, unlocks yes
189perform_write: no n/a yes 190perform_write: no n/a yes
190bmap: yes 191bmap: no
191invalidatepage: no yes 192invalidatepage: no yes
192releasepage: no yes 193releasepage: no yes
193direct_IO: no 194direct_IO: no
diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt
index c6341745df37..8f78ded4b648 100644
--- a/Documentation/filesystems/autofs4-mount-control.txt
+++ b/Documentation/filesystems/autofs4-mount-control.txt
@@ -369,7 +369,7 @@ The call requires an initialized struct autofs_dev_ioctl. There are two
369possible variations. Both use the path field set to the path of the mount 369possible variations. Both use the path field set to the path of the mount
370point to check and the size field adjusted appropriately. One uses the 370point to check and the size field adjusted appropriately. One uses the
371ioctlfd field to identify a specific mount point to check while the other 371ioctlfd field to identify a specific mount point to check while the other
372variation uses the path and optionaly arg1 set to an autofs mount type. 372variation uses the path and optionally arg1 set to an autofs mount type.
373The call returns 1 if this is a mount point and sets arg1 to the device 373The call returns 1 if this is a mount point and sets arg1 to the device
374number of the mount and field arg2 to the relevant super block magic 374number of the mount and field arg2 to the relevant super block magic
375number (described below) or 0 if it isn't a mountpoint. In both cases 375number (described below) or 0 if it isn't a mountpoint. In both cases
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 4db125b3a5c6..2666b1ed5e9e 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -184,7 +184,7 @@ This has the following fields:
184 have index children. 184 have index children.
185 185
186 If this function is not supplied or if it returns NULL then the first 186 If this function is not supplied or if it returns NULL then the first
187 cache in the parent's list will be chosed, or failing that, the first 187 cache in the parent's list will be chosen, or failing that, the first
188 cache in the master list. 188 cache in the master list.
189 189
190 (4) A function to retrieve an object's key from the netfs [mandatory]. 190 (4) A function to retrieve an object's key from the netfs [mandatory].
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
new file mode 100644
index 000000000000..ed52af60c2d8
--- /dev/null
+++ b/Documentation/filesystems/debugfs.txt
@@ -0,0 +1,158 @@
1Copyright 2009 Jonathan Corbet <corbet@lwn.net>
2
3Debugfs exists as a simple way for kernel developers to make information
4available to user space. Unlike /proc, which is only meant for information
5about a process, or sysfs, which has strict one-value-per-file rules,
6debugfs has no rules at all. Developers can put any information they want
7there. The debugfs filesystem is also intended to not serve as a stable
8ABI to user space; in theory, there are no stability constraints placed on
9files exported there. The real world is not always so simple, though [1];
10even debugfs interfaces are best designed with the idea that they will need
11to be maintained forever.
12
13Debugfs is typically mounted with a command like:
14
15 mount -t debugfs none /sys/kernel/debug
16
17(Or an equivalent /etc/fstab line).
18
19Note that the debugfs API is exported GPL-only to modules.
20
21Code using debugfs should include <linux/debugfs.h>. Then, the first order
22of business will be to create at least one directory to hold a set of
23debugfs files:
24
25 struct dentry *debugfs_create_dir(const char *name, struct dentry *parent);
26
27This call, if successful, will make a directory called name underneath the
28indicated parent directory. If parent is NULL, the directory will be
29created in the debugfs root. On success, the return value is a struct
30dentry pointer which can be used to create files in the directory (and to
31clean it up at the end). A NULL return value indicates that something went
32wrong. If ERR_PTR(-ENODEV) is returned, that is an indication that the
33kernel has been built without debugfs support and none of the functions
34described below will work.
35
36The most general way to create a file within a debugfs directory is with:
37
38 struct dentry *debugfs_create_file(const char *name, mode_t mode,
39 struct dentry *parent, void *data,
40 const struct file_operations *fops);
41
42Here, name is the name of the file to create, mode describes the access
43permissions the file should have, parent indicates the directory which
44should hold the file, data will be stored in the i_private field of the
45resulting inode structure, and fops is a set of file operations which
46implement the file's behavior. At a minimum, the read() and/or write()
47operations should be provided; others can be included as needed. Again,
48the return value will be a dentry pointer to the created file, NULL for
49error, or ERR_PTR(-ENODEV) if debugfs support is missing.
50
51In a number of cases, the creation of a set of file operations is not
52actually necessary; the debugfs code provides a number of helper functions
53for simple situations. Files containing a single integer value can be
54created with any of:
55
56 struct dentry *debugfs_create_u8(const char *name, mode_t mode,
57 struct dentry *parent, u8 *value);
58 struct dentry *debugfs_create_u16(const char *name, mode_t mode,
59 struct dentry *parent, u16 *value);
60 struct dentry *debugfs_create_u32(const char *name, mode_t mode,
61 struct dentry *parent, u32 *value);
62 struct dentry *debugfs_create_u64(const char *name, mode_t mode,
63 struct dentry *parent, u64 *value);
64
65These files support both reading and writing the given value; if a specific
66file should not be written to, simply set the mode bits accordingly. The
67values in these files are in decimal; if hexadecimal is more appropriate,
68the following functions can be used instead:
69
70 struct dentry *debugfs_create_x8(const char *name, mode_t mode,
71 struct dentry *parent, u8 *value);
72 struct dentry *debugfs_create_x16(const char *name, mode_t mode,
73 struct dentry *parent, u16 *value);
74 struct dentry *debugfs_create_x32(const char *name, mode_t mode,
75 struct dentry *parent, u32 *value);
76
77Note that there is no debugfs_create_x64().
78
79These functions are useful as long as the developer knows the size of the
80value to be exported. Some types can have different widths on different
81architectures, though, complicating the situation somewhat. There is a
82function meant to help out in one special case:
83
84 struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
85 struct dentry *parent,
86 size_t *value);
87
88As might be expected, this function will create a debugfs file to represent
89a variable of type size_t.
90
91Boolean values can be placed in debugfs with:
92
93 struct dentry *debugfs_create_bool(const char *name, mode_t mode,
94 struct dentry *parent, u32 *value);
95
96A read on the resulting file will yield either Y (for non-zero values) or
97N, followed by a newline. If written to, it will accept either upper- or
98lower-case values, or 1 or 0. Any other input will be silently ignored.
99
100Finally, a block of arbitrary binary data can be exported with:
101
102 struct debugfs_blob_wrapper {
103 void *data;
104 unsigned long size;
105 };
106
107 struct dentry *debugfs_create_blob(const char *name, mode_t mode,
108 struct dentry *parent,
109 struct debugfs_blob_wrapper *blob);
110
111A read of this file will return the data pointed to by the
112debugfs_blob_wrapper structure. Some drivers use "blobs" as a simple way
113to return several lines of (static) formatted text output. This function
114can be used to export binary information, but there does not appear to be
115any code which does so in the mainline. Note that all files created with
116debugfs_create_blob() are read-only.
117
118There are a couple of other directory-oriented helper functions:
119
120 struct dentry *debugfs_rename(struct dentry *old_dir,
121 struct dentry *old_dentry,
122 struct dentry *new_dir,
123 const char *new_name);
124
125 struct dentry *debugfs_create_symlink(const char *name,
126 struct dentry *parent,
127 const char *target);
128
129A call to debugfs_rename() will give a new name to an existing debugfs
130file, possibly in a different directory. The new_name must not exist prior
131to the call; the return value is old_dentry with updated information.
132Symbolic links can be created with debugfs_create_symlink().
133
134There is one important thing that all debugfs users must take into account:
135there is no automatic cleanup of any directories created in debugfs. If a
136module is unloaded without explicitly removing debugfs entries, the result
137will be a lot of stale pointers and no end of highly antisocial behavior.
138So all debugfs users - at least those which can be built as modules - must
139be prepared to remove all files and directories they create there. A file
140can be removed with:
141
142 void debugfs_remove(struct dentry *dentry);
143
144The dentry value can be NULL, in which case nothing will be removed.
145
146Once upon a time, debugfs users were required to remember the dentry
147pointer for every debugfs file they created so that all files could be
148cleaned up. We live in more civilized times now, though, and debugfs users
149can call:
150
151 void debugfs_remove_recursive(struct dentry *dentry);
152
153If this function is passed a pointer for the dentry corresponding to the
154top-level directory, the entire hierarchy below that directory will be
155removed.
156
157Notes:
158 [1] http://lwn.net/Articles/309298/
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index e055acb6b2d4..67639f905f10 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -322,7 +322,7 @@ an upper limit on the block size imposed by the page size of the kernel,
322so 8kB blocks are only allowed on Alpha systems (and other architectures 322so 8kB blocks are only allowed on Alpha systems (and other architectures
323which support larger pages). 323which support larger pages).
324 324
325There is an upper limit of 32768 subdirectories in a single directory. 325There is an upper limit of 32000 subdirectories in a single directory.
326 326
327There is a "soft" upper limit of about 10-15k files in a single directory 327There is a "soft" upper limit of about 10-15k files in a single directory
328with the current linear linked-list directory implementation. This limit 328with the current linear linked-list directory implementation. This limit
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 97882df04865..7be02ac5fa36 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -235,6 +235,10 @@ minixdf Make 'df' act like Minix.
235 235
236debug Extra debugging information is sent to syslog. 236debug Extra debugging information is sent to syslog.
237 237
238abort Simulate the effects of calling ext4_abort() for
239 debugging purposes. This is normally used while
240 remounting a filesystem which is already mounted.
241
238errors=remount-ro Remount the filesystem read-only on an error. 242errors=remount-ro Remount the filesystem read-only on an error.
239errors=continue Keep going on a filesystem error. 243errors=continue Keep going on a filesystem error.
240errors=panic Panic and halt the machine if an error occurs. 244errors=panic Panic and halt the machine if an error occurs.
@@ -294,7 +298,7 @@ max_batch_time=usec Maximum amount of time ext4 should wait for
294 amount of time (on average) that it takes to 298 amount of time (on average) that it takes to
295 finish committing a transaction. Call this time 299 finish committing a transaction. Call this time
296 the "commit time". If the time that the 300 the "commit time". If the time that the
297 transactoin has been running is less than the 301 transaction has been running is less than the
298 commit time, ext4 will try sleeping for the 302 commit time, ext4 will try sleeping for the
299 commit time to see if other operations will join 303 commit time to see if other operations will join
300 the transaction. The commit time is capped by 304 the transaction. The commit time is capped by
@@ -328,7 +332,7 @@ noauto_da_alloc replacing existing files via patterns such as
328 journal commit, in the default data=ordered 332 journal commit, in the default data=ordered
329 mode, the data blocks of the new file are forced 333 mode, the data blocks of the new file are forced
330 to disk before the rename() operation is 334 to disk before the rename() operation is
331 commited. This provides roughly the same level 335 committed. This provides roughly the same level
332 of guarantees as ext3, and avoids the 336 of guarantees as ext3, and avoids the
333 "zero-length" problem that can happen when a 337 "zero-length" problem that can happen when a
334 system crashes before the delayed allocation 338 system crashes before the delayed allocation
@@ -358,7 +362,7 @@ written to the journal first, and then to its final location.
358In the event of a crash, the journal can be replayed, bringing both data and 362In the event of a crash, the journal can be replayed, bringing both data and
359metadata into a consistent state. This mode is the slowest except when data 363metadata into a consistent state. This mode is the slowest except when data
360needs to be read from and written to disk at the same time where it 364needs to be read from and written to disk at the same time where it
361outperforms all other modes. Curently ext4 does not have delayed 365outperforms all other modes. Currently ext4 does not have delayed
362allocation support if this data journalling mode is selected. 366allocation support if this data journalling mode is selected.
363 367
364References 368References
diff --git a/Documentation/filesystems/fiemap.txt b/Documentation/filesystems/fiemap.txt
index 1e3defcfe50b..606233cd4618 100644
--- a/Documentation/filesystems/fiemap.txt
+++ b/Documentation/filesystems/fiemap.txt
@@ -204,7 +204,7 @@ fiemap_check_flags() helper:
204 204
205int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); 205int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags);
206 206
207The struct fieinfo should be passed in as recieved from ioctl_fiemap(). The 207The struct fieinfo should be passed in as received from ioctl_fiemap(). The
208set of fiemap flags which the fs understands should be passed via fs_flags. If 208set of fiemap flags which the fs understands should be passed via fs_flags. If
209fiemap_check_flags finds invalid user flags, it will place the bad values in 209fiemap_check_flags finds invalid user flags, it will place the bad values in
210fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from 210fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
index 4dae9a3840bf..0494f78d87e4 100644
--- a/Documentation/filesystems/gfs2-glocks.txt
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -60,7 +60,7 @@ go_lock | Called for the first local holder of a lock
60go_unlock | Called on the final local unlock of a lock 60go_unlock | Called on the final local unlock of a lock
61go_dump | Called to print content of object for debugfs file, or on 61go_dump | Called to print content of object for debugfs file, or on
62 | error to dump glock to the log. 62 | error to dump glock to the log.
63go_type; | The type of the glock, LM_TYPE_..... 63go_type | The type of the glock, LM_TYPE_.....
64go_min_hold_time | The minimum hold time 64go_min_hold_time | The minimum hold time
65 65
66The minimum hold time for each lock is the time after a remote lock 66The minimum hold time for each lock is the time after a remote lock
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
index 593004b6bbab..5e3ab8f3beff 100644
--- a/Documentation/filesystems/gfs2.txt
+++ b/Documentation/filesystems/gfs2.txt
@@ -11,18 +11,15 @@ their I/O so file system consistency is maintained. One of the nifty
11features of GFS is perfect consistency -- changes made to the file system 11features of GFS is perfect consistency -- changes made to the file system
12on one machine show up immediately on all other machines in the cluster. 12on one machine show up immediately on all other machines in the cluster.
13 13
14GFS uses interchangeable inter-node locking mechanisms. Different lock 14GFS uses interchangeable inter-node locking mechanisms; the currently
15modules can plug into GFS and each file system selects the appropriate 15supported mechanisms are:
16lock module at mount time. Lock modules include:
17 16
18 lock_nolock -- allows gfs to be used as a local file system 17 lock_nolock -- allows gfs to be used as a local file system
19 18
20 lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking 19 lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
21 The dlm is found at linux/fs/dlm/ 20 The dlm is found at linux/fs/dlm/
22 21
23In addition to interfacing with an external locking manager, a gfs lock 22Lock_dlm depends on user space cluster management systems found
24module is responsible for interacting with external cluster management
25systems. Lock_dlm depends on user space cluster management systems found
26at the URL above. 23at the URL above.
27 24
28To use gfs as a local file system, no external clustering systems are 25To use gfs as a local file system, no external clustering systems are
@@ -31,13 +28,19 @@ needed, simply:
31 $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device 28 $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
32 $ mount -t gfs2 /dev/block_device /dir 29 $ mount -t gfs2 /dev/block_device /dir
33 30
34GFS2 is not on-disk compatible with previous versions of GFS. 31If you are using Fedora, you need to install the gfs2-utils package
32and, for lock_dlm, you will also need to install the cman package
33and write a cluster.conf as per the documentation.
34
35GFS2 is not on-disk compatible with previous versions of GFS, but it
36is pretty close.
35 37
36The following man pages can be found at the URL above: 38The following man pages can be found at the URL above:
37 gfs2_fsck to repair a filesystem 39 fsck.gfs2 to repair a filesystem
38 gfs2_grow to expand a filesystem online 40 gfs2_grow to expand a filesystem online
39 gfs2_jadd to add journals to a filesystem online 41 gfs2_jadd to add journals to a filesystem online
40 gfs2_tool to manipulate, examine and tune a filesystem 42 gfs2_tool to manipulate, examine and tune a filesystem
41 gfs2_quota to examine and change quota values in a filesystem 43 gfs2_quota to examine and change quota values in a filesystem
44 gfs2_convert to convert a gfs filesystem to gfs2 in-place
42 mount.gfs2 to help mount(8) mount a filesystem 45 mount.gfs2 to help mount(8) mount a filesystem
43 mkfs.gfs2 to make a filesystem 46 mkfs.gfs2 to make a filesystem
diff --git a/Documentation/filesystems/isofs.txt b/Documentation/filesystems/isofs.txt
index 6973b980ca2a..3c367c3b3608 100644
--- a/Documentation/filesystems/isofs.txt
+++ b/Documentation/filesystems/isofs.txt
@@ -23,8 +23,13 @@ Mount options unique to the isofs filesystem.
23 map=off Do not map non-Rock Ridge filenames to lower case 23 map=off Do not map non-Rock Ridge filenames to lower case
24 map=normal Map non-Rock Ridge filenames to lower case 24 map=normal Map non-Rock Ridge filenames to lower case
25 map=acorn As map=normal but also apply Acorn extensions if present 25 map=acorn As map=normal but also apply Acorn extensions if present
26 mode=xxx Sets the permissions on files to xxx 26 mode=xxx Sets the permissions on files to xxx unless Rock Ridge
27 dmode=xxx Sets the permissions on directories to xxx 27 extensions set the permissions otherwise
28 dmode=xxx Sets the permissions on directories to xxx unless Rock Ridge
29 extensions set the permissions otherwise
30 overriderockperm Set permissions on files and directories according to
31 'mode' and 'dmode' even though Rock Ridge extensions are
32 present.
28 nojoliet Ignore Joliet extensions if they are present. 33 nojoliet Ignore Joliet extensions if they are present.
29 norock Ignore Rock Ridge extensions if they are present. 34 norock Ignore Rock Ridge extensions if they are present.
30 hide Completely strip hidden files from the file system. 35 hide Completely strip hidden files from the file system.
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs-rdma.txt
index 85eaeaddd27c..e386f7e4bcee 100644
--- a/Documentation/filesystems/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs-rdma.txt
@@ -100,7 +100,7 @@ Installation
100 $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs 100 $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs
101 101
102 In this location, mount.nfs will be invoked automatically for NFS mounts 102 In this location, mount.nfs will be invoked automatically for NFS mounts
103 by the system mount commmand. 103 by the system mount command.
104 104
105 NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed 105 NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed
106 on the NFS client machine. You do not need this specific version of 106 on the NFS client machine. You do not need this specific version of
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 55c4300abfcb..01539f410676 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -39,9 +39,8 @@ Features which NILFS2 does not support yet:
39 - extended attributes 39 - extended attributes
40 - POSIX ACLs 40 - POSIX ACLs
41 - quotas 41 - quotas
42 - writable snapshots 42 - fsck
43 - remote backup (CDP) 43 - resize
44 - data integrity
45 - defragmentation 44 - defragmentation
46 45
47Mount options 46Mount options
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index ce84cfc9eae0..fad18f9456e4 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -5,11 +5,12 @@
5 Bodo Bauer <bb@ricochet.net> 5 Bodo Bauer <bb@ricochet.net>
6 6
72.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000 72.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000
8move /proc/sys Shen Feng <shen@cn.fujitsu.com> April 1 2009 8move /proc/sys Shen Feng <shen@cn.fujitsu.com> April 1 2009
9------------------------------------------------------------------------------ 9------------------------------------------------------------------------------
10Version 1.3 Kernel version 2.2.12 10Version 1.3 Kernel version 2.2.12
11 Kernel version 2.4.0-test11-pre4 11 Kernel version 2.4.0-test11-pre4
12------------------------------------------------------------------------------ 12------------------------------------------------------------------------------
13fixes/update part 1.1 Stefani Seibold <stefani@seibold.net> June 9 2009
13 14
14Table of Contents 15Table of Contents
15----------------- 16-----------------
@@ -116,7 +117,7 @@ The link self points to the process reading the file system. Each process
116subdirectory has the entries listed in Table 1-1. 117subdirectory has the entries listed in Table 1-1.
117 118
118 119
119Table 1-1: Process specific entries in /proc 120Table 1-1: Process specific entries in /proc
120.............................................................................. 121..............................................................................
121 File Content 122 File Content
122 clear_refs Clears page referenced bits shown in smaps output 123 clear_refs Clears page referenced bits shown in smaps output
@@ -134,46 +135,103 @@ Table 1-1: Process specific entries in /proc
134 status Process status in human readable form 135 status Process status in human readable form
135 wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan 136 wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
136 stack Report full stack trace, enable via CONFIG_STACKTRACE 137 stack Report full stack trace, enable via CONFIG_STACKTRACE
137 smaps Extension based on maps, the rss size for each mapped file 138 smaps an extension based on maps, showing the memory consumption of
139 each mapping
138.............................................................................. 140..............................................................................
139 141
140For example, to get the status information of a process, all you have to do is 142For example, to get the status information of a process, all you have to do is
141read the file /proc/PID/status: 143read the file /proc/PID/status:
142 144
143 >cat /proc/self/status 145 >cat /proc/self/status
144 Name: cat 146 Name: cat
145 State: R (running) 147 State: R (running)
146 Pid: 5452 148 Tgid: 5452
147 PPid: 743 149 Pid: 5452
150 PPid: 743
148 TracerPid: 0 (2.4) 151 TracerPid: 0 (2.4)
149 Uid: 501 501 501 501 152 Uid: 501 501 501 501
150 Gid: 100 100 100 100 153 Gid: 100 100 100 100
151 Groups: 100 14 16 154 FDSize: 256
152 VmSize: 1112 kB 155 Groups: 100 14 16
153 VmLck: 0 kB 156 VmPeak: 5004 kB
154 VmRSS: 348 kB 157 VmSize: 5004 kB
155 VmData: 24 kB 158 VmLck: 0 kB
156 VmStk: 12 kB 159 VmHWM: 476 kB
157 VmExe: 8 kB 160 VmRSS: 476 kB
158 VmLib: 1044 kB 161 VmData: 156 kB
159 SigPnd: 0000000000000000 162 VmStk: 88 kB
160 SigBlk: 0000000000000000 163 VmExe: 68 kB
161 SigIgn: 0000000000000000 164 VmLib: 1412 kB
162 SigCgt: 0000000000000000 165 VmPTE: 20 kB
163 CapInh: 00000000fffffeff 166 Threads: 1
164 CapPrm: 0000000000000000 167 SigQ: 0/28578
165 CapEff: 0000000000000000 168 SigPnd: 0000000000000000
166 169 ShdPnd: 0000000000000000
170 SigBlk: 0000000000000000
171 SigIgn: 0000000000000000
172 SigCgt: 0000000000000000
173 CapInh: 00000000fffffeff
174 CapPrm: 0000000000000000
175 CapEff: 0000000000000000
176 CapBnd: ffffffffffffffff
177 voluntary_ctxt_switches: 0
178 nonvoluntary_ctxt_switches: 1
167 179
168This shows you nearly the same information you would get if you viewed it with 180This shows you nearly the same information you would get if you viewed it with
169the ps command. In fact, ps uses the proc file system to obtain its 181the ps command. In fact, ps uses the proc file system to obtain its
170information. The statm file contains more detailed information about the 182information. But you get a more detailed view of the process by reading the
171process memory usage. Its seven fields are explained in Table 1-2. The stat 183file /proc/PID/status. Its fields are described in table 1-2.
172file contains details information about the process itself. Its fields are 184
173explained in Table 1-3. 185The statm file contains more detailed information about the process
186memory usage. Its seven fields are explained in Table 1-3. The stat file
187contains detailed information about the process itself. Its fields are
188explained in Table 1-4.
174 189
190Table 1-2: Contents of the status files (as of 2.6.30-rc7)
191..............................................................................
192 Field Content
193 Name filename of the executable
194 State state (R is running, S is sleeping, D is sleeping
195 in an uninterruptible wait, Z is zombie,
196 T is traced or stopped)
197 Tgid thread group ID
198 Pid process id
199 PPid process id of the parent process
200 TracerPid PID of process tracing this process (0 if not)
201 Uid Real, effective, saved set, and file system UIDs
202 Gid Real, effective, saved set, and file system GIDs
203 FDSize number of file descriptor slots currently allocated
204 Groups supplementary group list
205 VmPeak peak virtual memory size
206 VmSize total program size
207 VmLck locked memory size
208 VmHWM peak resident set size ("high water mark")
209 VmRSS resident set size (memory currently resident in RAM)
210 VmData size of data segment
211 VmStk size of stack segment
212 VmExe size of text segment
213 VmLib size of shared library code
214 VmPTE size of page table entries
215 Threads number of threads
216 SigQ number of signals queued/max. number for queue
217 SigPnd bitmap of pending signals for the thread
218 ShdPnd bitmap of shared pending signals for the process
219 SigBlk bitmap of blocked signals
220 SigIgn bitmap of ignored signals
221 SigCgt bitmap of caught signals
222 CapInh bitmap of inheritable capabilities
223 CapPrm bitmap of permitted capabilities
224 CapEff bitmap of effective capabilities
225 CapBnd bitmap of capabilities bounding set
226 Cpus_allowed mask of CPUs on which this process may run
227 Cpus_allowed_list Same as previous, but in "list format"
228 Mems_allowed mask of memory nodes allowed to this process
229 Mems_allowed_list Same as previous, but in "list format"
230 voluntary_ctxt_switches number of voluntary context switches
231 nonvoluntary_ctxt_switches number of non voluntary context switches
232..............................................................................
175 233
176Table 1-2: Contents of the statm files (as of 2.6.8-rc3) 234Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
177.............................................................................. 235..............................................................................
178 Field Content 236 Field Content
179 size total program size (pages) (same as VmSize in status) 237 size total program size (pages) (same as VmSize in status)
@@ -188,7 +246,7 @@ Table 1-2: Contents of the statm files (as of 2.6.8-rc3)
188.............................................................................. 246..............................................................................
189 247
190 248
191Table 1-3: Contents of the stat files (as of 2.6.22-rc3) 249Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
192.............................................................................. 250..............................................................................
193 Field Content 251 Field Content
194 pid process id 252 pid process id
@@ -222,10 +280,10 @@ Table 1-3: Contents of the stat files (as of 2.6.22-rc3)
222 start_stack address of the start of the stack 280 start_stack address of the start of the stack
223 esp current value of ESP 281 esp current value of ESP
224 eip current value of EIP 282 eip current value of EIP
225 pending bitmap of pending signals (obsolete) 283 pending bitmap of pending signals
226 blocked bitmap of blocked signals (obsolete) 284 blocked bitmap of blocked signals
227 sigign bitmap of ignored signals (obsolete) 285 sigign bitmap of ignored signals
228 sigcatch bitmap of catched signals (obsolete) 286 sigcatch bitmap of caught signals
229 wchan address where process went to sleep 287 wchan address where process went to sleep
230 0 (place holder) 288 0 (place holder)
231 0 (place holder) 289 0 (place holder)
@@ -234,19 +292,99 @@ Table 1-3: Contents of the stat files (as of 2.6.22-rc3)
234 rt_priority realtime priority 292 rt_priority realtime priority
235 policy scheduling policy (man sched_setscheduler) 293 policy scheduling policy (man sched_setscheduler)
236 blkio_ticks time spent waiting for block IO 294 blkio_ticks time spent waiting for block IO
295 gtime guest time of the task in jiffies
296 cgtime guest time of the task children in jiffies
237.............................................................................. 297..............................................................................
238 298
299The /proc/PID/maps file contains the currently mapped memory regions and
300their access permissions.
301
302The format is:
303
304address perms offset dev inode pathname
305
30608048000-08049000 r-xp 00000000 03:00 8312 /opt/test
30708049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
3080804a000-0806b000 rw-p 00000000 00:00 0 [heap]
309a7cb1000-a7cb2000 ---p 00000000 00:00 0
310a7cb2000-a7eb2000 rw-p 00000000 00:00 0
311a7eb2000-a7eb3000 ---p 00000000 00:00 0
312a7eb3000-a7ed5000 rw-p 00000000 00:00 0
313a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
314a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
315a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
316a800b000-a800e000 rw-p 00000000 00:00 0
317a800e000-a8022000 r-xp 00000000 03:00 14462 /lib/libpthread.so.0
318a8022000-a8023000 r--p 00013000 03:00 14462 /lib/libpthread.so.0
319a8023000-a8024000 rw-p 00014000 03:00 14462 /lib/libpthread.so.0
320a8024000-a8027000 rw-p 00000000 00:00 0
321a8027000-a8043000 r-xp 00000000 03:00 8317 /lib/ld-linux.so.2
322a8043000-a8044000 r--p 0001b000 03:00 8317 /lib/ld-linux.so.2
323a8044000-a8045000 rw-p 0001c000 03:00 8317 /lib/ld-linux.so.2
324aff35000-aff4a000 rw-p 00000000 00:00 0 [stack]
325ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]
326
327where "address" is the address space in the process that it occupies, "perms"
328is a set of permissions:
329
330 r = read
331 w = write
332 x = execute
333 s = shared
334 p = private (copy on write)
335
336"offset" is the offset into the mapping, "dev" is the device (major:minor), and
337"inode" is the inode on that device. 0 indicates that no inode is associated
338with the memory region, as the case would be with BSS (uninitialized data).
339The "pathname" shows the name of the file associated with this mapping. If the mapping
340is not associated with a file:
341
342 [heap] = the heap of the program
343 [stack] = the stack of the main process
344 [vdso] = the "virtual dynamic shared object",
345 the kernel system call handler
346
347 or if empty, the mapping is anonymous.
348
349
350The /proc/PID/smaps is an extension based on maps, showing the memory
351consumption for each of the process's mappings. For each of the mappings there
352is a series of lines such as the following:
353
35408048000-080bc000 r-xp 00000000 03:02 13130 /bin/bash
355Size: 1084 kB
356Rss: 892 kB
357Pss: 374 kB
358Shared_Clean: 892 kB
359Shared_Dirty: 0 kB
360Private_Clean: 0 kB
361Private_Dirty: 0 kB
362Referenced: 892 kB
363Swap: 0 kB
364KernelPageSize: 4 kB
365MMUPageSize: 4 kB
366
367The first of these lines shows the same information as is displayed for the
368mapping in /proc/PID/maps. The remaining lines show the size of the mapping,
369the amount of the mapping that is currently resident in RAM, the "proportional
370set size" (divide each shared page by the number of processes sharing it), the
371number of clean and dirty shared pages in the mapping, and the number of clean
372and dirty private pages in the mapping. The "Referenced" indicates the amount
373of memory currently marked as referenced or accessed.
374
375This file is only present if the CONFIG_MMU kernel configuration option is
376enabled.
239 377
2401.2 Kernel data 3781.2 Kernel data
241--------------- 379---------------
242 380
243Similar to the process entries, the kernel data files give information about 381Similar to the process entries, the kernel data files give information about
244the running kernel. The files used to obtain this information are contained in 382the running kernel. The files used to obtain this information are contained in
245/proc and are listed in Table 1-4. Not all of these will be present in your 383/proc and are listed in Table 1-5. Not all of these will be present in your
246system. It depends on the kernel configuration and the loaded modules, which 384system. It depends on the kernel configuration and the loaded modules, which
247files are there, and which are missing. 385files are there, and which are missing.
248 386
249Table 1-4: Kernel info in /proc 387Table 1-5: Kernel info in /proc
250.............................................................................. 388..............................................................................
251 File Content 389 File Content
252 apm Advanced power management info 390 apm Advanced power management info
@@ -283,6 +421,7 @@ Table 1-4: Kernel info in /proc
283 rtc Real time clock 421 rtc Real time clock
284 scsi SCSI info (see text) 422 scsi SCSI info (see text)
285 slabinfo Slab pool info 423 slabinfo Slab pool info
424 softirqs softirq usage
286 stat Overall statistics 425 stat Overall statistics
287 swaps Swap space utilization 426 swaps Swap space utilization
288 sys See chapter 2 427 sys See chapter 2
@@ -366,7 +505,7 @@ just those considered 'most important'. The new vectors are:
366 RES, CAL, TLB -- rescheduling, call and TLB flush interrupts are 505 RES, CAL, TLB -- rescheduling, call and TLB flush interrupts are
367 sent from one CPU to another per the needs of the OS. Typically, 506 sent from one CPU to another per the needs of the OS. Typically,
368 their statistics are used by kernel developers and interested users to 507 their statistics are used by kernel developers and interested users to
369 determine the occurance of interrupt of the given type. 508 determine the occurrence of interrupts of the given type.
370 509
371The above IRQ vectors are displayed only when relevant. For example, 510The above IRQ vectors are displayed only when relevant. For example,
372the threshold vector does not exist on x86_64 platforms. Others are 511the threshold vector does not exist on x86_64 platforms. Others are
@@ -551,7 +690,7 @@ Committed_AS: The amount of memory presently allocated on the system.
551 memory once that memory has been successfully allocated. 690 memory once that memory has been successfully allocated.
552VmallocTotal: total size of vmalloc memory area 691VmallocTotal: total size of vmalloc memory area
553 VmallocUsed: amount of vmalloc area which is used 692 VmallocUsed: amount of vmalloc area which is used
554VmallocChunk: largest contigious block of vmalloc area which is free 693VmallocChunk: largest contiguous block of vmalloc area which is free
555 694
556.............................................................................. 695..............................................................................
557 696
@@ -597,6 +736,25 @@ on the kind of area :
5970xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 ... 7360xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 ...
598 pages=10 vmalloc N0=10 737 pages=10 vmalloc N0=10
599 738
739..............................................................................
740
741softirqs:
742
743Provides counts of softirq handlers serviced since boot time, for each cpu.
744
745> cat /proc/softirqs
746 CPU0 CPU1 CPU2 CPU3
747 HI: 0 0 0 0
748 TIMER: 27166 27120 27097 27034
749 NET_TX: 0 0 0 17
750 NET_RX: 42 0 0 39
751 BLOCK: 0 0 107 1121
752 TASKLET: 0 0 0 290
753 SCHED: 27035 26983 26971 26746
754 HRTIMER: 0 0 0 0
755 RCU: 1678 1769 2178 2250
756
757
6001.3 IDE devices in /proc/ide 7581.3 IDE devices in /proc/ide
601---------------------------- 759----------------------------
602 760
@@ -614,10 +772,10 @@ IDE devices:
614 772
615More detailed information can be found in the controller specific 773More detailed information can be found in the controller specific
616subdirectories. These are named ide0, ide1 and so on. Each of these 774subdirectories. These are named ide0, ide1 and so on. Each of these
617directories contains the files shown in table 1-5. 775directories contains the files shown in table 1-6.
618 776
619 777
620Table 1-5: IDE controller info in /proc/ide/ide? 778Table 1-6: IDE controller info in /proc/ide/ide?
621.............................................................................. 779..............................................................................
622 File Content 780 File Content
623 channel IDE channel (0 or 1) 781 channel IDE channel (0 or 1)
@@ -627,11 +785,11 @@ Table 1-5: IDE controller info in /proc/ide/ide?
627.............................................................................. 785..............................................................................
628 786
629Each device connected to a controller has a separate subdirectory in the 787Each device connected to a controller has a separate subdirectory in the
630controllers directory. The files listed in table 1-6 are contained in these 788controllers directory. The files listed in table 1-7 are contained in these
631directories. 789directories.
632 790
633 791
634Table 1-6: IDE device information 792Table 1-7: IDE device information
635.............................................................................. 793..............................................................................
636 File Content 794 File Content
637 cache The cache 795 cache The cache
@@ -673,12 +831,12 @@ the drive parameters:
6731.4 Networking info in /proc/net 8311.4 Networking info in /proc/net
674-------------------------------- 832--------------------------------
675 833
676The subdirectory /proc/net follows the usual pattern. Table 1-6 shows the 834The subdirectory /proc/net follows the usual pattern. Table 1-8 shows the
677additional values you get for IP version 6 if you configure the kernel to 835additional values you get for IP version 6 if you configure the kernel to
678support this. Table 1-7 lists the files and their meaning. 836support this. Table 1-9 lists the files and their meaning.
679 837
680 838
681Table 1-6: IPv6 info in /proc/net 839Table 1-8: IPv6 info in /proc/net
682.............................................................................. 840..............................................................................
683 File Content 841 File Content
684 udp6 UDP sockets (IPv6) 842 udp6 UDP sockets (IPv6)
@@ -693,7 +851,7 @@ Table 1-6: IPv6 info in /proc/net
693.............................................................................. 851..............................................................................
694 852
695 853
696Table 1-7: Network info in /proc/net 854Table 1-9: Network info in /proc/net
697.............................................................................. 855..............................................................................
698 File Content 856 File Content
699 arp Kernel ARP table 857 arp Kernel ARP table
@@ -817,10 +975,10 @@ The directory /proc/parport contains information about the parallel ports of
817your system. It has one subdirectory for each port, named after the port 975your system. It has one subdirectory for each port, named after the port
818number (0,1,2,...). 976number (0,1,2,...).
819 977
820These directories contain the four files shown in Table 1-8. 978These directories contain the four files shown in Table 1-10.
821 979
822 980
823Table 1-8: Files in /proc/parport 981Table 1-10: Files in /proc/parport
824.............................................................................. 982..............................................................................
825 File Content 983 File Content
826 autoprobe Any IEEE-1284 device ID information that has been acquired. 984 autoprobe Any IEEE-1284 device ID information that has been acquired.
@@ -838,10 +996,10 @@ Table 1-8: Files in /proc/parport
838 996
839Information about the available and actually used tty's can be found in the 997Information about the available and actually used tty's can be found in the
840directory /proc/tty. You'll find entries for drivers and line disciplines in 998directory /proc/tty. You'll find entries for drivers and line disciplines in
841this directory, as shown in Table 1-9. 999this directory, as shown in Table 1-11.
842 1000
843 1001
844Table 1-9: Files in /proc/tty 1002Table 1-11: Files in /proc/tty
845.............................................................................. 1003..............................................................................
846 File Content 1004 File Content
847 drivers list of drivers and their usage 1005 drivers list of drivers and their usage
@@ -883,6 +1041,7 @@ since the system first booted. For a quick look, simply cat the file:
883 processes 2915 1041 processes 2915
884 procs_running 1 1042 procs_running 1
885 procs_blocked 0 1043 procs_blocked 0
1044 softirq 183433 0 21755 12 39 1137 231 21459 2263
886 1045
887The very first "cpu" line aggregates the numbers in all of the other "cpuN" 1046The very first "cpu" line aggregates the numbers in all of the other "cpuN"
888lines. These numbers identify the amount of time the CPU has spent performing 1047lines. These numbers identify the amount of time the CPU has spent performing
@@ -918,6 +1077,11 @@ CPUs.
918The "procs_blocked" line gives the number of processes currently blocked, 1077The "procs_blocked" line gives the number of processes currently blocked,
919waiting for I/O to complete. 1078waiting for I/O to complete.
920 1079
1080The "softirq" line gives counts of softirqs serviced since boot time, for each
1081of the possible system softirqs. The first column is the total of all
1082softirqs serviced; each subsequent column is the total for that particular
1083softirq.
1084
921 1085
9221.9 Ext4 file system parameters 10861.9 Ext4 file system parameters
923------------------------------ 1087------------------------------
@@ -926,9 +1090,9 @@ Information about mounted ext4 file systems can be found in
926/proc/fs/ext4. Each mounted filesystem will have a directory in 1090/proc/fs/ext4. Each mounted filesystem will have a directory in
927/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or 1091/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
928/proc/fs/ext4/dm-0). The files in each per-device directory are shown 1092/proc/fs/ext4/dm-0). The files in each per-device directory are shown
929in Table 1-10, below. 1093in Table 1-12, below.
930 1094
931Table 1-10: Files in /proc/fs/ext4/<devname> 1095Table 1-12: Files in /proc/fs/ext4/<devname>
932.............................................................................. 1096..............................................................................
933 File Content 1097 File Content
934 mb_groups details of multiblock allocator buddy cache of free blocks 1098 mb_groups details of multiblock allocator buddy cache of free blocks
@@ -1003,11 +1167,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS
10033.1 /proc/<pid>/oom_adj - Adjust the oom-killer score 11673.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
1004------------------------------------------------------ 1168------------------------------------------------------
1005 1169
1006This file can be used to adjust the score used to select which processes 1170This file can be used to adjust the score used to select which processes should
1007should be killed in an out-of-memory situation. Giving it a high score will 1171be killed in an out-of-memory situation. The oom_adj value is a characteristic
1008increase the likelihood of this process being killed by the oom-killer. Valid 1172of the task's mm, so all threads that share an mm with pid will have the same
1009values are in the range -16 to +15, plus the special value -17, which disables 1173oom_adj value. A high value will increase the likelihood of this process being
1010oom-killing altogether for this process. 1174killed by the oom-killer. Valid values are in the range -16 to +15 as
1175explained below and a special value of -17, which disables oom-killing
1176altogether for threads sharing pid's mm.
1011 1177
1012The process to be killed in an out-of-memory situation is selected among all others 1178The process to be killed in an out-of-memory situation is selected among all others
1013based on its badness score. This value equals the original memory size of the process 1179based on its badness score. This value equals the original memory size of the process
@@ -1021,6 +1187,9 @@ the parent's score if they do not share the same memory. Thus forking servers
1021are the prime candidates to be killed. Having only one 'hungry' child will make 1187are the prime candidates to be killed. Having only one 'hungry' child will make
1022parent less preferable than the child. 1188parent less preferable than the child.
1023 1189
1190/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from
1191oom-killing already.
1192
1024/proc/<pid>/oom_score shows process' current badness score. 1193/proc/<pid>/oom_score shows process' current badness score.
1025 1194
1026The following heuristics are then applied: 1195The following heuristics are then applied:
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
index 26e4b8bc53ee..85354b32d731 100644
--- a/Documentation/filesystems/sysfs-pci.txt
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -72,7 +72,7 @@ The 'rom' file is special in that it provides read-only access to the device's
72ROM file, if available. It's disabled by default, however, so applications 72ROM file, if available. It's disabled by default, however, so applications
73should write the string "1" to the file to enable it before attempting a read 73should write the string "1" to the file to enable it before attempting a read
74call, and disable it following the access by writing "0" to the file. Note 74call, and disable it following the access by writing "0" to the file. Note
75that the device must be enabled for a rom read to return data succesfully. 75that the device must be enabled for a rom read to return data successfully.
76In the event a driver is not bound to the device, it can be enabled using the 76In the event a driver is not bound to the device, it can be enabled using the
77'enable' file, documented above. 77'enable' file, documented above.
78 78
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index 3a5ddc96901a..b58b84b50fa2 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -124,14 +124,19 @@ sys_immutable -- If set, ATTR_SYS attribute on FAT is handled as
124flush -- If set, the filesystem will try to flush to disk more 124flush -- If set, the filesystem will try to flush to disk more
125 early than normal. Not set by default. 125 early than normal. Not set by default.
126 126
127rodir -- FAT has the ATTR_RO (read-only) attribute. But on Windows, 127rodir -- FAT has the ATTR_RO (read-only) attribute. On Windows,
128 the ATTR_RO of the directory will be just ignored actually, 128 the ATTR_RO of the directory will just be ignored,
129 and is used by only applications as flag. E.g. it's setted 129 and is used only by applications as a flag (e.g. it's set
130 for the customized folder. 130 for the customized folder).
131 131
132 If you want to use ATTR_RO as read-only flag even for 132 If you want to use ATTR_RO as read-only flag even for
133 the directory, set this option. 133 the directory, set this option.
134 134
135errors=panic|continue|remount-ro
136 -- specify FAT behavior on critical errors: panic, continue
137 without doing anything or remount the partition in
138 read-only mode (default behavior).
139
135<bool>: 0,1,yes,no,true,false 140<bool>: 0,1,yes,no,true,false
136 141
137TODO 142TODO
diff --git a/Documentation/firmware_class/README b/Documentation/firmware_class/README
index c3480aa66ba8..7eceaff63f5f 100644
--- a/Documentation/firmware_class/README
+++ b/Documentation/firmware_class/README
@@ -77,7 +77,8 @@
77 seconds for the whole load operation. 77 seconds for the whole load operation.
78 78
79 - request_firmware_nowait() is also provided for convenience in 79 - request_firmware_nowait() is also provided for convenience in
80 non-user contexts. 80 user contexts to request firmware asynchronously, but can't be called
81 in atomic contexts.
81 82
82 83
83 about in-kernel persistence: 84 about in-kernel persistence:
diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt
new file mode 100644
index 000000000000..9dc1ff4fd536
--- /dev/null
+++ b/Documentation/futex-requeue-pi.txt
@@ -0,0 +1,131 @@
1Futex Requeue PI
2----------------
3
4Requeueing of tasks from a non-PI futex to a PI futex requires
5special handling in order to ensure the underlying rt_mutex is never
6left without an owner if it has waiters; doing so would break the PI
7boosting logic [see rt-mutex-design.txt]. For the purposes of
8brevity, this action will be referred to as "requeue_pi" throughout
9this document. Priority inheritance is abbreviated throughout as
10"PI".
11
12Motivation
13----------
14
15Without requeue_pi, the glibc implementation of
16pthread_cond_broadcast() must resort to waking all the tasks waiting
17on a pthread_condvar and letting them try to sort out which task
18gets to run first in classic thundering-herd formation. An ideal
19implementation would wake the highest-priority waiter, and leave the
20rest to the natural wakeup inherent in unlocking the mutex
21associated with the condvar.
22
23Consider the simplified glibc calls:
24
25/* caller must lock mutex */
26pthread_cond_wait(cond, mutex)
27{
28 lock(cond->__data.__lock);
29 unlock(mutex);
30 do {
31 unlock(cond->__data.__lock);
32 futex_wait(cond->__data.__futex);
33 lock(cond->__data.__lock);
34 } while(...)
35 unlock(cond->__data.__lock);
36 lock(mutex);
37}
38
39pthread_cond_broadcast(cond)
40{
41 lock(cond->__data.__lock);
42 unlock(cond->__data.__lock);
43 futex_requeue(cond->__data.__futex, cond->mutex);
44}
45
46Once pthread_cond_broadcast() requeues the tasks, the cond->mutex
47has waiters. Note that pthread_cond_wait() attempts to lock the
48mutex only after it has returned to user space. This will leave the
49underlying rt_mutex with waiters, and no owner, breaking the
50previously mentioned PI-boosting algorithms.
51
52In order to support PI-aware pthread_condvar's, the kernel needs to
53be able to requeue tasks to PI futexes. This support implies that
54upon a successful futex_wait system call, the caller would return to
55user space already holding the PI futex. The glibc implementation
56would be modified as follows:
57
58
59/* caller must lock mutex */
60pthread_cond_wait_pi(cond, mutex)
61{
62 lock(cond->__data.__lock);
63 unlock(mutex);
64 do {
65 unlock(cond->__data.__lock);
66 futex_wait_requeue_pi(cond->__data.__futex);
67 lock(cond->__data.__lock);
68 } while(...)
69 unlock(cond->__data.__lock);
70 /* the kernel acquired the mutex for us */
71}
72
73pthread_cond_broadcast_pi(cond)
74{
75 lock(cond->__data.__lock);
76 unlock(cond->__data.__lock);
77 futex_requeue_pi(cond->__data.__futex, cond->mutex);
78}
79
80The actual glibc implementation will likely test for PI and make the
81necessary changes inside the existing calls rather than creating new
82calls for the PI cases. Similar changes are needed for
83pthread_cond_timedwait() and pthread_cond_signal().
84
85Implementation
86--------------
87
88In order to ensure the rt_mutex has an owner if it has waiters, it
89is necessary for both the requeue code, as well as the waiting code,
90to be able to acquire the rt_mutex before returning to user space.
91The requeue code cannot simply wake the waiter and leave it to
92acquire the rt_mutex as it would open a race window between the
93requeue call returning to user space and the waiter waking and
94starting to run. This is especially true in the uncontended case.
95
96The solution involves two new rt_mutex helper routines,
97rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which
98allow the requeue code to acquire an uncontended rt_mutex on behalf
99of the waiter and to enqueue the waiter on a contended rt_mutex.
100Two new system calls provide the kernel<->user interface to
101requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI.
102
103FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait()
104and pthread_cond_timedwait()) to block on the initial futex and wait
105to be requeued to a PI-aware futex. The implementation is the
106result of a high-speed collision between futex_wait() and
107futex_lock_pi(), with some extra logic to check for the additional
108wake-up scenarios.
109
110FUTEX_CMP_REQUEUE_PI is called by the waker
111(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and
112possibly wake the waiting tasks. Internally, this system call is
113still handled by futex_requeue (by passing requeue_pi=1). Before
114requeueing, futex_requeue() attempts to acquire the requeue target
115PI futex on behalf of the top waiter. If it can, this waiter is
116woken. futex_requeue() then proceeds to requeue the remaining
117nr_wake+nr_requeue tasks to the PI futex, calling
118rt_mutex_start_proxy_lock() prior to each requeue to prepare the
119task as a waiter on the underlying rt_mutex. It is possible that
120the lock can be acquired at this stage as well; if so, the next
121waiter is woken to finish the acquisition of the lock.
122
123FUTEX_CMP_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but
124their sum is all that really matters. futex_requeue() will wake or
125requeue up to nr_wake + nr_requeue tasks. It will wake only as many
126tasks as it can acquire the lock for, which in the majority of cases
127should be 0 as good programming practice dictates that the caller of
128either pthread_cond_broadcast() or pthread_cond_signal() acquire the
129mutex prior to making the call. FUTEX_CMP_REQUEUE_PI requires that
130nr_wake=1. nr_requeue should be INT_MAX for broadcast and 0 for
131signal.
diff --git a/Documentation/gcov.txt b/Documentation/gcov.txt
new file mode 100644
index 000000000000..e716aadb3a33
--- /dev/null
+++ b/Documentation/gcov.txt
@@ -0,0 +1,246 @@
1Using gcov with the Linux kernel
2================================
3
41. Introduction
52. Preparation
63. Customization
74. Files
85. Modules
96. Separated build and test machines
107. Troubleshooting
11Appendix A: sample script: gather_on_build.sh
12Appendix B: sample script: gather_on_test.sh
13
14
151. Introduction
16===============
17
18gcov profiling kernel support enables the use of GCC's coverage testing
19tool gcov [1] with the Linux kernel. Coverage data of a running kernel
20is exported in gcov-compatible format via the "gcov" debugfs directory.
21To get coverage data for a specific file, change to the kernel build
22directory and use gcov with the -o option as follows (requires root):
23
24# cd /tmp/linux-out
25# gcov -o /sys/kernel/debug/gcov/tmp/linux-out/kernel spinlock.c
26
27This will create source code files annotated with execution counts
28in the current directory. In addition, graphical gcov front-ends such
29as lcov [2] can be used to automate the process of collecting data
30for the entire kernel and provide coverage overviews in HTML format.
31
32Possible uses:
33
34* debugging (has this line been reached at all?)
35* test improvement (how do I change my test to cover these lines?)
36* minimizing kernel configurations (do I need this option if the
37 associated code is never run?)
38
39--
40
41[1] http://gcc.gnu.org/onlinedocs/gcc/Gcov.html
42[2] http://ltp.sourceforge.net/coverage/lcov.php
43
44
452. Preparation
46==============
47
48Configure the kernel with:
49
50 CONFIG_DEBUG_FS=y
51 CONFIG_GCOV_KERNEL=y
52
53and to get coverage data for the entire kernel:
54
55 CONFIG_GCOV_PROFILE_ALL=y
56
57Note that kernels compiled with profiling flags will be significantly
58larger and run slower. Also CONFIG_GCOV_PROFILE_ALL may not be supported
59on all architectures.
60
61Profiling data will only become accessible once debugfs has been
62mounted:
63
64 mount -t debugfs none /sys/kernel/debug
65
66
673. Customization
68================
69
70To enable profiling for specific files or directories, add a line
71similar to the following to the respective kernel Makefile:
72
73 For a single file (e.g. main.o):
74 GCOV_PROFILE_main.o := y
75
76 For all files in one directory:
77 GCOV_PROFILE := y
78
79To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
80is specified, use:
81
82 GCOV_PROFILE_main.o := n
83 and:
84 GCOV_PROFILE := n
85
86Only files which are linked to the main kernel image or are compiled as
87kernel modules are supported by this mechanism.
88
89
904. Files
91========
92
93The gcov kernel support creates the following files in debugfs:
94
95 /sys/kernel/debug/gcov
96 Parent directory for all gcov-related files.
97
98 /sys/kernel/debug/gcov/reset
99 Global reset file: resets all coverage data to zero when
100 written to.
101
102 /sys/kernel/debug/gcov/path/to/compile/dir/file.gcda
103 The actual gcov data file as understood by the gcov
104 tool. Resets file coverage data to zero when written to.
105
106 /sys/kernel/debug/gcov/path/to/compile/dir/file.gcno
107 Symbolic link to a static data file required by the gcov
108 tool. This file is generated by gcc when compiling with
109 option -ftest-coverage.
110
111
1125. Modules
113==========
114
115Kernel modules may contain cleanup code which is only run during
116module unload time. The gcov mechanism provides a means to collect
117coverage data for such code by keeping a copy of the data associated
118with the unloaded module. This data remains available through debugfs.
119Once the module is loaded again, the associated coverage counters are
120initialized with the data from its previous instantiation.
121
122This behavior can be deactivated by specifying the gcov_persist kernel
123parameter:
124
125 gcov_persist=0
126
127At run-time, a user can also choose to discard data for an unloaded
128module by writing to its data file or the global reset file.
129
130
1316. Separated build and test machines
132====================================
133
134The gcov kernel profiling infrastructure is designed to work
135out-of-the-box for setups where kernels are built and run on the same
136machine. In cases where the kernel runs on a separate machine, special
137preparations must be made, depending on where the gcov tool is used:
138
139a) gcov is run on the TEST machine
140
141The gcov tool version on the test machine must be compatible with the
142gcc version used for kernel build. Also the following files need to be
143copied from build to test machine:
144
145from the source tree:
146 - all C source files + headers
147
148from the build tree:
149 - all C source files + headers
150 - all .gcda and .gcno files
151 - all links to directories
152
153It is important to note that these files need to be placed into the
154exact same file system location on the test machine as on the build
155machine. If any of the path components is a symbolic link, the actual
156directory needs to be used instead (due to make's CURDIR handling).
157
158b) gcov is run on the BUILD machine
159
160The following files need to be copied after each test case from test
161to build machine:
162
163from the gcov directory in debugfs:
164 - all .gcda files
165 - all links to .gcno files
166
167These files can be copied to any location on the build machine. gcov
168must then be called with the -o option pointing to that directory.
169
170Example directory setup on the build machine:
171
172 /tmp/linux: kernel source tree
173 /tmp/out: kernel build directory as specified by make O=
174 /tmp/coverage: location of the files copied from the test machine
175
176 [user@build] cd /tmp/out
177 [user@build] gcov -o /tmp/coverage/tmp/out/init main.c
178
179
1807. Troubleshooting
181==================
182
183Problem: Compilation aborts during linker step.
184Cause: Profiling flags are specified for source files which are not
185 linked to the main kernel or which are linked by a custom
186 linker procedure.
187Solution: Exclude affected source files from profiling by specifying
188 GCOV_PROFILE := n or GCOV_PROFILE_basename.o := n in the
189 corresponding Makefile.
190
191
192Appendix A: gather_on_build.sh
193==============================
194
195Sample script to gather coverage meta files on the build machine
196(see 6a):
197
198#!/bin/bash
199
200KSRC=$1
201KOBJ=$2
202DEST=$3
203
204if [ -z "$KSRC" ] || [ -z "$KOBJ" ] || [ -z "$DEST" ]; then
205 echo "Usage: $0 <ksrc directory> <kobj directory> <output.tar.gz>" >&2
206 exit 1
207fi
208
209KSRC=$(cd $KSRC; printf "all:\n\t@echo \${CURDIR}\n" | make -f -)
210KOBJ=$(cd $KOBJ; printf "all:\n\t@echo \${CURDIR}\n" | make -f -)
211
212find $KSRC $KOBJ \( -name '*.gcno' -o -name '*.[ch]' -o -type l \) -a \
213 -perm /u+r,g+r | tar cfz $DEST -P -T -
214
215if [ $? -eq 0 ] ; then
216 echo "$DEST successfully created, copy to test system and unpack with:"
217 echo " tar xfz $DEST -P"
218else
219 echo "Could not create file $DEST"
220fi
221
222
223Appendix B: gather_on_test.sh
224=============================
225
226Sample script to gather coverage data files on the test machine
227(see 6b):
228
229#!/bin/bash
230
231DEST=$1
232GCDA=/sys/kernel/debug/gcov
233
234if [ -z "$DEST" ] ; then
235 echo "Usage: $0 <output.tar.gz>" >&2
236 exit 1
237fi
238
239find $GCDA -name '*.gcno' -o -name '*.gcda' | tar cfz $DEST -T -
240
241if [ $? -eq 0 ] ; then
242 echo "$DEST successfully created, copy to build system and unpack with:"
243 echo " tar xfz $DEST"
244else
245 echo "Could not create file $DEST"
246fi
diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index 145c25a170c7..e4b6985044a2 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -458,7 +458,7 @@ debugfs interface, since it provides control over GPIO direction and
458value instead of just showing a gpio state summary. Plus, it could be 458value instead of just showing a gpio state summary. Plus, it could be
459present on production systems without debugging support. 459present on production systems without debugging support.
460 460
461Given approprate hardware documentation for the system, userspace could 461Given appropriate hardware documentation for the system, userspace could
462know for example that GPIO #23 controls the write protect line used to 462know for example that GPIO #23 controls the write protect line used to
463protect boot loader segments in flash memory. System upgrade procedures 463protect boot loader segments in flash memory. System upgrade procedures
464may need to temporarily remove that protection, first importing a GPIO, 464may need to temporarily remove that protection, first importing a GPIO,
diff --git a/Documentation/hwmon/f71882fg b/Documentation/hwmon/f71882fg
index a8321267b5b6..bee4c30bc1e2 100644
--- a/Documentation/hwmon/f71882fg
+++ b/Documentation/hwmon/f71882fg
@@ -2,14 +2,18 @@ Kernel driver f71882fg
2====================== 2======================
3 3
4Supported chips: 4Supported chips:
5 * Fintek F71882FG and F71883FG 5 * Fintek F71858FG
6 Prefix: 'f71882fg' 6 Prefix: 'f71858fg'
7 Addresses scanned: none, address read from Super I/O config space 7 Addresses scanned: none, address read from Super I/O config space
8 Datasheet: Available from the Fintek website 8 Datasheet: Available from the Fintek website
9 * Fintek F71862FG and F71863FG 9 * Fintek F71862FG and F71863FG
10 Prefix: 'f71862fg' 10 Prefix: 'f71862fg'
11 Addresses scanned: none, address read from Super I/O config space 11 Addresses scanned: none, address read from Super I/O config space
12 Datasheet: Available from the Fintek website 12 Datasheet: Available from the Fintek website
13 * Fintek F71882FG and F71883FG
14 Prefix: 'f71882fg'
15 Addresses scanned: none, address read from Super I/O config space
16 Datasheet: Available from the Fintek website
13 * Fintek F8000 17 * Fintek F8000
14 Prefix: 'f8000' 18 Prefix: 'f8000'
15 Addresses scanned: none, address read from Super I/O config space 19 Addresses scanned: none, address read from Super I/O config space
@@ -66,13 +70,13 @@ printed when loading the driver.
66 70
67Three different fan control modes are supported; the mode number is written 71Three different fan control modes are supported; the mode number is written
68to the pwm#_enable file. Note that not all modes are supported on all 72to the pwm#_enable file. Note that not all modes are supported on all
69chips, and some modes may only be available in RPM / PWM mode on the F8000. 73chips, and some modes may only be available in RPM / PWM mode.
70Writing an unsupported mode will result in an invalid parameter error. 74Writing an unsupported mode will result in an invalid parameter error.
71 75
72* 1: Manual mode 76* 1: Manual mode
73 You ask for a specific PWM duty cycle / DC voltage or a specific % of 77 You ask for a specific PWM duty cycle / DC voltage or a specific % of
74 fan#_full_speed by writing to the pwm# file. This mode is only 78 fan#_full_speed by writing to the pwm# file. This mode is only
75 available on the F8000 if the fan channel is in RPM mode. 79 available on the F71858FG / F8000 if the fan channel is in RPM mode.
76 80
77* 2: Normal auto mode 81* 2: Normal auto mode
78 You can define a number of temperature/fan speed trip points, which % the 82 You can define a number of temperature/fan speed trip points, which % the
diff --git a/Documentation/hwmon/ibmaem b/Documentation/hwmon/ibmaem
index e98bdfea3467..1e0d59e000b4 100644
--- a/Documentation/hwmon/ibmaem
+++ b/Documentation/hwmon/ibmaem
@@ -7,7 +7,7 @@ henceforth as AEM.
7Supported systems: 7Supported systems:
8 * Any recent IBM System X server with AEM support. 8 * Any recent IBM System X server with AEM support.
9 This includes the x3350, x3550, x3650, x3655, x3755, x3850 M2, 9 This includes the x3350, x3550, x3650, x3655, x3755, x3850 M2,
10 x3950 M2, and certain HS2x/LS2x/QS2x blades. The IPMI host interface 10 x3950 M2, and certain HC10/HS2x/LS2x/QS2x blades. The IPMI host interface
11 driver ("ipmi-si") needs to be loaded for this driver to do anything. 11 driver ("ipmi-si") needs to be loaded for this driver to do anything.
12 Prefix: 'ibmaem' 12 Prefix: 'ibmaem'
13 Datasheet: Not available 13 Datasheet: Not available
diff --git a/Documentation/hwmon/sysfs-interface b/Documentation/hwmon/sysfs-interface
index 004ee161721e..dcbd502c8792 100644
--- a/Documentation/hwmon/sysfs-interface
+++ b/Documentation/hwmon/sysfs-interface
@@ -70,6 +70,7 @@ are interpreted as 0! For more on how written strings are interpreted see the
70[0-*] denotes any positive number starting from 0 70[0-*] denotes any positive number starting from 0
71[1-*] denotes any positive number starting from 1 71[1-*] denotes any positive number starting from 1
72RO read only value 72RO read only value
73WO write only value
73RW read/write value 74RW read/write value
74 75
75Read/write values may be read-only for some chips, depending on the 76Read/write values may be read-only for some chips, depending on the
@@ -295,6 +296,24 @@ temp[1-*]_label Suggested temperature channel label.
295 user-space. 296 user-space.
296 RO 297 RO
297 298
299temp[1-*]_lowest
300 Historical minimum temperature
301 Unit: millidegree Celsius
302 RO
303
304temp[1-*]_highest
305 Historical maximum temperature
306 Unit: millidegree Celsius
307 RO
308
309temp[1-*]_reset_history
310 Reset temp_lowest and temp_highest
311 WO
312
313temp_reset_history
314 Reset temp_lowest and temp_highest for all sensors
315 WO
316
298Some chips measure temperature using external thermistors and an ADC, and 317Some chips measure temperature using external thermistors and an ADC, and
299report the temperature measurement as a voltage. Converting this voltage 318report the temperature measurement as a voltage. Converting this voltage
300back to a temperature (or the other way around for limits) requires 319back to a temperature (or the other way around for limits) requires
diff --git a/Documentation/hwmon/tmp401 b/Documentation/hwmon/tmp401
new file mode 100644
index 000000000000..9fc447249212
--- /dev/null
+++ b/Documentation/hwmon/tmp401
@@ -0,0 +1,42 @@
1Kernel driver tmp401
2====================
3
4Supported chips:
5 * Texas Instruments TMP401
6 Prefix: 'tmp401'
7 Addresses scanned: I2C 0x4c
8 Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp401.html
9 * Texas Instruments TMP411
10 Prefix: 'tmp411'
11 Addresses scanned: I2C 0x4c
12 Datasheet: http://focus.ti.com/docs/prod/folders/print/tmp411.html
13
14Authors:
15 Hans de Goede <hdegoede@redhat.com>
16 Andre Prendel <andre.prendel@gmx.de>
17
18Description
19-----------
20
21This driver implements support for Texas Instruments TMP401 and
22TMP411 chips. These chips implement one remote and one local
23temperature sensor. Temperature is measured in degrees
24Celsius. Resolution of the remote sensor is 0.0625 degree. Local
25sensor resolution can be set to 0.5, 0.25, 0.125 or 0.0625 degree (not
26supported by the driver so far, so using the default resolution of 0.5
27degree).
28
29The driver provides the common sysfs-interface for temperatures (see
30/Documentation/hwmon/sysfs-interface under Temperatures).
31
32The TMP411 chip is compatible with TMP401. It provides some additional
33features.
34
35* Minimum and Maximum temperature measured since power-on, chip-reset
36
37 Exported via sysfs attributes tempX_lowest and tempX_highest.
38
39* Reset of historical minimum/maximum temperature measurements
40
41 Exported via sysfs attribute temp_reset_history. Writing 1 to this
42 file triggers a reset.
diff --git a/Documentation/hwmon/w83627ehf b/Documentation/hwmon/w83627ehf
index b6eb59384bb3..02b74899edaf 100644
--- a/Documentation/hwmon/w83627ehf
+++ b/Documentation/hwmon/w83627ehf
@@ -12,6 +12,10 @@ Supported chips:
12 Addresses scanned: ISA address retrieved from Super I/O registers 12 Addresses scanned: ISA address retrieved from Super I/O registers
13 Datasheet: 13 Datasheet:
14 http://www.nuvoton.com.tw/NR/rdonlyres/7885623D-A487-4CF9-A47F-30C5F73D6FE6/0/W83627DHG.pdf 14 http://www.nuvoton.com.tw/NR/rdonlyres/7885623D-A487-4CF9-A47F-30C5F73D6FE6/0/W83627DHG.pdf
15 * Winbond W83627DHG-P
16 Prefix: 'w83627dhg'
17 Addresses scanned: ISA address retrieved from Super I/O registers
18 Datasheet: not available
15 * Winbond W83667HG 19 * Winbond W83667HG
16 Prefix: 'w83667hg' 20 Prefix: 'w83667hg'
17 Addresses scanned: ISA address retrieved from Super I/O registers 21 Addresses scanned: ISA address retrieved from Super I/O registers
@@ -28,8 +32,8 @@ Description
28----------- 32-----------
29 33
30This driver implements support for the Winbond W83627EHF, W83627EHG, 34This driver implements support for the Winbond W83627EHF, W83627EHG,
31W83627DHG and W83667HG super I/O chips. We will refer to them collectively 35W83627DHG, W83627DHG-P and W83667HG super I/O chips. We will refer to them
32as Winbond chips. 36collectively as Winbond chips.
33 37
34The chips implement three temperature sensors, five fan rotation 38The chips implement three temperature sensors, five fan rotation
35speed sensors, ten analog voltage sensors (only nine for the 627DHG), one 39speed sensors, ten analog voltage sensors (only nine for the 627DHG), one
@@ -135,3 +139,6 @@ done in the driver for all register addresses.
135The DHG also supports PECI, where the DHG queries Intel CPU temperatures, and 139The DHG also supports PECI, where the DHG queries Intel CPU temperatures, and
136the ICH8 southbridge gets that data via PECI from the DHG, so that the 140the ICH8 southbridge gets that data via PECI from the DHG, so that the
137southbridge drives the fans. And the DHG supports SST, a one-wire serial bus. 141southbridge drives the fans. And the DHG supports SST, a one-wire serial bus.
142
143The DHG-P has an additional automatic fan speed control mode named Smart Fan
144(TM) III+. This mode is not yet supported by the driver.
diff --git a/Documentation/i2c/busses/i2c-ocores b/Documentation/i2c/busses/i2c-ocores
index cfcebb10d14e..c269aaa2f26a 100644
--- a/Documentation/i2c/busses/i2c-ocores
+++ b/Documentation/i2c/busses/i2c-ocores
@@ -20,6 +20,8 @@ platform_device with the base address and interrupt number. The
20dev.platform_data of the device should also point to a struct 20dev.platform_data of the device should also point to a struct
21ocores_i2c_platform_data (see linux/i2c-ocores.h) describing the 21ocores_i2c_platform_data (see linux/i2c-ocores.h) describing the
22distance between registers and the input clock speed. 22distance between registers and the input clock speed.
23There is also a possibility to attach a list of i2c_board_info which
24the i2c-ocores driver will add to the bus upon creation.
23 25
24E.G. something like: 26E.G. something like:
25 27
@@ -36,9 +38,24 @@ static struct resource ocores_resources[] = {
36 }, 38 },
37}; 39};
38 40
41/* optional board info */
42struct i2c_board_info ocores_i2c_board_info[] = {
43 {
44 I2C_BOARD_INFO("tsc2003", 0x48),
45 .platform_data = &tsc2003_platform_data,
46 .irq = TSC_IRQ
47 },
48 {
49 I2C_BOARD_INFO("adv7180", 0x42 >> 1),
50 .irq = ADV_IRQ
51 }
52};
53
39static struct ocores_i2c_platform_data myi2c_data = { 54static struct ocores_i2c_platform_data myi2c_data = {
40 .regstep = 2, /* two bytes between registers */ 55 .regstep = 2, /* two bytes between registers */
41 .clock_khz = 50000, /* input clock of 50MHz */ 56 .clock_khz = 50000, /* input clock of 50MHz */
57 .devices = ocores_i2c_board_info, /* optional table of devices */
58 .num_devices = ARRAY_SIZE(ocores_i2c_board_info), /* table size */
42}; 59};
43 60
44static struct platform_device myi2c = { 61static struct platform_device myi2c = {
diff --git a/Documentation/i2c/busses/i2c-viapro b/Documentation/i2c/busses/i2c-viapro
index 22efedf60c87..2e758b0e9456 100644
--- a/Documentation/i2c/busses/i2c-viapro
+++ b/Documentation/i2c/busses/i2c-viapro
@@ -19,6 +19,9 @@ Supported adapters:
19 * VIA Technologies, Inc. VX800/VX820 19 * VIA Technologies, Inc. VX800/VX820
20 Datasheet: available on http://linux.via.com.tw 20 Datasheet: available on http://linux.via.com.tw
21 21
22 * VIA Technologies, Inc. VX855/VX875
23 Datasheet: Availability unknown
24
22Authors: 25Authors:
23 Kyösti Mälkki <kmalkki@cc.hut.fi>, 26 Kyösti Mälkki <kmalkki@cc.hut.fi>,
24 Mark D. Studebaker <mdsxyz123@yahoo.com>, 27 Mark D. Studebaker <mdsxyz123@yahoo.com>,
@@ -53,6 +56,7 @@ Your lspci -n listing must show one of these :
53 device 1106:3287 (VT8251) 56 device 1106:3287 (VT8251)
54 device 1106:8324 (CX700) 57 device 1106:8324 (CX700)
55 device 1106:8353 (VX800/VX820) 58 device 1106:8353 (VX800/VX820)
59 device 1106:8409 (VX855/VX875)
56 60
57If none of these show up, you should look in the BIOS for settings like 61If none of these show up, you should look in the BIOS for settings like
58enable ACPI / SMBus or even USB. 62enable ACPI / SMBus or even USB.
diff --git a/Documentation/i2c/instantiating-devices b/Documentation/i2c/instantiating-devices
index b55ce57a84db..c740b7b41088 100644
--- a/Documentation/i2c/instantiating-devices
+++ b/Documentation/i2c/instantiating-devices
@@ -165,3 +165,47 @@ was done there. Two significant differences are:
165Once again, method 3 should be avoided wherever possible. Explicit device 165Once again, method 3 should be avoided wherever possible. Explicit device
166instantiation (methods 1 and 2) is much preferred for it is safer and 166instantiation (methods 1 and 2) is much preferred for it is safer and
167faster. 167faster.
168
169
170Method 4: Instantiate from user-space
171-------------------------------------
172
173In general, the kernel should know which I2C devices are connected and
174what addresses they live at. However, in certain cases, it does not, so a
175sysfs interface was added to let the user provide the information. This
176interface is made of 2 attribute files which are created in every I2C bus
177directory: new_device and delete_device. Both files are write only and you
178must write the right parameters to them in order to properly instantiate,
179respectively delete, an I2C device.
180
181File new_device takes 2 parameters: the name of the I2C device (a string)
182and the address of the I2C device (a number, typically expressed in
183hexadecimal starting with 0x, but can also be expressed in decimal.)
184
185File delete_device takes a single parameter: the address of the I2C
186device. As no two devices can live at the same address on a given I2C
187segment, the address is sufficient to uniquely identify the device to be
188deleted.
189
190Example:
191# echo eeprom 0x50 > /sys/class/i2c-adapter/i2c-3/new_device
192
193While this interface should only be used when in-kernel device declaration
194can't be done, there is a variety of cases where it can be helpful:
195* The I2C driver usually detects devices (method 3 above) but the bus
196 segment your device lives on doesn't have the proper class bit set and
197 thus detection doesn't trigger.
198* The I2C driver usually detects devices, but your device lives at an
199 unexpected address.
200* The I2C driver usually detects devices, but your device is not detected,
201 either because the detection routine is too strict, or because your
202 device is not officially supported yet but you know it is compatible.
203* You are developing a driver on a test board, where you soldered the I2C
204 device yourself.
205
206This interface is a replacement for the force_* module parameters some I2C
207drivers implement. Being implemented in i2c-core rather than in each
208device driver individually, it is much more efficient, and also has the
209advantage that you do not have to reload the driver to change a setting.
210You can also instantiate the device before the driver is loaded or even
211available, and you don't need to know what driver the device needs.
diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index c1a06f989cf7..7860aafb483d 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -126,19 +126,9 @@ different) configuration information, as do drivers handling chip variants
126that can't be distinguished by protocol probing, or which need some board 126that can't be distinguished by protocol probing, or which need some board
127specific information to operate correctly. 127specific information to operate correctly.
128 128
129Accordingly, the I2C stack now has two models for associating I2C devices
130with their drivers: the original "legacy" model, and a newer one that's
131fully compatible with the Linux 2.6 driver model. These models do not mix,
132since the "legacy" model requires drivers to create "i2c_client" device
133objects after SMBus style probing, while the Linux driver model expects
134drivers to be given such device objects in their probe() routines.
135 129
136The legacy model is deprecated now and will soon be removed, so we no 130Device/Driver Binding
137longer document it here. 131---------------------
138
139
140Standard Driver Model Binding ("New Style")
141-------------------------------------------
142 132
143System infrastructure, typically board-specific initialization code or 133System infrastructure, typically board-specific initialization code or
144boot firmware, reports what I2C devices exist. For example, there may be 134boot firmware, reports what I2C devices exist. For example, there may be
@@ -201,7 +191,7 @@ a given I2C bus. This is for example the case of hardware monitoring
201devices on a PC's SMBus. In that case, you may want to let your driver 191devices on a PC's SMBus. In that case, you may want to let your driver
202detect supported devices automatically. This is how the legacy model 192detect supported devices automatically. This is how the legacy model
203was working, and is now available as an extension to the standard 193was working, and is now available as an extension to the standard
204driver model (so that we can finally get rid of the legacy model.) 194driver model.
205 195
206You simply have to define a detect callback which will attempt to 196You simply have to define a detect callback which will attempt to
207identify supported devices (returning 0 for supported ones and -ENODEV 197identify supported devices (returning 0 for supported ones and -ENODEV
diff --git a/Documentation/ide/ide.txt b/Documentation/ide/ide.txt
index 0c78f4b1d9d9..e77bebfa7b0d 100644
--- a/Documentation/ide/ide.txt
+++ b/Documentation/ide/ide.txt
@@ -216,6 +216,8 @@ Other kernel parameters for ide_core are:
216 216
217* "noflush=[interface_number.device_number]" to disable flush requests 217* "noflush=[interface_number.device_number]" to disable flush requests
218 218
219* "nohpa=[interface_number.device_number]" to disable Host Protected Area
220
219* "noprobe=[interface_number.device_number]" to skip probing 221* "noprobe=[interface_number.device_number]" to skip probing
220 222
221* "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit 223* "nowerr=[interface_number.device_number]" to ignore the WRERR_STAT bit
diff --git a/Documentation/input/input.txt b/Documentation/input/input.txt
index 686ee9932dff..b93c08442e3c 100644
--- a/Documentation/input/input.txt
+++ b/Documentation/input/input.txt
@@ -278,7 +278,7 @@ struct input_event {
278}; 278};
279 279
280 'time' is the timestamp, it returns the time at which the event happened. 280 'time' is the timestamp, it returns the time at which the event happened.
281Type is for example EV_REL for relative movement, REL_KEY for a keypress or 281Type is for example EV_REL for relative movement, EV_KEY for a keypress or
282release. More types are defined in include/linux/input.h. 282release. More types are defined in include/linux/input.h.
283 283
284 'code' is event code, for example REL_X or KEY_BACKSPACE, again a complete 284 'code' is event code, for example REL_X or KEY_BACKSPACE, again a complete
diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt
index 435102a26d96..3a6aec40c0b0 100644
--- a/Documentation/input/rotary-encoder.txt
+++ b/Documentation/input/rotary-encoder.txt
@@ -67,7 +67,12 @@ data with it.
67struct rotary_encoder_platform_data is declared in 67struct rotary_encoder_platform_data is declared in
68include/linux/rotary-encoder.h and needs to be filled with the number of 68include/linux/rotary-encoder.h and needs to be filled with the number of
69steps the encoder has and can carry information about externally inverted 69steps the encoder has and can carry information about externally inverted
70signals (because of used invertig buffer or other reasons). 70signals (because of an inverting buffer or other reasons). The encoder
71can be set up to deliver input information as either an absolute or relative
72axis. For relative axes the input event returns +/-1 for each step. For
73absolute axes the position of the encoder can either roll over between zero
74and the number of steps or will clamp at the maximum and zero depending on
75the configuration.
71 76
72Because GPIO to IRQ mapping is platform specific, this information must 77Because GPIO to IRQ mapping is platform specific, this information must
73be given separately to the driver. See the example below. 78be given separately to the driver. See the example below.
@@ -85,6 +90,8 @@ be given in seperately to the driver. See the example below.
85static struct rotary_encoder_platform_data my_rotary_encoder_info = { 90static struct rotary_encoder_platform_data my_rotary_encoder_info = {
86 .steps = 24, 91 .steps = 24,
87 .axis = ABS_X, 92 .axis = ABS_X,
93 .relative_axis = false,
94 .rollover = false,
88 .gpio_a = GPIO_ROTARY_A, 95 .gpio_a = GPIO_ROTARY_A,
89 .gpio_b = GPIO_ROTARY_B, 96 .gpio_b = GPIO_ROTARY_B,
90 .inverted_a = 0, 97 .inverted_a = 0,
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 1f779a25c703..7bb0d934b6d8 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -149,6 +149,8 @@ Code Seq# Include File Comments
149'p' 40-7F linux/nvram.h 149'p' 40-7F linux/nvram.h
150'p' 80-9F user-space parport 150'p' 80-9F user-space parport
151 <mailto:tim@cyberelk.net> 151 <mailto:tim@cyberelk.net>
152'p' a1-a4 linux/pps.h LinuxPPS
153 <mailto:giometti@linux.it>
152'q' 00-1F linux/serio.h 154'q' 00-1F linux/serio.h
153'q' 80-FF Internet PhoneJACK, Internet LineJACK 155'q' 80-FF Internet PhoneJACK, Internet LineJACK
154 <http://www.quicknet.net> 156 <http://www.quicknet.net>
diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX
index 5a2d69989a8c..e87e336f590e 100644
--- a/Documentation/isdn/00-INDEX
+++ b/Documentation/isdn/00-INDEX
@@ -14,39 +14,37 @@ README
14 - general info on what you need and what to do for Linux ISDN. 14 - general info on what you need and what to do for Linux ISDN.
15README.FAQ 15README.FAQ
16 - general info for FAQ. 16 - general info for FAQ.
17README.HiSax
18 - info on the HiSax driver which replaces the old teles.
19README.act2000
20 - info on driver for IBM ACT-2000 card.
17README.audio 21README.audio
18 - info for running audio over ISDN. 22 - info for running audio over ISDN.
23README.avmb1
24 - info on driver for AVM-B1 ISDN card.
25README.concap
26 - info on "CONCAP" encapsulation protocol interface used for X.25.
27README.diversion
28 - info on module for isdn diversion services.
19README.fax 29README.fax
20 - info for using Fax over ISDN. 30 - info for using Fax over ISDN.
21README.gigaset 31README.gigaset
22 - info on the drivers for Siemens Gigaset ISDN adapters. 32 - info on the drivers for Siemens Gigaset ISDN adapters
23README.icn
24 - info on the ICN-ISDN-card and its driver.
25README.HiSax
26 - info on the HiSax driver which replaces the old teles.
27README.hfc-pci 33README.hfc-pci
28 - info on hfc-pci based cards. 34 - info on hfc-pci based cards.
35README.hysdn
36 - info on driver for Hypercope active HYSDN cards
37README.icn
38 - info on the ICN-ISDN-card and its driver.
39README.mISDN
40 - info on the Modular ISDN subsystem (mISDN)
29README.pcbit 41README.pcbit
30 - info on the PCBIT-D ISDN adapter and driver. 42 - info on the PCBIT-D ISDN adapter and driver.
31README.syncppp
32 - info on running Sync PPP over ISDN.
33syncPPP.FAQ
34 - frequently asked questions about running PPP over ISDN.
35README.avmb1
36 - info on driver for AVM-B1 ISDN card.
37README.act2000
38 - info on driver for IBM ACT-2000 card.
39README.eicon
40 - info on driver for Eicon active cards.
41README.concap
42 - info on "CONCAP" encapsulation protocol interface used for X.25.
43README.diversion
44 - info on module for isdn diversion services.
45README.sc 43README.sc
46 - info on driver for Spellcaster cards. 44 - info on driver for Spellcaster cards.
45README.syncppp
46 - info on running Sync PPP over ISDN.
47README.x25 47README.x25
48 - info for running X.25 over ISDN. 48 - info for running X.25 over ISDN.
49README.hysdn 49syncPPP.FAQ
50 - info on driver for Hypercope active HYSDN cards 50 - frequently asked questions about running PPP over ISDN.
51README.mISDN
52 - info on the Modular ISDN subsystem (mISDN).
diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI
index 786d619b36e5..686e107923ec 100644
--- a/Documentation/isdn/INTERFACE.CAPI
+++ b/Documentation/isdn/INTERFACE.CAPI
@@ -45,7 +45,7 @@ From then on, Kernel CAPI may call the registered callback functions for the
45device. 45device.
46 46
47If the device becomes unusable for any reason (shutdown, disconnect ...), the 47If the device becomes unusable for any reason (shutdown, disconnect ...), the
48driver has to call capi_ctr_reseted(). This will prevent further calls to the 48driver has to call capi_ctr_down(). This will prevent further calls to the
49callback functions by Kernel CAPI. 49callback functions by Kernel CAPI.
50 50
51 51
@@ -114,20 +114,36 @@ char *driver_name
114int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata) 114int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata)
115 (optional) pointer to a callback function for sending firmware and 115 (optional) pointer to a callback function for sending firmware and
116 configuration data to the device 116 configuration data to the device
117 Return value: 0 on success, error code on error
118 Called in process context.
117 119
118void (*reset_ctr)(struct capi_ctr *ctrlr) 120void (*reset_ctr)(struct capi_ctr *ctrlr)
119 pointer to a callback function for performing a reset on the device, 121 (optional) pointer to a callback function for performing a reset on
120 releasing all registered applications 122 the device, releasing all registered applications
123 Called in process context.
121 124
122void (*register_appl)(struct capi_ctr *ctrlr, u16 applid, 125void (*register_appl)(struct capi_ctr *ctrlr, u16 applid,
123 capi_register_params *rparam) 126 capi_register_params *rparam)
124void (*release_appl)(struct capi_ctr *ctrlr, u16 applid) 127void (*release_appl)(struct capi_ctr *ctrlr, u16 applid)
125 pointers to callback functions for registration and deregistration of 128 pointers to callback functions for registration and deregistration of
126 applications with the device 129 applications with the device
130 Calls to these functions are serialized by Kernel CAPI so that only
131 one call to any of them is active at any time.
127 132
128u16 (*send_message)(struct capi_ctr *ctrlr, struct sk_buff *skb) 133u16 (*send_message)(struct capi_ctr *ctrlr, struct sk_buff *skb)
129 pointer to a callback function for sending a CAPI message to the 134 pointer to a callback function for sending a CAPI message to the
130 device 135 device
136 Return value: CAPI error code
137 If the method returns 0 (CAPI_NOERROR) the driver has taken ownership
138 of the skb and the caller may no longer access it. If it returns a
139 non-zero (error) value then ownership of the skb returns to the caller
140 who may reuse or free it.
141 The return value should only be used to signal problems with respect
142 to accepting or queueing the message. Errors occurring during the
143 actual processing of the message should be signaled with an
144 appropriate reply message.
145 Calls to this function are not serialized by Kernel CAPI, i.e. it must
146 be prepared to be re-entered.
131 147
132char *(*procinfo)(struct capi_ctr *ctrlr) 148char *(*procinfo)(struct capi_ctr *ctrlr)
133 pointer to a callback function returning the entry for the device in 149 pointer to a callback function returning the entry for the device in
@@ -138,6 +154,8 @@ read_proc_t *ctr_read_proc
138 system entry, /proc/capi/controllers/<n>; will be called with a 154 system entry, /proc/capi/controllers/<n>; will be called with a
139 pointer to the device's capi_ctr structure as the last (data) argument 155 pointer to the device's capi_ctr structure as the last (data) argument
140 156
157Note: Callback functions are never called in interrupt context.
158
141- to be filled in before calling capi_ctr_ready(): 159- to be filled in before calling capi_ctr_ready():
142 160
143u8 manu[CAPI_MANUFACTURER_LEN] 161u8 manu[CAPI_MANUFACTURER_LEN]
@@ -153,6 +171,45 @@ u8 serial[CAPI_SERIAL_LEN]
153 value to return for CAPI_GET_SERIAL 171 value to return for CAPI_GET_SERIAL
154 172
155 173
1744.3 The _cmsg Structure
175
176(declared in <linux/isdn/capiutil.h>)
177
178The _cmsg structure stores the contents of a CAPI 2.0 message in an easily
179accessible form. It contains members for all possible CAPI 2.0 parameters, of
180which only those appearing in the message type currently being processed are
181actually used. Unused members should be set to zero.
182
183Members are named after the CAPI 2.0 standard names of the parameters they
184represent. See <linux/isdn/capiutil.h> for the exact spelling. Member data
185types are:
186
187u8 for CAPI parameters of type 'byte'
188
189u16 for CAPI parameters of type 'word'
190
191u32 for CAPI parameters of type 'dword'
192
193_cstruct for CAPI parameters of type 'struct' not containing any
194 variably-sized (struct) subparameters (eg. 'Called Party Number')
195 The member is a pointer to a buffer containing the parameter in
196 CAPI encoding (length + content). It may also be NULL, which will
197 be taken to represent an empty (zero length) parameter.
198
199_cmstruct for CAPI parameters of type 'struct' containing 'struct'
200 subparameters ('Additional Info' and 'B Protocol')
201 The representation is a single byte containing one of the values:
202 CAPI_DEFAULT: the parameter is empty
203 CAPI_COMPOSE: the values of the subparameters are stored
204 individually in the corresponding _cmsg structure members
205
206Functions capi_cmsg2message() and capi_message2cmsg() are provided to convert
207messages between their transport encoding described in the CAPI 2.0 standard
208and their _cmsg structure representation. Note that capi_cmsg2message() does
209not know or check the size of its destination buffer. The caller must make
210sure it is big enough to accommodate the resulting CAPI message.
211
212
1565. Lower Layer Interface Functions 2135. Lower Layer Interface Functions
157 214
158(declared in <linux/isdn/capilli.h>) 215(declared in <linux/isdn/capilli.h>)
@@ -166,7 +223,7 @@ int detach_capi_ctr(struct capi_ctr *ctrlr)
166 register/unregister a device (controller) with Kernel CAPI 223 register/unregister a device (controller) with Kernel CAPI
167 224
168void capi_ctr_ready(struct capi_ctr *ctrlr) 225void capi_ctr_ready(struct capi_ctr *ctrlr)
169void capi_ctr_reseted(struct capi_ctr *ctrlr) 226void capi_ctr_down(struct capi_ctr *ctrlr)
170 signal controller ready/not ready 227 signal controller ready/not ready
171 228
172void capi_ctr_suspend_output(struct capi_ctr *ctrlr) 229void capi_ctr_suspend_output(struct capi_ctr *ctrlr)
@@ -211,3 +268,32 @@ CAPIMSG_CONTROL(m) CAPIMSG_SETCONTROL(m, contr) Controller/PLCI/NCCI
211 (u32) 268 (u32)
212CAPIMSG_DATALEN(m) CAPIMSG_SETDATALEN(m, len) Data Length (u16) 269CAPIMSG_DATALEN(m) CAPIMSG_SETDATALEN(m, len) Data Length (u16)
213 270
271
272Library functions for working with _cmsg structures
273(from <linux/isdn/capiutil.h>):
274
275unsigned capi_cmsg2message(_cmsg *cmsg, u8 *msg)
276 Assembles a CAPI 2.0 message from the parameters in *cmsg, storing the
277 result in *msg.
278
279unsigned capi_message2cmsg(_cmsg *cmsg, u8 *msg)
280 Disassembles the CAPI 2.0 message in *msg, storing the parameters in
281 *cmsg.
282
283unsigned capi_cmsg_header(_cmsg *cmsg, u16 ApplId, u8 Command, u8 Subcommand,
284 u16 Messagenumber, u32 Controller)
285 Fills the header part and address field of the _cmsg structure *cmsg
286 with the given values, zeroing the remainder of the structure so only
287 parameters with non-default values need to be changed before sending
288 the message.
289
290void capi_cmsg_answer(_cmsg *cmsg)
291 Sets the low bit of the Subcommand field in *cmsg, thereby converting
292 _REQ to _CONF and _IND to _RESP.
293
294char *capi_cmd2str(u8 Command, u8 Subcommand)
295 Returns the CAPI 2.0 message name corresponding to the given command
296 and subcommand values, as a static ASCII string. The return value may
297 be NULL if the command/subcommand is not one of those defined in the
298 CAPI 2.0 standard.
299
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset
index 02c0e9341dd8..f9963103ae3d 100644
--- a/Documentation/isdn/README.gigaset
+++ b/Documentation/isdn/README.gigaset
@@ -149,10 +149,8 @@ GigaSet 307x Device Driver
149 configuration files and chat scripts in the gigaset-VERSION/ppp directory 149 configuration files and chat scripts in the gigaset-VERSION/ppp directory
150 in the driver packages from http://sourceforge.net/projects/gigaset307x/. 150 in the driver packages from http://sourceforge.net/projects/gigaset307x/.
151 Please note that the USB drivers are not able to change the state of the 151 Please note that the USB drivers are not able to change the state of the
152 control lines (the M105 driver can be configured to use some undocumented 152 control lines. This means you must use "Stupid Mode" if you are using
153 control requests, if you really need the control lines, though). This means 153 wvdial or you should use the nocrtscts option of pppd.
154 you must use "Stupid Mode" if you are using wvdial or you should use the
155 nocrtscts option of pppd.
156 You must also assure that the ppp_async module is loaded with the parameter 154 You must also assure that the ppp_async module is loaded with the parameter
157 flag_time=0. You can do this e.g. by adding a line like 155 flag_time=0. You can do this e.g. by adding a line like
158 156
@@ -190,20 +188,19 @@ GigaSet 307x Device Driver
190 You can also use /sys/class/tty/ttyGxy/cidmode for changing the CID mode 188 You can also use /sys/class/tty/ttyGxy/cidmode for changing the CID mode
191 setting (ttyGxy is ttyGU0 or ttyGB0). 189 setting (ttyGxy is ttyGU0 or ttyGB0).
192 190
1932.6. M105 Undocumented USB Requests 1912.6. Unregistered Wireless Devices (M101/M105)
194 ------------------------------ 192 -----------------------------------------
195 193 The main purpose of the ser_gigaset and usb_gigaset drivers is to allow
196 The Gigaset M105 USB data box understands a couple of useful, but 194 the M101 and M105 wireless devices to be used as ISDN devices for ISDN
197 undocumented USB commands. These requests are not used in normal 195 connections through a Gigaset base. Therefore they assume that the device
198 operation (for wireless access to the base), but are needed for access 196 is registered to a DECT base.
199 to the M105's own configuration mode (registration to the base, baudrate 197
200 and line format settings, device status queries) via the gigacontr 198 If the M101/M105 device is not registered to a base, initialization of
201 utility. Their use is controlled by the kernel configuration option 199 the device fails, and a corresponding error message is logged by the
202 "Support for undocumented USB requests" (CONFIG_GIGASET_UNDOCREQ). If you 200 driver. In that situation, a restricted set of functions is available
203 encounter error code -ENOTTY when trying to use some features of the 201 which includes, in particular, those necessary for registering the device
204 M105, try setting that option to "y" via 'make {x,menu}config' and 202 to a base or for switching it between Fixed Part and Portable Part
205 recompiling the driver. 203 modes.
206
207 204
2083. Troubleshooting 2053. Troubleshooting
209 --------------- 206 ---------------
@@ -234,11 +231,12 @@ GigaSet 307x Device Driver
234 Select Unimodem mode for all DECT data adapters. (see section 2.4.) 231 Select Unimodem mode for all DECT data adapters. (see section 2.4.)
235 232
236 Problem: 233 Problem:
237 You want to configure your USB DECT data adapter (M105) but gigacontr 234 Messages like this:
238 reports an error: "/dev/ttyGU0: Inappropriate ioctl for device". 235 usb_gigaset 3-2:1.0: Could not initialize the device.
236 appear in your syslog.
239 Solution: 237 Solution:
240 Recompile the usb_gigaset driver with the kernel configuration option 238 Check whether your M10x wireless device is correctly registered to the
241 CONFIG_GIGASET_UNDOCREQ set to 'y'. (see section 2.6.) 239 Gigaset base. (see section 2.6.)
242 240
2433.2. Telling the driver to provide more information 2413.2. Telling the driver to provide more information
244 ---------------------------------------------- 242 ----------------------------------------------
diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist
index 6c42e071d723..2df4576f1173 100644
--- a/Documentation/ja_JP/SubmitChecklist
+++ b/Documentation/ja_JP/SubmitChecklist
@@ -75,7 +75,7 @@ Linux カーネルパッチ投稿者向けチェックリスト
75 ビルドした上、動作確認を行ってください。 75 ビルドした上、動作確認を行ってください。
76 76
7714: もしパッチがディスクのI/O性能などに影響を与えるようであれば、 7714: もしパッチがディスクのI/O性能などに影響を与えるようであれば、
78 'CONFIG_LBD'オプションを有効にした場合と無効にした場合の両方で 78 'CONFIG_LBDAF'オプションを有効にした場合と無効にした場合の両方で
79 テストを実施してみてください。 79 テストを実施してみてください。
80 80
8115: lockdepの機能を全て有効にした上で、全てのコードパスを評価してください。 8115: lockdepの機能を全て有効にした上で、全てのコードパスを評価してください。
diff --git a/Documentation/kbuild/kconfig.txt b/Documentation/kbuild/kconfig.txt
index 26a7c0a93193..849b5e56d06f 100644
--- a/Documentation/kbuild/kconfig.txt
+++ b/Documentation/kbuild/kconfig.txt
@@ -35,48 +35,26 @@ new .config files to see the differences:
35 35
36(Yes, we need something better here.) 36(Yes, we need something better here.)
37 37
38
39======================================================================
40menuconfig
41--------------------------------------------------
42
43SEARCHING for CONFIG symbols
44
45Searching in menuconfig:
46
47 The Search function searches for kernel configuration symbol
48 names, so you have to know something close to what you are
49 looking for.
50
51 Example:
52 /hotplug
53 This lists all config symbols that contain "hotplug",
54 e.g., HOTPLUG, HOTPLUG_CPU, MEMORY_HOTPLUG.
55
56 For search help, enter / followed by TAB-TAB-TAB (to highlight
57 <Help>) and Enter. This will tell you that you can also use
58 regular expressions (regexes) in the search string, so if you
59 are not interested in MEMORY_HOTPLUG, you could try
60
61 /^hotplug
62
63
64______________________________________________________________________ 38______________________________________________________________________
65Color Themes for 'menuconfig' 39Environment variables for '*config'
66 40
67It is possible to select different color themes using the variable 41KCONFIG_CONFIG
68MENUCONFIG_COLOR. To select a theme use: 42--------------------------------------------------
43This environment variable can be used to specify a default kernel config
44file name to override the default name of ".config".
69 45
70 make MENUCONFIG_COLOR=<theme> menuconfig 46KCONFIG_OVERWRITECONFIG
47--------------------------------------------------
48If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not
49break symlinks when .config is a symlink to somewhere else.
71 50
72Available themes are: 51KCONFIG_NOTIMESTAMP
73 mono => selects colors suitable for monochrome displays 52--------------------------------------------------
74 blackbg => selects a color scheme with black background 53If this environment variable exists and is non-null, the timestamp line
75 classic => theme with blue background. The classic look 54in generated .config files is omitted.
76 bluetitle => a LCD friendly version of classic. (default)
77 55
78______________________________________________________________________ 56______________________________________________________________________
79Environment variables in 'menuconfig' 57Environment variables for '{allyes/allmod/allno/rand}config'
80 58
81KCONFIG_ALLCONFIG 59KCONFIG_ALLCONFIG
82-------------------------------------------------- 60--------------------------------------------------
@@ -95,8 +73,7 @@ values.
95This enables you to create "miniature" config (miniconfig) or custom 73This enables you to create "miniature" config (miniconfig) or custom
96config files containing just the config symbols that you are interested 74config files containing just the config symbols that you are interested
97in. Then the kernel config system generates the full .config file, 75in. Then the kernel config system generates the full .config file,
98including dependencies of your miniconfig file, based on the miniconfig 76including symbols of your miniconfig file.
99file.
100 77
101This 'KCONFIG_ALLCONFIG' file is a config file which contains 78This 'KCONFIG_ALLCONFIG' file is a config file which contains
102(usually a subset of all) preset config symbols. These variable 79(usually a subset of all) preset config symbols. These variable
@@ -113,26 +90,14 @@ These examples will disable most options (allnoconfig) but enable or
113disable the options that are explicitly listed in the specified 90disable the options that are explicitly listed in the specified
114mini-config files. 91mini-config files.
115 92
93______________________________________________________________________
94Environment variables for 'silentoldconfig'
95
116KCONFIG_NOSILENTUPDATE 96KCONFIG_NOSILENTUPDATE
117-------------------------------------------------- 97--------------------------------------------------
118If this variable has a non-blank value, it prevents silent kernel 98If this variable has a non-blank value, it prevents silent kernel
119config updates (requires explicit updates). 99config updates (requires explicit updates).
120 100
121KCONFIG_CONFIG
122--------------------------------------------------
123This environment variable can be used to specify a default kernel config
124file name to override the default name of ".config".
125
126KCONFIG_OVERWRITECONFIG
127--------------------------------------------------
128If you set KCONFIG_OVERWRITECONFIG in the environment, Kconfig will not
129break symlinks when .config is a symlink to somewhere else.
130
131KCONFIG_NOTIMESTAMP
132--------------------------------------------------
133If this environment variable exists and is non-null, the timestamp line
134in generated .config files is omitted.
135
136KCONFIG_AUTOCONFIG 101KCONFIG_AUTOCONFIG
137-------------------------------------------------- 102--------------------------------------------------
138This environment variable can be set to specify the path & name of the 103This environment variable can be set to specify the path & name of the
@@ -143,15 +108,54 @@ KCONFIG_AUTOHEADER
143This environment variable can be set to specify the path & name of the 108This environment variable can be set to specify the path & name of the
144"autoconf.h" (header) file. Its default value is "include/linux/autoconf.h". 109"autoconf.h" (header) file. Its default value is "include/linux/autoconf.h".
145 110
111
112======================================================================
113menuconfig
114--------------------------------------------------
115
116SEARCHING for CONFIG symbols
117
118Searching in menuconfig:
119
120 The Search function searches for kernel configuration symbol
121 names, so you have to know something close to what you are
122 looking for.
123
124 Example:
125 /hotplug
126 This lists all config symbols that contain "hotplug",
127 e.g., HOTPLUG, HOTPLUG_CPU, MEMORY_HOTPLUG.
128
129 For search help, enter / followed by TAB-TAB-TAB (to highlight
130 <Help>) and Enter. This will tell you that you can also use
131 regular expressions (regexes) in the search string, so if you
132 are not interested in MEMORY_HOTPLUG, you could try
133
134 /^hotplug
135
146______________________________________________________________________ 136______________________________________________________________________
147menuconfig User Interface Options 137User interface options for 'menuconfig'
148---------------------------------------------------------------------- 138
139MENUCONFIG_COLOR
140--------------------------------------------------
141It is possible to select different color themes using the variable
142MENUCONFIG_COLOR. To select a theme use:
143
144 make MENUCONFIG_COLOR=<theme> menuconfig
145
146Available themes are:
147 mono => selects colors suitable for monochrome displays
148 blackbg => selects a color scheme with black background
149 classic => theme with blue background. The classic look
150 bluetitle => a LCD friendly version of classic. (default)
151
149MENUCONFIG_MODE 152MENUCONFIG_MODE
150-------------------------------------------------- 153--------------------------------------------------
151This mode shows all sub-menus in one large tree. 154This mode shows all sub-menus in one large tree.
152 155
153Example: 156Example:
154 MENUCONFIG_MODE=single_menu make menuconfig 157 make MENUCONFIG_MODE=single_menu menuconfig
158
155 159
156====================================================================== 160======================================================================
157xconfig 161xconfig
diff --git a/Documentation/kbuild/modules.txt b/Documentation/kbuild/modules.txt
index b1096da953c8..0767cf69c69e 100644
--- a/Documentation/kbuild/modules.txt
+++ b/Documentation/kbuild/modules.txt
@@ -275,7 +275,7 @@ following files:
275 275
276 KERNELDIR := /lib/modules/`uname -r`/build 276 KERNELDIR := /lib/modules/`uname -r`/build
277 all:: 277 all::
278 $(MAKE) -C $KERNELDIR M=`pwd` $@ 278 $(MAKE) -C $(KERNELDIR) M=`pwd` $@
279 279
280 # Module specific targets 280 # Module specific targets
281 genbin: 281 genbin:
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index 3f4bc840da8b..cab61d842259 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -108,7 +108,7 @@ There are two possible methods of using Kdump.
108 108
1092) Or use the system kernel binary itself as dump-capture kernel and there is 1092) Or use the system kernel binary itself as dump-capture kernel and there is
110 no need to build a separate dump-capture kernel. This is possible 110 no need to build a separate dump-capture kernel. This is possible
111 only with the architecutres which support a relocatable kernel. As 111 only with the architectures which support a relocatable kernel. As
112 of today, i386, x86_64, ppc64 and ia64 architectures support relocatable 112 of today, i386, x86_64, ppc64 and ia64 architectures support relocatable
113 kernel. 113 kernel.
114 114
@@ -222,7 +222,7 @@ Dump-capture kernel config options (Arch Dependent, ia64)
222---------------------------------------------------------- 222----------------------------------------------------------
223 223
224- No specific options are required to create a dump-capture kernel 224- No specific options are required to create a dump-capture kernel
225 for ia64, other than those specified in the arch idependent section 225 for ia64, other than those specified in the arch independent section
226 above. This means that it is possible to use the system kernel 226 above. This means that it is possible to use the system kernel
227 as a dump-capture kernel if desired. 227 as a dump-capture kernel if desired.
228 228
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fd5cac013037..d08759aa0903 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -48,6 +48,7 @@ parameter is applicable:
48 EFI EFI Partitioning (GPT) is enabled 48 EFI EFI Partitioning (GPT) is enabled
49 EIDE EIDE/ATAPI support is enabled. 49 EIDE EIDE/ATAPI support is enabled.
50 FB The frame buffer device is enabled. 50 FB The frame buffer device is enabled.
51 GCOV GCOV profiling is enabled.
51 HW Appropriate hardware is enabled. 52 HW Appropriate hardware is enabled.
52 IA-64 IA-64 architecture is enabled. 53 IA-64 IA-64 architecture is enabled.
53 IMA Integrity measurement architecture is enabled. 54 IMA Integrity measurement architecture is enabled.
@@ -56,7 +57,6 @@ parameter is applicable:
56 ISAPNP ISA PnP code is enabled. 57 ISAPNP ISA PnP code is enabled.
57 ISDN Appropriate ISDN support is enabled. 58 ISDN Appropriate ISDN support is enabled.
58 JOY Appropriate joystick support is enabled. 59 JOY Appropriate joystick support is enabled.
59 KMEMTRACE kmemtrace is enabled.
60 LIBATA Libata driver is enabled 60 LIBATA Libata driver is enabled
61 LP Printer support is enabled. 61 LP Printer support is enabled.
62 LOOP Loopback device support is enabled. 62 LOOP Loopback device support is enabled.
@@ -229,14 +229,6 @@ and is between 256 and 4096 characters. It is defined in the file
229 to assume that this machine's pmtimer latches its value 229 to assume that this machine's pmtimer latches its value
230 and always returns good values. 230 and always returns good values.
231 231
232 acpi.power_nocheck= [HW,ACPI]
233 Format: 1/0 enable/disable the check of power state.
234 On some bogus BIOS the _PSC object/_STA object of
235 power resource can't return the correct device power
236 state. In such case it is unneccessary to check its
237 power state again in power transition.
238 1 : disable the power state check
239
240 acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode 232 acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
241 Format: { level | edge | high | low } 233 Format: { level | edge | high | low }
242 234
@@ -329,11 +321,6 @@ and is between 256 and 4096 characters. It is defined in the file
329 flushed before they will be reused, which 321 flushed before they will be reused, which
330 is a lot of faster 322 is a lot of faster
331 323
332 amd_iommu_size= [HW,X86-64]
333 Define the size of the aperture for the AMD IOMMU
334 driver. Possible values are:
335 '32M', '64M' (default), '128M', '256M', '512M', '1G'
336
337 amijoy.map= [HW,JOY] Amiga joystick support 324 amijoy.map= [HW,JOY] Amiga joystick support
338 Map of devices attached to JOY0DAT and JOY1DAT 325 Map of devices attached to JOY0DAT and JOY1DAT
339 Format: <a>,<b> 326 Format: <a>,<b>
@@ -497,6 +484,13 @@ and is between 256 and 4096 characters. It is defined in the file
497 Also note the kernel might malfunction if you disable 484 Also note the kernel might malfunction if you disable
498 some critical bits. 485 some critical bits.
499 486
487 cmo_free_hint= [PPC] Format: { yes | no }
488 Specify whether pages are marked as being inactive
489 when they are freed. This is used in CMO environments
490 to determine OS memory pressure for page stealing by
491 a hypervisor.
492 Default: yes
493
500 code_bytes [X86] How many bytes of object code to print 494 code_bytes [X86] How many bytes of object code to print
501 in an oops report. 495 in an oops report.
502 Range: 0 - 8192 496 Range: 0 - 8192
@@ -545,6 +539,10 @@ and is between 256 and 4096 characters. It is defined in the file
545 console=brl,ttyS0 539 console=brl,ttyS0
546 For now, only VisioBraille is supported. 540 For now, only VisioBraille is supported.
547 541
542 consoleblank= [KNL] The console blank (screen saver) timeout in
543 seconds. Defaults to 10*60 = 10mins. A value of 0
544 disables the blank timer.
545
548 coredump_filter= 546 coredump_filter=
549 [KNL] Change the default value for 547 [KNL] Change the default value for
550 /proc/<pid>/coredump_filter. 548 /proc/<pid>/coredump_filter.
@@ -646,6 +644,13 @@ and is between 256 and 4096 characters. It is defined in the file
646 DMA-API debugging code disables itself because the 644 DMA-API debugging code disables itself because the
647 architectural default is too low. 645 architectural default is too low.
648 646
647 dma_debug_driver=<driver_name>
648 With this option the DMA-API debugging driver
649 filter feature can be enabled at boot time. Just
650 pass the driver to filter for as the parameter.
651 The filter can be disabled or changed to another
652 driver later using sysfs.
653
649 dscc4.setup= [NET] 654 dscc4.setup= [NET]
650 655
651 dtc3181e= [HW,SCSI] 656 dtc3181e= [HW,SCSI]
@@ -752,12 +757,25 @@ and is between 256 and 4096 characters. It is defined in the file
752 ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. 757 ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
753 758
754 ftrace=[tracer] 759 ftrace=[tracer]
755 [ftrace] will set and start the specified tracer 760 [FTRACE] will set and start the specified tracer
756 as early as possible in order to facilitate early 761 as early as possible in order to facilitate early
757 boot debugging. 762 boot debugging.
758 763
759 ftrace_dump_on_oops 764 ftrace_dump_on_oops
760 [ftrace] will dump the trace buffers on oops. 765 [FTRACE] will dump the trace buffers on oops.
766
767 ftrace_filter=[function-list]
768 [FTRACE] Limit the functions traced by the function
769 tracer at boot up. function-list is a comma separated
770 list of functions. This list can be changed at run
771 time by the set_ftrace_filter file in the debugfs
772 tracing directory.
773
774 ftrace_notrace=[function-list]
775 [FTRACE] Do not trace the functions specified in
776 function-list. This list can be changed at run time
777 by the set_ftrace_notrace file in the debugfs
778 tracing directory.
761 779
762 gamecon.map[2|3]= 780 gamecon.map[2|3]=
763 [HW,JOY] Multisystem joystick and NES/SNES/PSX pad 781 [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
@@ -771,6 +789,12 @@ and is between 256 and 4096 characters. It is defined in the file
771 Format: off | on 789 Format: off | on
772 default: on 790 default: on
773 791
792 gcov_persist= [GCOV] When non-zero (default), profiling data for
793 kernel modules is saved and remains accessible via
794 debugfs, even when the module is unloaded/reloaded.
795 When zero, profiling data is discarded and associated
796 debugfs files are removed at module unload time.
797
774 gdth= [HW,SCSI] 798 gdth= [HW,SCSI]
775 See header of drivers/scsi/gdth.c. 799 See header of drivers/scsi/gdth.c.
776 800
@@ -873,11 +897,8 @@ and is between 256 and 4096 characters. It is defined in the file
873 897
874 ide-core.nodma= [HW] (E)IDE subsystem 898 ide-core.nodma= [HW] (E)IDE subsystem
875 Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc 899 Format: =0.0 to prevent dma on hda, =0.1 hdb =1.0 hdc
876 .vlb_clock .pci_clock .noflush .noprobe .nowerr .cdrom 900 .vlb_clock .pci_clock .noflush .nohpa .noprobe .nowerr
877 .chs .ignore_cable are additional options 901 .cdrom .chs .ignore_cable are additional options
878 See Documentation/ide/ide.txt.
879
880 idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed
881 See Documentation/ide/ide.txt. 902 See Documentation/ide/ide.txt.
882 903
883 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem 904 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
@@ -914,6 +935,12 @@ and is between 256 and 4096 characters. It is defined in the file
914 Format: { "sha1" | "md5" } 935 Format: { "sha1" | "md5" }
915 default: "sha1" 936 default: "sha1"
916 937
938 ima_tcb [IMA]
939 Load a policy which meets the needs of the Trusted
940 Computing Base. This means IMA will measure all
941 programs exec'd, files mmap'd for exec, and all files
942 opened for read by uid=0.
943
917 in2000= [HW,SCSI] 944 in2000= [HW,SCSI]
918 See header of drivers/scsi/in2000.c. 945 See header of drivers/scsi/in2000.c.
919 946
@@ -971,6 +998,7 @@ and is between 256 and 4096 characters. It is defined in the file
971 nomerge 998 nomerge
972 forcesac 999 forcesac
973 soft 1000 soft
1001 pt [x86, IA64]
974 1002
975 io7= [HW] IO7 for Marvel based alpha systems 1003 io7= [HW] IO7 for Marvel based alpha systems
976 See comment before marvel_specify_io7 in 1004 See comment before marvel_specify_io7 in
@@ -1054,24 +1082,19 @@ and is between 256 and 4096 characters. It is defined in the file
1054 use the HighMem zone if it exists, and the Normal 1082 use the HighMem zone if it exists, and the Normal
1055 zone if it does not. 1083 zone if it does not.
1056 1084
1057 kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no }
1058 Controls whether kmemtrace is enabled
1059 at boot-time.
1060
1061 kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of
1062 subbufs kmemtrace's relay channel has. Set this
1063 higher than default (KMEMTRACE_N_SUBBUFS in code) if
1064 you experience buffer overruns.
1065
1066 kgdboc= [HW] kgdb over consoles. 1085 kgdboc= [HW] kgdb over consoles.
1067 Requires a tty driver that supports console polling. 1086 Requires a tty driver that supports console polling.
1068 (only serial suported for now) 1087 (only serial supported for now)
1069 Format: <serial_device>[,baud] 1088 Format: <serial_device>[,baud]
1070 1089
1071 kmac= [MIPS] korina ethernet MAC address. 1090 kmac= [MIPS] korina ethernet MAC address.
1072 Configure the RouterBoard 532 series on-chip 1091 Configure the RouterBoard 532 series on-chip
1073 Ethernet adapter MAC address. 1092 Ethernet adapter MAC address.
1074 1093
1094 kmemleak= [KNL] Boot-time kmemleak enable/disable
1095 Valid arguments: on, off
1096 Default: on
1097
1075 kstack=N [X86] Print N words from the kernel stack 1098 kstack=N [X86] Print N words from the kernel stack
1076 in oops dumps. 1099 in oops dumps.
1077 1100
@@ -1339,6 +1362,27 @@ and is between 256 and 4096 characters. It is defined in the file
1339 min_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory below this 1362 min_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory below this
1340 physical address is ignored. 1363 physical address is ignored.
1341 1364
1365 mini2440= [ARM,HW,KNL]
1366 Format:[0..2][b][c][t]
1367 Default: "0tb"
1368 MINI2440 configuration specification:
1369 0 - The attached screen is the 3.5" TFT
1370 1 - The attached screen is the 7" TFT
1371 2 - The VGA Shield is attached (1024x768)
1372 Leaving out the screen size parameter will not load
1373 the TFT driver, and the framebuffer will be left
1374 unconfigured.
1375 b - Enable backlight. The TFT backlight pin will be
1376 linked to the kernel VESA blanking code and a GPIO
1377 LED. This parameter is not necessary when using the
1378 VGA shield.
1379 c - Enable the s3c camera interface.
1380 t - Reserved for enabling touchscreen support. The
1381 touchscreen support is not enabled in the mainstream
1382 kernel as of 2.6.30, a preliminary port can be found
1383 in the "bleeding edge" mini2440 support kernel at
1384 http://repo.or.cz/w/linux-2.6/mini2440.git
1385
1342 mminit_loglevel= 1386 mminit_loglevel=
1343 [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this 1387 [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
1344 parameter allows control of the logging verbosity for 1388 parameter allows control of the logging verbosity for
@@ -1380,6 +1424,16 @@ and is between 256 and 4096 characters. It is defined in the file
1380 mtdparts= [MTD] 1424 mtdparts= [MTD]
1381 See drivers/mtd/cmdlinepart.c. 1425 See drivers/mtd/cmdlinepart.c.
1382 1426
1427 onenand.bdry= [HW,MTD] Flex-OneNAND Boundary Configuration
1428
1429 Format: [die0_boundary][,die0_lock][,die1_boundary][,die1_lock]
1430
1431 boundary - index of last SLC block on Flex-OneNAND.
1432 The remaining blocks are configured as MLC blocks.
1433 lock - Configure if Flex-OneNAND boundary should be locked.
1434 Once locked, the boundary cannot be changed.
1435 1 indicates lock status, 0 indicates unlock status.
1436
1383 mtdset= [ARM] 1437 mtdset= [ARM]
1384 ARM/S3C2412 JIVE boot control 1438 ARM/S3C2412 JIVE boot control
1385 1439
@@ -1390,7 +1444,7 @@ and is between 256 and 4096 characters. It is defined in the file
1390 ('y', default) or cooked coordinates ('n') 1444 ('y', default) or cooked coordinates ('n')
1391 1445
1392 mtrr_chunk_size=nn[KMG] [X86] 1446 mtrr_chunk_size=nn[KMG] [X86]
1393 used for mtrr cleanup. It is largest continous chunk 1447 used for mtrr cleanup. It is largest continuous chunk
1394 that could hold holes aka. UC entries. 1448 that could hold holes aka. UC entries.
1395 1449
1396 mtrr_gran_size=nn[KMG] [X86] 1450 mtrr_gran_size=nn[KMG] [X86]
@@ -1575,6 +1629,9 @@ and is between 256 and 4096 characters. It is defined in the file
1575 noinitrd [RAM] Tells the kernel not to load any configured 1629 noinitrd [RAM] Tells the kernel not to load any configured
1576 initial RAM disk. 1630 initial RAM disk.
1577 1631
1632 nointremap [X86-64, Intel-IOMMU] Do not enable interrupt
1633 remapping.
1634
1578 nointroute [IA-64] 1635 nointroute [IA-64]
1579 1636
1580 nojitter [IA64] Disables jitter checking for ITC timers. 1637 nojitter [IA64] Disables jitter checking for ITC timers.
@@ -1660,6 +1717,14 @@ and is between 256 and 4096 characters. It is defined in the file
1660 oprofile.timer= [HW] 1717 oprofile.timer= [HW]
1661 Use timer interrupt instead of performance counters 1718 Use timer interrupt instead of performance counters
1662 1719
1720 oprofile.cpu_type= Force an oprofile cpu type
1721 This might be useful if you have an older oprofile
1722 userland or if you want common events.
1723 Format: { archperfmon }
1724 archperfmon: [X86] Force use of architectural
1725 perfmon on Intel CPUs instead of the
1726 CPU specific event set.
1727
1663 osst= [HW,SCSI] SCSI Tape Driver 1728 osst= [HW,SCSI] SCSI Tape Driver
1664 Format: <buffer_size>,<write_threshold> 1729 Format: <buffer_size>,<write_threshold>
1665 See also Documentation/scsi/st.txt. 1730 See also Documentation/scsi/st.txt.
@@ -1735,6 +1800,9 @@ and is between 256 and 4096 characters. It is defined in the file
1735 root domains (aka PCI segments, in ACPI-speak). 1800 root domains (aka PCI segments, in ACPI-speak).
1736 nommconf [X86] Disable use of MMCONFIG for PCI 1801 nommconf [X86] Disable use of MMCONFIG for PCI
1737 Configuration 1802 Configuration
1803 check_enable_amd_mmconf [X86] check for and enable
1804 properly configured MMIO access to PCI
1805 config space on AMD family 10h CPU
1738 nomsi [MSI] If the PCI_MSI kernel config parameter is 1806 nomsi [MSI] If the PCI_MSI kernel config parameter is
1739 enabled, this kernel boot option can be used to 1807 enabled, this kernel boot option can be used to
1740 disable the use of MSI interrupts system-wide. 1808 disable the use of MSI interrupts system-wide.
@@ -1824,6 +1892,12 @@ and is between 256 and 4096 characters. It is defined in the file
1824 PAGE_SIZE is used as alignment. 1892 PAGE_SIZE is used as alignment.
1825 PCI-PCI bridge can be specified, if resource 1893 PCI-PCI bridge can be specified, if resource
1826 windows need to be expanded. 1894 windows need to be expanded.
1895 ecrc= Enable/disable PCIe ECRC (transaction layer
1896 end-to-end CRC checking).
1897 bios: Use BIOS/firmware settings. This is the
1899 default.
1899 off: Turn ECRC off
1900 on: Turn ECRC on.
1827 1901
1828 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power 1902 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
1829 Management. 1903 Management.
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt
new file mode 100644
index 000000000000..363044609dad
--- /dev/null
+++ b/Documentation/kmemcheck.txt
@@ -0,0 +1,773 @@
1GETTING STARTED WITH KMEMCHECK
2==============================
3
4Vegard Nossum <vegardno@ifi.uio.no>
5
6
7Contents
8========
90. Introduction
101. Downloading
112. Configuring and compiling
123. How to use
133.1. Booting
143.2. Run-time enable/disable
153.3. Debugging
163.4. Annotating false positives
174. Reporting errors
185. Technical description
19
20
210. Introduction
22===============
23
24kmemcheck is a debugging feature for the Linux Kernel. More specifically, it
25is a dynamic checker that detects and warns about some uses of uninitialized
26memory.
27
28Userspace programmers might be familiar with Valgrind's memcheck. The main
29difference between memcheck and kmemcheck is that memcheck works for userspace
30programs only, and kmemcheck works for the kernel only. The implementations
31are of course vastly different. Because of this, kmemcheck is not as accurate
32as memcheck, but it turns out to be good enough in practice to discover real
33programmer errors that the compiler is not able to find through static
34analysis.
35
36Enabling kmemcheck on a kernel will probably slow it down to the extent that
37the machine will not be usable for normal workloads such as e.g. an
38interactive desktop. kmemcheck will also cause the kernel to use about twice
39as much memory as normal. For this reason, kmemcheck is strictly a debugging
40feature.
41
42
431. Downloading
44==============
45
46kmemcheck can only be downloaded using git. If you want to write patches
47against the current code, you should use the kmemcheck development branch of
48the tip tree. It is also possible to use the linux-next tree, which also
49includes the latest version of kmemcheck.
50
51Assuming that you've already cloned the linux-2.6.git repository, all you
52have to do is add the -tip tree as a remote, like this:
53
54 $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
55
56To actually download the tree, fetch the remote:
57
58 $ git fetch tip
59
60And to check out a new local branch with the kmemcheck code:
61
62 $ git checkout -b kmemcheck tip/kmemcheck
63
64General instructions for the -tip tree can be found here:
65http://people.redhat.com/mingo/tip.git/readme.txt
66
67
682. Configuring and compiling
69============================
70
71kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of
72configuration variables must have specific settings in order for the kmemcheck
73menu to even appear in "menuconfig". These are:
74
75 o CONFIG_CC_OPTIMIZE_FOR_SIZE=n
76
77 This option is located under "General setup" / "Optimize for size".
78
79 Without this, gcc will use certain optimizations that usually lead to
80 false positive warnings from kmemcheck. An example of this is a 16-bit
81 field in a struct, where gcc may load 32 bits, then discard the upper
82 16 bits. kmemcheck sees only the 32-bit load, and may trigger a
83 warning for the upper 16 bits (if they're uninitialized).
84
85 o CONFIG_SLAB=y or CONFIG_SLUB=y
86
87 This option is located under "General setup" / "Choose SLAB
88 allocator".
89
90 o CONFIG_FUNCTION_TRACER=n
91
92 This option is located under "Kernel hacking" / "Tracers" / "Kernel
93 Function Tracer"
94
95 When function tracing is compiled in, gcc emits a call to another
96 function at the beginning of every function. This means that when the
97 page fault handler is called, the ftrace framework will be called
98 before kmemcheck has had a chance to handle the fault. If ftrace then
99 modifies memory that was tracked by kmemcheck, the result is an
100 endless recursive page fault.
101
102 o CONFIG_DEBUG_PAGEALLOC=n
103
104 This option is located under "Kernel hacking" / "Debug page memory
105 allocations".
106
107In addition, I highly recommend turning on CONFIG_DEBUG_INFO=y. This is also
108located under "Kernel hacking". With this, you will be able to get line number
109information from the kmemcheck warnings, which is extremely valuable in
110debugging a problem. This option is not mandatory, however, because it slows
111down the compilation process and produces a much bigger kernel image.
112
113Now the kmemcheck menu should be visible (under "Kernel hacking" / "kmemcheck:
114trap use of uninitialized memory"). Here follows a description of the
115kmemcheck configuration variables:
116
117 o CONFIG_KMEMCHECK
118
119 This must be enabled in order to use kmemcheck at all...
120
121 o CONFIG_KMEMCHECK_[DISABLED | ENABLED | ONESHOT]_BY_DEFAULT
122
123 This option controls the status of kmemcheck at boot-time. "Enabled"
124 will enable kmemcheck right from the start, "disabled" will boot the
125 kernel as normal (but with the kmemcheck code compiled in, so it can
126 be enabled at run-time after the kernel has booted), and "one-shot" is
127 a special mode which will turn kmemcheck off automatically after
128 detecting the first use of uninitialized memory.
129
130 If you are using kmemcheck to actively debug a problem, then you
131 probably want to choose "enabled" here.
132
133 The one-shot mode is mostly useful in automated test setups because it
134 can prevent floods of warnings and increase the chances of the machine
135 surviving in case something is really wrong. In other cases, the one-
136 shot mode could actually be counter-productive because it would turn
137 itself off at the very first error -- in the case of a false positive
138 too -- and this would get in the way of debugging the specific
139 problem you were interested in.
140
141 If you would like to use your kernel as normal, but with a chance to
142 enable kmemcheck in case of some problem, it might be a good idea to
143 choose "disabled" here. When kmemcheck is disabled, most of the run-
144 time overhead is not incurred, and the kernel will be almost as fast
145 as normal.
146
147 o CONFIG_KMEMCHECK_QUEUE_SIZE
148
149 Select the maximum number of error reports to store in an internal
150 (fixed-size) buffer. Since errors can occur virtually anywhere and in
151 any context, we need a temporary storage area which is guaranteed not
152 to generate any other page faults when accessed. The queue will be
153 emptied as soon as a tasklet may be scheduled. If the queue is full,
154 new error reports will be lost.
155
156 The default value of 64 is probably fine. If some code produces more
157 than 64 errors within an irqs-off section, then the code is likely to
158 produce many, many more, too, and these additional reports seldom give
159 any more information (the first report is usually the most valuable
160 anyway).
161
162 This number might have to be adjusted if you are not using serial
163 console or similar to capture the kernel log. If you are using the
164 "dmesg" command to save the log, then getting a lot of kmemcheck
165 warnings might overflow the kernel log itself, and the earlier reports
166 will get lost in that way instead. Try setting this to 10 or so on
167 such a setup.
168
169 o CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT
170
171 Select the number of shadow bytes to save along with each entry of the
172 error-report queue. These bytes indicate what parts of an allocation
173 are initialized, uninitialized, etc. and will be displayed when an
174 error is detected to help the debugging of a particular problem.
175
176 The number entered here is actually the logarithm of the number of
177 bytes that will be saved. So if you pick for example 5 here, kmemcheck
178 will save 2^5 = 32 bytes.
179
180 The default value should be fine for debugging most problems. It also
181 fits nicely within 80 columns.
182
183 o CONFIG_KMEMCHECK_PARTIAL_OK
184
185 This option (when enabled) works around certain GCC optimizations that
186 produce 32-bit reads from 16-bit variables where the upper 16 bits are
187 thrown away afterwards.
188
189 The default value (enabled) is recommended. This may of course hide
190 some real errors, but disabling it would probably produce a lot of
191 false positives.
192
193 o CONFIG_KMEMCHECK_BITOPS_OK
194
195 This option silences warnings that would be generated for bit-field
196 accesses where not all the bits are initialized at the same time. This
197 may also hide some real bugs.
198
199 This option is probably obsolete, or it should be replaced with
200 the kmemcheck-/bitfield-annotations for the code in question. The
201 default value is therefore fine.
202
203Now compile the kernel as usual.
204
205
2063. How to use
207=============
208
2093.1. Booting
210============
211
212First some information about the command-line options. There is only one
213option specific to kmemcheck, and this is called "kmemcheck". It can be used
214to override the default mode as chosen by the CONFIG_KMEMCHECK_*_BY_DEFAULT
215option. Its possible settings are:
216
217 o kmemcheck=0 (disabled)
218 o kmemcheck=1 (enabled)
219 o kmemcheck=2 (one-shot mode)
220
221If SLUB debugging has been enabled in the kernel, it may take precedence over
222kmemcheck in such a way that the slab caches which are under SLUB debugging
223will not be tracked by kmemcheck. In order to ensure that this doesn't happen
224(even though it shouldn't by default), use SLUB's boot option "slub_debug",
225like this: slub_debug=-
226
227In fact, this option may also be used for fine-grained control over SLUB vs.
228kmemcheck. For example, if the command line includes "kmemcheck=1
229slub_debug=,dentry", then SLUB debugging will be used only for the "dentry"
230slab cache, and with kmemcheck tracking all the other caches. This is advanced
231usage, however, and is not generally recommended.
232
233
2343.2. Run-time enable/disable
235============================
236
237When the kernel has booted, it is possible to enable or disable kmemcheck at
238run-time. WARNING: This feature is still experimental and may cause false
239positive warnings to appear. Therefore, try not to use this. If you find that
240it doesn't work properly (e.g. you see an unreasonable amount of warnings), I
241will be happy to take bug reports.
242
243Use the file /proc/sys/kernel/kmemcheck for this purpose, e.g.:
244
245 $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck
246
247The numbers are the same as for the kmemcheck= command-line option.
248
249
2503.3. Debugging
251==============
252
253A typical report will look something like this:
254
255WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
25680000000000000000000000000000000000000000088ffff0000000000000000
257 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
258 ^
259
260Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A
261RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
262RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002
263RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
264RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84
265RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000
266R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e
267R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8
268FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000
269CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033
270CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0
271DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
272DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
273 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
274 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
275 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
276 [<ffffffff8100c7b5>] int_signal+0x12/0x17
277 [<ffffffffffffffff>] 0xffffffffffffffff
278
279The single most valuable piece of information in this report is the RIP (or EIP on 32-
280bit) value. This will help us pinpoint exactly which instruction caused
281the warning.
282
283If your kernel was compiled with CONFIG_DEBUG_INFO=y, then all we have to do
284is give this address to the addr2line program, like this:
285
286 $ addr2line -e vmlinux -i ffffffff8104ede8
287 arch/x86/include/asm/string_64.h:12
288 include/asm-generic/siginfo.h:287
289 kernel/signal.c:380
290 kernel/signal.c:410
291
292The "-e vmlinux" tells addr2line which file to look in. IMPORTANT: This must
293be the vmlinux of the kernel that produced the warning in the first place! If
294not, the line number information will almost certainly be wrong.
295
296The "-i" tells addr2line to also print the line numbers of inlined functions.
297In this case, the flag was very important, because otherwise, it would only
298have printed the first line, which is just a call to memcpy(), which could be
299called from a thousand places in the kernel, and is therefore not very useful.
300These inlined functions would not show up in the stack trace above, simply
301because the kernel doesn't load the extra debugging information. This
302technique can of course be used with ordinary kernel oopses as well.
303
304In this case, it's the caller of memcpy() that is interesting, and it can be
305found in include/asm-generic/siginfo.h, line 287:
306
307281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
308282 {
309283 if (from->si_code < 0)
310284 memcpy(to, from, sizeof(*to));
311285 else
312286 /* _sigchld is currently the largest know union member */
313287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
314288 }
315
316Since this was a read (kmemcheck usually warns about reads only, though it can
317warn about writes to unallocated or freed memory as well), it was probably the
318"from" argument which contained some uninitialized bytes. Following the chain
319of calls, we move upwards to see where "from" was allocated or initialized,
320kernel/signal.c, line 380:
321
322359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
323360 {
324...
325367 list_for_each_entry(q, &list->list, list) {
326368 if (q->info.si_signo == sig) {
327369 if (first)
328370 goto still_pending;
329371 first = q;
330...
331377 if (first) {
332378 still_pending:
333379 list_del_init(&first->list);
334380 copy_siginfo(info, &first->info);
335381 __sigqueue_free(first);
336...
337392 }
338393 }
339
340Here, it is &first->info that is being passed on to copy_siginfo(). The
341variable "first" was found on a list -- passed in as the second argument to
342collect_signal(). We continue our journey through the stack, to figure out
343where the item on "list" was allocated or initialized. We move to line 410:
344
345395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
346396 siginfo_t *info)
347397 {
348...
349410 collect_signal(sig, pending, info);
350...
351414 }
352
353Now we need to follow the "pending" pointer, since that is being passed on to
354collect_signal() as "list". At this point, we've run out of lines from the
355"addr2line" output. Not to worry, we just paste the next addresses from the
356kmemcheck stack dump, i.e.:
357
358 [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
359 [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
360 [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
361 [<ffffffff8100c7b5>] int_signal+0x12/0x17
362
363 $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \
364 ffffffff8100b87d ffffffff8100c7b5
365 kernel/signal.c:446
366 kernel/signal.c:1806
367 arch/x86/kernel/signal.c:805
368 arch/x86/kernel/signal.c:871
369 arch/x86/kernel/entry_64.S:694
370
371Remember that since these addresses were found on the stack and not as the
372RIP value, they actually point to the _next_ instruction (they are return
373addresses). This becomes obvious when we look at the code for line 446:
374
375422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
376423 {
377...
378431 signr = __dequeue_signal(&tsk->signal->shared_pending,
379432 mask, info);
380433 /*
381434 * itimer signal ?
382435 *
383436 * itimers are process shared and we restart periodic
384437 * itimers in the signal delivery path to prevent DoS
385438 * attacks in the high resolution timer case. This is
386439 * compliant with the old way of self restarting
387440 * itimers, as the SIGALRM is a legacy signal and only
388441 * queued once. Changing the restart behaviour to
389442 * restart the timer in the signal dequeue path is
390443 * reducing the timer noise on heavy loaded !highres
391444 * systems too.
392445 */
393446 if (unlikely(signr == SIGALRM)) {
394...
395489 }
396
397So instead of looking at 446, we should be looking at 431, which is the line
398that executes just before 446. Here we see that what we are looking for is
399&tsk->signal->shared_pending.
400
401Our next task is now to figure out which function puts items on this
402"shared_pending" list. A crude, but efficient tool, is git grep:
403
404 $ git grep -n 'shared_pending' kernel/
405 ...
406 kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending;
407 kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending;
408 ...
409
410There were more results, but none of them were related to list operations,
411and these were the only assignments. We inspect the line numbers more closely
412and find that this is indeed where items are being added to the list:
413
414816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
415817 int group)
416818 {
417...
418828 pending = group ? &t->signal->shared_pending : &t->pending;
419...
420851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
421852 (is_si_special(info) ||
422853 info->si_code >= 0)));
423854 if (q) {
424855 list_add_tail(&q->list, &pending->list);
425...
426890 }
427
428and:
429
4301309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
4311310 {
432....
4331339 pending = group ? &t->signal->shared_pending : &t->pending;
4341340 list_add_tail(&q->list, &pending->list);
435....
4361347 }
437
438In the first case, the list element we are looking for, "q", is being returned
439from the function __sigqueue_alloc(), which looks like an allocation function.
440Let's take a look at it:
441
442187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
443188 int override_rlimit)
444189 {
445190 struct sigqueue *q = NULL;
446191 struct user_struct *user;
447192
448193 /*
449194 * We won't get problems with the target's UID changing under us
450195 * because changing it requires RCU be used, and if t != current, the
451196 * caller must be holding the RCU readlock (by way of a spinlock) and
452197 * we use RCU protection here
453198 */
454199 user = get_uid(__task_cred(t)->user);
455200 atomic_inc(&user->sigpending);
456201 if (override_rlimit ||
457202 atomic_read(&user->sigpending) <=
458203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
459204 q = kmem_cache_alloc(sigqueue_cachep, flags);
460205 if (unlikely(q == NULL)) {
461206 atomic_dec(&user->sigpending);
462207 free_uid(user);
463208 } else {
464209 INIT_LIST_HEAD(&q->list);
465210 q->flags = 0;
466211 q->user = user;
467212 }
468213
469214 return q;
470215 }
471
472We see that this function initializes q->list, q->flags, and q->user. It seems
473that now is the time to look at the definition of "struct sigqueue", e.g.:
474
47514 struct sigqueue {
47615 struct list_head list;
47716 int flags;
47817 siginfo_t info;
47918 struct user_struct *user;
48019 };
481
482And, you might remember, it was a memcpy() on &first->info that caused the
483warning, so this makes perfect sense. It also seems reasonable to assume that
484it is the caller of __sigqueue_alloc() that has the responsibility of filling
485out (initializing) this member.
486
487But just which fields of the struct were uninitialized? Let's look at
488kmemcheck's report again:
489
490WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
49180000000000000000000000000000000000000000088ffff0000000000000000
492 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
493 ^
494
495These first two lines are the memory dump of the memory object itself, and the
496shadow bytemap, respectively. The memory object itself is in this case
497&first->info. Just beware that the start of this dump is NOT the start of the
498object itself! The position of the caret (^) corresponds with the address of
499the read (ffff88003e4a2024).
500
501The shadow bytemap dump legend is as follows:
502
503 i - initialized
504 u - uninitialized
505 a - unallocated (memory has been allocated by the slab layer, but has not
506 yet been handed off to anybody)
507 f - freed (memory has been allocated by the slab layer, but has been freed
508 by the previous owner)
509
510In order to figure out where (relative to the start of the object) the
511uninitialized memory was located, we have to look at the disassembly. For
512that, we'll need the RIP address again:
513
514RIP: 0010:[<ffffffff8104ede8>] [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
515
516 $ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8:
517 ffffffff8104edc8: mov %r8,0x8(%r8)
518 ffffffff8104edcc: test %r10d,%r10d
519 ffffffff8104edcf: js ffffffff8104ee88 <__dequeue_signal+0x168>
520 ffffffff8104edd5: mov %rax,%rdx
521 ffffffff8104edd8: mov $0xc,%ecx
522 ffffffff8104eddd: mov %r13,%rdi
523 ffffffff8104ede0: mov $0x30,%eax
524 ffffffff8104ede5: mov %rdx,%rsi
525 ffffffff8104ede8: rep movsl %ds:(%rsi),%es:(%rdi)
526 ffffffff8104edea: test $0x2,%al
527 ffffffff8104edec: je ffffffff8104edf0 <__dequeue_signal+0xd0>
528 ffffffff8104edee: movsw %ds:(%rsi),%es:(%rdi)
529 ffffffff8104edf0: test $0x1,%al
530 ffffffff8104edf2: je ffffffff8104edf5 <__dequeue_signal+0xd5>
531 ffffffff8104edf4: movsb %ds:(%rsi),%es:(%rdi)
532 ffffffff8104edf5: mov %r8,%rdi
533 ffffffff8104edf8: callq ffffffff8104de60 <__sigqueue_free>
534
535As expected, it's the "rep movsl" instruction from the memcpy() that causes
536the warning. We know about REP MOVSL that it uses the register RCX to count
537the number of remaining iterations. By taking a look at the register dump
538again (from the kmemcheck report), we can figure out how many bytes were left
539to copy:
540
541RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
542
543By looking at the disassembly, we also see that %ecx is being loaded with the
544value $0xc just before (ffffffff8104edd8), so we are very lucky. Keep in mind
545that this is the number of iterations, not bytes. And since this is a "long"
546operation, we need to multiply by 4 to get the number of bytes. So this means
547that the uninitialized value was encountered at 4 * (0xc - 0x9) = 12 bytes
548from the start of the object.
549
550We can now try to figure out which field of the "struct siginfo" was not
551initialized. This is the beginning of the struct:
552
55340 typedef struct siginfo {
55441 int si_signo;
55542 int si_errno;
55643 int si_code;
55744
55845 union {
559..
56092 } _sifields;
56193 } siginfo_t;
562
563On 64-bit, the int is 4 bytes long, so it must be the union member that has
564not been initialized. We can verify this using gdb:
565
566 $ gdb vmlinux
567 ...
568 (gdb) p &((struct siginfo *) 0)->_sifields
569 $1 = (union {...} *) 0x10
570
571Actually, it seems that the union member is located at offset 0x10 -- which
572means that gcc has inserted 4 bytes of padding between the members si_code
573and _sifields. We can now get a fuller picture of the memory dump:
574
575 _----------------------------=> si_code
576 / _--------------------=> (padding)
577 | / _------------=> _sifields(._kill._pid)
578 | | / _----=> _sifields(._kill._uid)
579 | | | /
580-------|-------|-------|-------|
58180000000000000000000000000000000000000000088ffff0000000000000000
582 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
583
584This allows us to realize another important fact: si_code contains the value
5850x80. Remember that x86 is little endian, so the first 4 bytes "80000000" are
586really the number 0x00000080. With a bit of research, we find that this is
587actually the constant SI_KERNEL defined in include/asm-generic/siginfo.h:
588
589144 #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */
590
591This macro is used in exactly one place in the x86 kernel: In send_signal()
592in kernel/signal.c:
593
594816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
595817 int group)
596818 {
597...
598828 pending = group ? &t->signal->shared_pending : &t->pending;
599...
600851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
601852 (is_si_special(info) ||
602853 info->si_code >= 0)));
603854 if (q) {
604855 list_add_tail(&q->list, &pending->list);
605856 switch ((unsigned long) info) {
606...
607865 case (unsigned long) SEND_SIG_PRIV:
608866 q->info.si_signo = sig;
609867 q->info.si_errno = 0;
610868 q->info.si_code = SI_KERNEL;
611869 q->info.si_pid = 0;
612870 q->info.si_uid = 0;
613871 break;
614...
615890 }
616
617Not only does this match with the .si_code member, it also matches the place
618we found earlier when looking for where siginfo_t objects are enqueued on the
619"shared_pending" list.
620
621So to sum up: It seems that it is the padding introduced by the compiler
622between two struct fields that is uninitialized, and this gets reported when
623we do a memcpy() on the struct. This means that we have identified a false
624positive warning.
625
626Normally, kmemcheck will not report uninitialized accesses in memcpy() calls
627when both the source and destination addresses are tracked. (Instead, we copy
628the shadow bytemap as well). In this case, the destination address clearly
629was not tracked. We can dig a little deeper into the stack trace from above:
630
631 arch/x86/kernel/signal.c:805
632 arch/x86/kernel/signal.c:871
633 arch/x86/kernel/entry_64.S:694
634
635And we clearly see that the destination siginfo object is located on the
636stack:
637
638782 static void do_signal(struct pt_regs *regs)
639783 {
640784 struct k_sigaction ka;
641785 siginfo_t info;
642...
643804 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
644...
645854 }
646
647And this &info is what eventually gets passed to copy_siginfo() as the
648destination argument.
649
650Now, even though we didn't find an actual error here, the example is still a
651good one, because it shows how one would go about finding out what the report
652was all about.
653
654
6553.4. Annotating false positives
656===============================
657
658There are a few different ways to make annotations in the source code that
659will keep kmemcheck from checking and reporting certain allocations. Here
660they are:
661
662 o __GFP_NOTRACK_FALSE_POSITIVE
663
664 This flag can be passed to kmalloc() or kmem_cache_alloc() (therefore
665 also to other functions that end up calling one of these) to indicate
666 that the allocation should not be tracked because it would lead to
667 a false positive report. This is a "big hammer" way of silencing
668 kmemcheck; after all, even if the false positive pertains to
669 a particular field in a struct, for example, we will now lose the
670 ability to find (real) errors in other parts of the same struct.
671
672 Example:
673
674 /* No warnings will ever trigger on accessing any part of x */
675 x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);
676
677 o kmemcheck_bitfield_begin(name)/kmemcheck_bitfield_end(name) and
678 kmemcheck_annotate_bitfield(ptr, name)
679
680 The first two of these three macros can be used inside struct
681 definitions to signal, respectively, the beginning and end of a
682 bitfield. Additionally, this will assign the bitfield a name, which
683 is given as an argument to the macros.
684
685 Having used these markers, one can later use
686 kmemcheck_annotate_bitfield() at the point of allocation, to indicate
687 which parts of the allocation are part of a bitfield.
688
689 Example:
690
691 struct foo {
692 int x;
693
694 kmemcheck_bitfield_begin(flags);
695 int flag_a:1;
696 int flag_b:1;
697 kmemcheck_bitfield_end(flags);
698
699 int y;
700 };
701
702 struct foo *x = kmalloc(sizeof *x);
703
704 /* No warnings will trigger on accessing the bitfield of x */
705 kmemcheck_annotate_bitfield(x, flags);
706
707 Note that kmemcheck_annotate_bitfield() can be used even before the
708 return value of kmalloc() is checked -- in other words, passing NULL
709 as the first argument is legal (and will do nothing).
710
711
7124. Reporting errors
713===================
714
715As we have seen, kmemcheck will produce false positive reports. Therefore, it
716is not very wise to blindly post kmemcheck warnings to mailing lists and
717maintainers. Instead, I encourage maintainers and developers to find errors
718in their own code. If you get a warning, you can try to work around it, try
719to figure out if it's a real error or not, or simply ignore it. Most
720developers know their own code and will quickly and efficiently determine the
721root cause of a kmemcheck report. This is therefore also the most efficient
722way to work with kmemcheck.
723
724That said, we (the kmemcheck maintainers) will always be on the lookout for
725false positives that we can annotate and silence. So whatever you find,
726please drop us a note privately! Kernel configs and steps to reproduce (if
727available) are of course a great help too.
728
729Happy hacking!
730
731
7325. Technical description
733========================
734
735kmemcheck works by marking memory pages non-present. This means that whenever
736somebody attempts to access the page, a page fault is generated. The page
737fault handler notices that the page was in fact only hidden, and so it calls
738on the kmemcheck code to make further investigations.
739
740When the investigations are completed, kmemcheck "shows" the page by marking
741it present (as it would be under normal circumstances). This way, the
742interrupted code can continue as usual.
743
744But after the instruction has been executed, we should hide the page again, so
745that we can catch the next access too! Now kmemcheck makes use of a debugging
746feature of the processor, namely single-stepping. When the processor has
747finished the one instruction that generated the memory access, a debug
748exception is raised. From here, we simply hide the page again and continue
749execution, this time with the single-stepping feature turned off.
750
751kmemcheck requires some assistance from the memory allocator in order to work.
752The memory allocator needs to
753
754 1. Tell kmemcheck about newly allocated pages and pages that are about to
755 be freed. This allows kmemcheck to set up and tear down the shadow memory
756 for the pages in question. The shadow memory stores the status of each
757 byte in the allocation proper, e.g. whether it is initialized or
758 uninitialized.
759
760 2. Tell kmemcheck which parts of memory should be marked uninitialized.
761 There are actually a few more states, such as "not yet allocated" and
762 "recently freed".
763
764If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
765memory that can take page faults because of kmemcheck.
766
767If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
768request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags.
769This does not prevent the page faults from occurring, however, but marks the
770object in question as being initialized so that no warnings will ever be
771produced for this object.
772
773Currently, the SLAB and SLUB allocators are supported by kmemcheck.
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt
new file mode 100644
index 000000000000..0112da3b9ab8
--- /dev/null
+++ b/Documentation/kmemleak.txt
@@ -0,0 +1,142 @@
1Kernel Memory Leak Detector
2===========================
3
4Introduction
5------------
6
7Kmemleak provides a way of detecting possible kernel memory leaks in a
8way similar to a tracing garbage collector
9(http://en.wikipedia.org/wiki/Garbage_collection_%28computer_science%29#Tracing_garbage_collectors),
10with the difference that the orphan objects are not freed but only
11reported via /sys/kernel/debug/kmemleak. A similar method is used by the
12Valgrind tool (memcheck --leak-check) to detect the memory leaks in
13user-space applications.
14
15Usage
16-----
17
18CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel
19thread scans the memory every 10 minutes (by default) and prints any new
20unreferenced objects found. To trigger an intermediate scan and display
21all the possible memory leaks:
22
23 # mount -t debugfs nodev /sys/kernel/debug/
24 # cat /sys/kernel/debug/kmemleak
25
26Note that the orphan objects are listed in the order they were allocated
27and one object at the beginning of the list may cause other subsequent
28objects to be reported as orphan.
29
30Memory scanning parameters can be modified at run-time by writing to the
31/sys/kernel/debug/kmemleak file. The following parameters are supported:
32
33 off - disable kmemleak (irreversible)
34 stack=on - enable the task stacks scanning
35 stack=off - disable the task stacks scanning
36 scan=on - start the automatic memory scanning thread
37 scan=off - stop the automatic memory scanning thread
38 scan=<secs> - set the automatic memory scanning period in seconds (0
39 to disable it)
40
41Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on
42the kernel command line.
43
44Basic Algorithm
45---------------
46
47The memory allocations via kmalloc, vmalloc, kmem_cache_alloc and
48friends are traced and the pointers, together with additional
49information like size and stack trace, are stored in a prio search tree.
50The corresponding freeing function calls are tracked and the pointers
51removed from the kmemleak data structures.
52
53An allocated block of memory is considered orphan if no pointer to its
54start address or to any location inside the block can be found by
55scanning the memory (including saved registers). This means that there
56might be no way for the kernel to pass the address of the allocated
57block to a freeing function and therefore the block is considered a
58memory leak.
59
60The scanning algorithm steps:
61
62 1. mark all objects as white (remaining white objects will later be
63 considered orphan)
64 2. scan the memory starting with the data section and stacks, checking
65 the values against the addresses stored in the prio search tree. If
66 a pointer to a white object is found, the object is added to the
67 gray list
68 3. scan the gray objects for matching addresses (some white objects
69 can become gray and added at the end of the gray list) until the
70 gray set is finished
71 4. the remaining white objects are considered orphan and reported via
72 /sys/kernel/debug/kmemleak
73
74Some allocated memory blocks have pointers stored in the kernel's
75internal data structures and they cannot be detected as orphans. To
76avoid this, kmemleak can also store the number of values pointing to an
77address inside the block address range that need to be found so that the
78block is not considered a leak. One example is __vmalloc().
79
80Kmemleak API
81------------
82
83See the include/linux/kmemleak.h header for the function prototypes.
84
85kmemleak_init - initialize kmemleak
86kmemleak_alloc - notify of a memory block allocation
87kmemleak_free - notify of a memory block freeing
88kmemleak_not_leak - mark an object as not a leak
89kmemleak_ignore - do not scan or report an object as leak
90kmemleak_scan_area - add scan areas inside a memory block
91kmemleak_no_scan - do not scan a memory block
92kmemleak_erase - erase an old value in a pointer variable
93kmemleak_alloc_recursive - as kmemleak_alloc but checks the recursiveness
94kmemleak_free_recursive - as kmemleak_free but checks the recursiveness
95
96Dealing with false positives/negatives
97--------------------------------------
98
99The false negatives are real memory leaks (orphan objects) but not
100reported by kmemleak because values found during the memory scanning
101point to such objects. To reduce the number of false negatives, kmemleak
102provides the kmemleak_ignore, kmemleak_scan_area, kmemleak_no_scan and
103kmemleak_erase functions (see above). The task stacks also increase the
104amount of false negatives and their scanning is not enabled by default.
105
106The false positives are objects wrongly reported as being memory leaks
107(orphan). For objects known not to be leaks, kmemleak provides the
108kmemleak_not_leak function. The kmemleak_ignore could also be used if
109the memory block is known not to contain other pointers and it will no
110longer be scanned.
111
112Some of the reported leaks are only transient, especially on SMP
113systems, because of pointers temporarily stored in CPU registers or
114stacks. Kmemleak defines MSECS_MIN_AGE (defaulting to 1000) representing
115the minimum age of an object to be reported as a memory leak.
116
117Limitations and Drawbacks
118-------------------------
119
120The main drawback is the reduced performance of memory allocation and
121freeing. To avoid other penalties, the memory scanning is only performed
122when the /sys/kernel/debug/kmemleak file is read. Anyway, this tool is
123intended for debugging purposes where the performance might not be the
124most important requirement.
125
126To keep the algorithm simple, kmemleak scans for values pointing to any
127address inside a block's address range. This may lead to an increased
128number of false negatives. However, it is likely that a real memory leak
129will eventually become visible.
130
131Another source of false negatives is the data stored in non-pointer
132values. In a future version, kmemleak could only scan the pointer
133members in the allocated structures. This feature would solve many of
134the false negative cases described above.
135
136The tool can report false positives. These are cases where an allocated
137block doesn't need to be freed (some cases in the init_call functions),
138the pointer is calculated by other methods than the usual container_of
139macro or the pointer is stored in a location not scanned by kmemleak.
140
141Page allocations and ioremap are not tracked. Only the ARM and x86
142architectures are currently supported.
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
index b2e374586bd8..c79ab996dada 100644
--- a/Documentation/kobject.txt
+++ b/Documentation/kobject.txt
@@ -132,7 +132,7 @@ kobject_name():
132 const char *kobject_name(const struct kobject * kobj); 132 const char *kobject_name(const struct kobject * kobj);
133 133
134There is a helper function to both initialize and add the kobject to the 134There is a helper function to both initialize and add the kobject to the
135kernel at the same time, called supprisingly enough kobject_init_and_add(): 135kernel at the same time, called surprisingly enough kobject_init_and_add():
136 136
137 int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, 137 int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype,
138 struct kobject *parent, const char *fmt, ...); 138 struct kobject *parent, const char *fmt, ...);
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 1e7a769a10f9..053037a1fe6d 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -507,9 +507,9 @@ http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115)
507Appendix A: The kprobes debugfs interface 507Appendix A: The kprobes debugfs interface
508 508
509With recent kernels (> 2.6.20) the list of registered kprobes is visible 509With recent kernels (> 2.6.20) the list of registered kprobes is visible
510under the /debug/kprobes/ directory (assuming debugfs is mounted at /debug). 510under the /sys/kernel/debug/kprobes/ directory (assuming debugfs is mounted at /sys/kernel/debug).
511 511
512/debug/kprobes/list: Lists all registered probes on the system 512/sys/kernel/debug/kprobes/list: Lists all registered probes on the system
513 513
514c015d71a k vfs_read+0x0 514c015d71a k vfs_read+0x0
515c011a316 j do_fork+0x0 515c011a316 j do_fork+0x0
@@ -525,7 +525,7 @@ virtual addresses that correspond to modules that've been unloaded),
525such probes are marked with [GONE]. If the probe is temporarily disabled, 525such probes are marked with [GONE]. If the probe is temporarily disabled,
526such probes are marked with [DISABLED]. 526such probes are marked with [DISABLED].
527 527
528/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. 528/sys/kernel/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
529 529
530Provides a knob to globally and forcibly turn registered kprobes ON or OFF. 530Provides a knob to globally and forcibly turn registered kprobes ON or OFF.
531By default, all kprobes are enabled. By echoing "0" to this file, all 531By default, all kprobes are enabled. By echoing "0" to this file, all
diff --git a/Documentation/laptops/acer-wmi.txt b/Documentation/laptops/acer-wmi.txt
index 5ee2a02b3b40..0768fcc3ba3e 100644
--- a/Documentation/laptops/acer-wmi.txt
+++ b/Documentation/laptops/acer-wmi.txt
@@ -40,7 +40,7 @@ NOTE: The Acer Aspire One is not supported hardware. It cannot work with
40acer-wmi until Acer fix their ACPI-WMI implementation on them, so has been 40acer-wmi until Acer fix their ACPI-WMI implementation on them, so has been
41blacklisted until that happens. 41blacklisted until that happens.
42 42
43Please see the website for the current list of known working hardare: 43Please see the website for the current list of known working hardware:
44 44
45http://code.google.com/p/aceracpi/wiki/SupportedHardware 45http://code.google.com/p/aceracpi/wiki/SupportedHardware
46 46
diff --git a/Documentation/laptops/sony-laptop.txt b/Documentation/laptops/sony-laptop.txt
index 8b2bc1572d98..23ce7d350d1a 100644
--- a/Documentation/laptops/sony-laptop.txt
+++ b/Documentation/laptops/sony-laptop.txt
@@ -22,7 +22,7 @@ If your laptop model supports it, you will find sysfs files in the
22/sys/class/backlight/sony/ 22/sys/class/backlight/sony/
23directory. You will be able to query and set the current screen 23directory. You will be able to query and set the current screen
24brightness: 24brightness:
25 brightness get/set screen brightness (an iteger 25 brightness get/set screen brightness (an integer
26 between 0 and 7) 26 between 0 and 7)
27 actual_brightness reading from this file will query the HW 27 actual_brightness reading from this file will query the HW
28 to get real brightness value 28 to get real brightness value
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index e7e9a69069e1..f2296ecedb89 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -506,7 +506,7 @@ generate input device EV_KEY events.
506In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW 506In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW
507events for switches: 507events for switches:
508 508
509SW_RFKILL_ALL T60 and later hardare rfkill rocker switch 509SW_RFKILL_ALL T60 and later hardware rfkill rocker switch
510SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A 510SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A
511 511
512Non hot-key ACPI HKEY event map: 512Non hot-key ACPI HKEY event map:
@@ -920,7 +920,7 @@ The available commands are:
920 echo '<LED number> off' >/proc/acpi/ibm/led 920 echo '<LED number> off' >/proc/acpi/ibm/led
921 echo '<LED number> blink' >/proc/acpi/ibm/led 921 echo '<LED number> blink' >/proc/acpi/ibm/led
922 922
923The <LED number> range is 0 to 7. The set of LEDs that can be 923The <LED number> range is 0 to 15. The set of LEDs that can be
924controlled varies from model to model. Here is the common ThinkPad 924controlled varies from model to model. Here is the common ThinkPad
925mapping: 925mapping:
926 926
@@ -932,6 +932,11 @@ mapping:
932 5 - UltraBase battery slot 932 5 - UltraBase battery slot
933 6 - (unknown) 933 6 - (unknown)
934 7 - standby 934 7 - standby
935 8 - dock status 1
936 9 - dock status 2
937 10, 11 - (unknown)
938 12 - thinkvantage
939 13, 14, 15 - (unknown)
935 940
936All of the above can be turned on and off and can be made to blink. 941All of the above can be turned on and off and can be made to blink.
937 942
@@ -940,10 +945,12 @@ sysfs notes:
940The ThinkPad LED sysfs interface is described in detail by the LED class 945The ThinkPad LED sysfs interface is described in detail by the LED class
941documentation, in Documentation/leds-class.txt. 946documentation, in Documentation/leds-class.txt.
942 947
943The leds are named (in LED ID order, from 0 to 7): 948The LEDs are named (in LED ID order, from 0 to 12):
944"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt", 949"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt",
945"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt", 950"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt",
946"tpacpi::unknown_led", "tpacpi::standby". 951"tpacpi::unknown_led", "tpacpi::standby", "tpacpi::dock_status1",
952"tpacpi::dock_status2", "tpacpi::unknown_led2", "tpacpi::unknown_led3",
953"tpacpi::thinkvantage".
947 954
948Due to limitations in the sysfs LED class, if the status of the LED 955Due to limitations in the sysfs LED class, if the status of the LED
949indicators cannot be read due to an error, thinkpad-acpi will report it as 956indicators cannot be read due to an error, thinkpad-acpi will report it as
@@ -958,6 +965,12 @@ ThinkPad indicator LED should blink in hardware accelerated mode, use the
958"timer" trigger, and leave the delay_on and delay_off parameters set to 965"timer" trigger, and leave the delay_on and delay_off parameters set to
959zero (to request hardware acceleration autodetection). 966zero (to request hardware acceleration autodetection).
960 967
968LEDs that are known not to exist in a given ThinkPad model are not
969made available through the sysfs interface. If you have a dock and you
970notice there are LEDs listed for your ThinkPad that do not exist (and
971are not in the dock), or if you notice that there are missing LEDs,
972a report to ibm-acpi-devel@lists.sourceforge.net is appreciated.
973
961 974
962ACPI sounds -- /proc/acpi/ibm/beep 975ACPI sounds -- /proc/acpi/ibm/beep
963---------------------------------- 976----------------------------------
@@ -1156,17 +1169,19 @@ may not be distinct. Later Lenovo models that implement the ACPI
1156display backlight brightness control methods have 16 levels, ranging 1169display backlight brightness control methods have 16 levels, ranging
1157from 0 to 15. 1170from 0 to 15.
1158 1171
1159There are two interfaces to the firmware for direct brightness control, 1172For IBM ThinkPads, there are two interfaces to the firmware for direct
1160EC and UCMS (or CMOS). To select which one should be used, use the 1173brightness control, EC and UCMS (or CMOS). To select which one should be
1161brightness_mode module parameter: brightness_mode=1 selects EC mode, 1174used, use the brightness_mode module parameter: brightness_mode=1 selects
1162brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC 1175EC mode, brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC
1163mode with NVRAM backing (so that brightness changes are remembered 1176mode with NVRAM backing (so that brightness changes are remembered across
1164across shutdown/reboot). 1177shutdown/reboot).
1165 1178
1166The driver tries to select which interface to use from a table of 1179The driver tries to select which interface to use from a table of
1167defaults for each ThinkPad model. If it makes a wrong choice, please 1180defaults for each ThinkPad model. If it makes a wrong choice, please
1168report this as a bug, so that we can fix it. 1181report this as a bug, so that we can fix it.
1169 1182
1183Lenovo ThinkPads only support brightness_mode=2 (UCMS).
1184
1170When display backlight brightness controls are available through the 1185When display backlight brightness controls are available through the
1171standard ACPI interface, it is best to use it instead of this direct 1186standard ACPI interface, it is best to use it instead of this direct
1172ThinkPad-specific interface. The driver will disable its native 1187ThinkPad-specific interface. The driver will disable its native
@@ -1254,7 +1269,7 @@ Fan control and monitoring: fan speed, fan enable/disable
1254 1269
1255procfs: /proc/acpi/ibm/fan 1270procfs: /proc/acpi/ibm/fan
1256sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1, 1271sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1,
1257 pwm1_enable 1272 pwm1_enable, fan2_input
1258sysfs hwmon driver attributes: fan_watchdog 1273sysfs hwmon driver attributes: fan_watchdog
1259 1274
1260NOTE NOTE NOTE: fan control operations are disabled by default for 1275NOTE NOTE NOTE: fan control operations are disabled by default for
@@ -1267,6 +1282,9 @@ from the hardware registers of the embedded controller. This is known
1267to work on later R, T, X and Z series ThinkPads but may show a bogus 1282to work on later R, T, X and Z series ThinkPads but may show a bogus
1268value on other models. 1283value on other models.
1269 1284
1285Some Lenovo ThinkPads support a secondary fan. This fan cannot be
1286controlled separately, it shares the main fan control.
1287
1270Fan levels: 1288Fan levels:
1271 1289
1272Most ThinkPad fans work in "levels" at the firmware interface. Level 0 1290Most ThinkPad fans work in "levels" at the firmware interface. Level 0
@@ -1397,6 +1415,11 @@ hwmon device attribute fan1_input:
1397 which can take up to two minutes. May return rubbish on older 1415 which can take up to two minutes. May return rubbish on older
1398 ThinkPads. 1416 ThinkPads.
1399 1417
1418hwmon device attribute fan2_input:
1419 Fan tachometer reading, in RPM, for the secondary fan.
1420 Available only on some ThinkPads. If the secondary fan is
1421 not installed, will always read 0.
1422
1400hwmon driver attribute fan_watchdog: 1423hwmon driver attribute fan_watchdog:
1401 Fan safety watchdog timer interval, in seconds. Minimum is 1424 Fan safety watchdog timer interval, in seconds. Minimum is
1402 1 second, maximum is 120 seconds. 0 disables the watchdog. 1425 1 second, maximum is 120 seconds. 0 disables the watchdog.
@@ -1555,3 +1578,7 @@ Sysfs interface changelog:
15550x020300: hotkey enable/disable support removed, attributes 15780x020300: hotkey enable/disable support removed, attributes
1556 hotkey_bios_enabled and hotkey_enable deprecated and 1579 hotkey_bios_enabled and hotkey_enable deprecated and
1557 marked for removal. 1580 marked for removal.
1581
15820x020400: Marker for 16 LEDs support. Also, LEDs that are known
1583 to not exist in a given model are not registered with
1584 the LED sysfs class anymore.
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index 1f4f9e888bd1..28c8cdfcafd8 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,6 +1,5 @@
1# This creates the demonstration utility "lguest" which runs a Linux guest. 1# This creates the demonstration utility "lguest" which runs a Linux guest.
2CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE 2CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE
3LDLIBS:=-lz
4 3
5all: lguest 4all: lguest
6 5
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index d36fcc0f2715..9ebcd6ef361b 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -16,6 +16,7 @@
16#include <sys/types.h> 16#include <sys/types.h>
17#include <sys/stat.h> 17#include <sys/stat.h>
18#include <sys/wait.h> 18#include <sys/wait.h>
19#include <sys/eventfd.h>
19#include <fcntl.h> 20#include <fcntl.h>
20#include <stdbool.h> 21#include <stdbool.h>
21#include <errno.h> 22#include <errno.h>
@@ -59,7 +60,6 @@ typedef uint8_t u8;
59/*:*/ 60/*:*/
60 61
61#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 62#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
62#define NET_PEERNUM 1
63#define BRIDGE_PFX "bridge:" 63#define BRIDGE_PFX "bridge:"
64#ifndef SIOCBRADDIF 64#ifndef SIOCBRADDIF
65#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 65#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
@@ -76,19 +76,12 @@ static bool verbose;
76 do { if (verbose) printf(args); } while(0) 76 do { if (verbose) printf(args); } while(0)
77/*:*/ 77/*:*/
78 78
79/* File descriptors for the Waker. */
80struct {
81 int pipe[2];
82 int lguest_fd;
83} waker_fds;
84
85/* The pointer to the start of guest memory. */ 79/* The pointer to the start of guest memory. */
86static void *guest_base; 80static void *guest_base;
87/* The maximum guest physical address allowed, and maximum possible. */ 81/* The maximum guest physical address allowed, and maximum possible. */
88static unsigned long guest_limit, guest_max; 82static unsigned long guest_limit, guest_max;
89/* The pipe for signal hander to write to. */ 83/* The /dev/lguest file descriptor. */
90static int timeoutpipe[2]; 84static int lguest_fd;
91static unsigned int timeout_usec = 500;
92 85
93/* a per-cpu variable indicating whose vcpu is currently running */ 86/* a per-cpu variable indicating whose vcpu is currently running */
94static unsigned int __thread cpu_id; 87static unsigned int __thread cpu_id;
@@ -96,11 +89,6 @@ static unsigned int __thread cpu_id;
96/* This is our list of devices. */ 89/* This is our list of devices. */
97struct device_list 90struct device_list
98{ 91{
99 /* Summary information about the devices in our list: ready to pass to
100 * select() to ask which need servicing.*/
101 fd_set infds;
102 int max_infd;
103
104 /* Counter to assign interrupt numbers. */ 92 /* Counter to assign interrupt numbers. */
105 unsigned int next_irq; 93 unsigned int next_irq;
106 94
@@ -126,22 +114,21 @@ struct device
126 /* The linked-list pointer. */ 114 /* The linked-list pointer. */
127 struct device *next; 115 struct device *next;
128 116
129 /* The this device's descriptor, as mapped into the Guest. */ 117 /* The device's descriptor, as mapped into the Guest. */
130 struct lguest_device_desc *desc; 118 struct lguest_device_desc *desc;
131 119
120 /* We can't trust desc values once Guest has booted: we use these. */
121 unsigned int feature_len;
122 unsigned int num_vq;
123
132 /* The name of this device, for --verbose. */ 124 /* The name of this device, for --verbose. */
133 const char *name; 125 const char *name;
134 126
135 /* If handle_input is set, it wants to be called when this file
136 * descriptor is ready. */
137 int fd;
138 bool (*handle_input)(int fd, struct device *me);
139
140 /* Any queues attached to this device */ 127 /* Any queues attached to this device */
141 struct virtqueue *vq; 128 struct virtqueue *vq;
142 129
143 /* Handle status being finalized (ie. feature bits stable). */ 130 /* Is it operational */
144 void (*ready)(struct device *me); 131 bool running;
145 132
146 /* Device-specific data. */ 133 /* Device-specific data. */
147 void *priv; 134 void *priv;
@@ -164,22 +151,28 @@ struct virtqueue
164 /* Last available index we saw. */ 151 /* Last available index we saw. */
165 u16 last_avail_idx; 152 u16 last_avail_idx;
166 153
167 /* The routine to call when the Guest pings us, or timeout. */ 154 /* How many are used since we sent last irq? */
168 void (*handle_output)(int fd, struct virtqueue *me, bool timeout); 155 unsigned int pending_used;
169 156
170 /* Outstanding buffers */ 157 /* Eventfd where Guest notifications arrive. */
171 unsigned int inflight; 158 int eventfd;
172 159
173 /* Is this blocked awaiting a timer? */ 160 /* Function for the thread which is servicing this virtqueue. */
174 bool blocked; 161 void (*service)(struct virtqueue *vq);
162 pid_t thread;
175}; 163};
176 164
177/* Remember the arguments to the program so we can "reboot" */ 165/* Remember the arguments to the program so we can "reboot" */
178static char **main_args; 166static char **main_args;
179 167
180/* Since guest is UP and we don't run at the same time, we don't need barriers. 168/* The original tty settings to restore on exit. */
181 * But I include them in the code in case others copy it. */ 169static struct termios orig_term;
182#define wmb() 170
171/* We have to be careful with barriers: our devices are all run in separate
172 * threads and so we need to make sure that changes visible to the Guest happen
173 * in precise order. */
174#define wmb() __asm__ __volatile__("" : : : "memory")
175#define mb() __asm__ __volatile__("" : : : "memory")
183 176
184/* Convert an iovec element to the given type. 177/* Convert an iovec element to the given type.
185 * 178 *
@@ -245,7 +238,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
245static u8 *get_feature_bits(struct device *dev) 238static u8 *get_feature_bits(struct device *dev)
246{ 239{
247 return (u8 *)(dev->desc + 1) 240 return (u8 *)(dev->desc + 1)
248 + dev->desc->num_vq * sizeof(struct lguest_vqconfig); 241 + dev->num_vq * sizeof(struct lguest_vqconfig);
249} 242}
250 243
251/*L:100 The Launcher code itself takes us out into userspace, that scary place 244/*L:100 The Launcher code itself takes us out into userspace, that scary place
@@ -505,99 +498,19 @@ static void concat(char *dst, char *args[])
505 * saw the arguments it expects when we looked at initialize() in lguest_user.c: 498 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
506 * the base of Guest "physical" memory, the top physical page to allow and the 499 * the base of Guest "physical" memory, the top physical page to allow and the
507 * entry point for the Guest. */ 500 * entry point for the Guest. */
508static int tell_kernel(unsigned long start) 501static void tell_kernel(unsigned long start)
509{ 502{
510 unsigned long args[] = { LHREQ_INITIALIZE, 503 unsigned long args[] = { LHREQ_INITIALIZE,
511 (unsigned long)guest_base, 504 (unsigned long)guest_base,
512 guest_limit / getpagesize(), start }; 505 guest_limit / getpagesize(), start };
513 int fd;
514
515 verbose("Guest: %p - %p (%#lx)\n", 506 verbose("Guest: %p - %p (%#lx)\n",
516 guest_base, guest_base + guest_limit, guest_limit); 507 guest_base, guest_base + guest_limit, guest_limit);
517 fd = open_or_die("/dev/lguest", O_RDWR); 508 lguest_fd = open_or_die("/dev/lguest", O_RDWR);
518 if (write(fd, args, sizeof(args)) < 0) 509 if (write(lguest_fd, args, sizeof(args)) < 0)
519 err(1, "Writing to /dev/lguest"); 510 err(1, "Writing to /dev/lguest");
520
521 /* We return the /dev/lguest file descriptor to control this Guest */
522 return fd;
523} 511}
524/*:*/ 512/*:*/
525 513
526static void add_device_fd(int fd)
527{
528 FD_SET(fd, &devices.infds);
529 if (fd > devices.max_infd)
530 devices.max_infd = fd;
531}
532
533/*L:200
534 * The Waker.
535 *
536 * With console, block and network devices, we can have lots of input which we
537 * need to process. We could try to tell the kernel what file descriptors to
538 * watch, but handing a file descriptor mask through to the kernel is fairly
539 * icky.
540 *
541 * Instead, we clone off a thread which watches the file descriptors and writes
542 * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
543 * stop running the Guest. This causes the Launcher to return from the
544 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
545 * the LHREQ_BREAK and wake us up again.
546 *
547 * This, of course, is merely a different *kind* of icky.
548 *
549 * Given my well-known antipathy to threads, I'd prefer to use processes. But
550 * it's easier to share Guest memory with threads, and trivial to share the
551 * devices.infds as the Launcher changes it.
552 */
553static int waker(void *unused)
554{
555 /* Close the write end of the pipe: only the Launcher has it open. */
556 close(waker_fds.pipe[1]);
557
558 for (;;) {
559 fd_set rfds = devices.infds;
560 unsigned long args[] = { LHREQ_BREAK, 1 };
561 unsigned int maxfd = devices.max_infd;
562
563 /* We also listen to the pipe from the Launcher. */
564 FD_SET(waker_fds.pipe[0], &rfds);
565 if (waker_fds.pipe[0] > maxfd)
566 maxfd = waker_fds.pipe[0];
567
568 /* Wait until input is ready from one of the devices. */
569 select(maxfd+1, &rfds, NULL, NULL, NULL);
570
571 /* Message from Launcher? */
572 if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
573 char c;
574 /* If this fails, then assume Launcher has exited.
575 * Don't do anything on exit: we're just a thread! */
576 if (read(waker_fds.pipe[0], &c, 1) != 1)
577 _exit(0);
578 continue;
579 }
580
581 /* Send LHREQ_BREAK command to snap the Launcher out of it. */
582 pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id);
583 }
584 return 0;
585}
586
587/* This routine just sets up a pipe to the Waker process. */
588static void setup_waker(int lguest_fd)
589{
590 /* This pipe is closed when Launcher dies, telling Waker. */
591 if (pipe(waker_fds.pipe) != 0)
592 err(1, "Creating pipe for Waker");
593
594 /* Waker also needs to know the lguest fd */
595 waker_fds.lguest_fd = lguest_fd;
596
597 if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
598 err(1, "Creating Waker");
599}
600
601/* 514/*
602 * Device Handling. 515 * Device Handling.
603 * 516 *
@@ -623,49 +536,90 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
623/* Each buffer in the virtqueues is actually a chain of descriptors. This 536/* Each buffer in the virtqueues is actually a chain of descriptors. This
624 * function returns the next descriptor in the chain, or vq->vring.num if we're 537 * function returns the next descriptor in the chain, or vq->vring.num if we're
625 * at the end. */ 538 * at the end. */
626static unsigned next_desc(struct virtqueue *vq, unsigned int i) 539static unsigned next_desc(struct vring_desc *desc,
540 unsigned int i, unsigned int max)
627{ 541{
628 unsigned int next; 542 unsigned int next;
629 543
630 /* If this descriptor says it doesn't chain, we're done. */ 544 /* If this descriptor says it doesn't chain, we're done. */
631 if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) 545 if (!(desc[i].flags & VRING_DESC_F_NEXT))
632 return vq->vring.num; 546 return max;
633 547
634 /* Check they're not leading us off end of descriptors. */ 548 /* Check they're not leading us off end of descriptors. */
635 next = vq->vring.desc[i].next; 549 next = desc[i].next;
636 /* Make sure compiler knows to grab that: we don't want it changing! */ 550 /* Make sure compiler knows to grab that: we don't want it changing! */
637 wmb(); 551 wmb();
638 552
639 if (next >= vq->vring.num) 553 if (next >= max)
640 errx(1, "Desc next is %u", next); 554 errx(1, "Desc next is %u", next);
641 555
642 return next; 556 return next;
643} 557}
644 558
559/* This actually sends the interrupt for this virtqueue */
560static void trigger_irq(struct virtqueue *vq)
561{
562 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
563
564 /* Don't inform them if nothing used. */
565 if (!vq->pending_used)
566 return;
567 vq->pending_used = 0;
568
569 /* If they don't want an interrupt, don't send one, unless empty. */
570 if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
571 && lg_last_avail(vq) != vq->vring.avail->idx)
572 return;
573
574 /* Send the Guest an interrupt tell them we used something up. */
575 if (write(lguest_fd, buf, sizeof(buf)) != 0)
576 err(1, "Triggering irq %i", vq->config.irq);
577}
578
645/* This looks in the virtqueue and for the first available buffer, and converts 579/* This looks in the virtqueue and for the first available buffer, and converts
646 * it to an iovec for convenient access. Since descriptors consist of some 580 * it to an iovec for convenient access. Since descriptors consist of some
647 * number of output then some number of input descriptors, it's actually two 581 * number of output then some number of input descriptors, it's actually two
648 * iovecs, but we pack them into one and note how many of each there were. 582 * iovecs, but we pack them into one and note how many of each there were.
649 * 583 *
650 * This function returns the descriptor number found, or vq->vring.num (which 584 * This function returns the descriptor number found. */
651 * is never a valid descriptor number) if none was found. */ 585static unsigned wait_for_vq_desc(struct virtqueue *vq,
652static unsigned get_vq_desc(struct virtqueue *vq, 586 struct iovec iov[],
653 struct iovec iov[], 587 unsigned int *out_num, unsigned int *in_num)
654 unsigned int *out_num, unsigned int *in_num)
655{ 588{
656 unsigned int i, head; 589 unsigned int i, head, max;
657 u16 last_avail; 590 struct vring_desc *desc;
591 u16 last_avail = lg_last_avail(vq);
592
593 while (last_avail == vq->vring.avail->idx) {
594 u64 event;
595
596 /* OK, tell Guest about progress up to now. */
597 trigger_irq(vq);
598
599 /* OK, now we need to know about added descriptors. */
600 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
601
602 /* They could have slipped one in as we were doing that: make
603 * sure it's written, then check again. */
604 mb();
605 if (last_avail != vq->vring.avail->idx) {
606 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
607 break;
608 }
609
610 /* Nothing new? Wait for eventfd to tell us they refilled. */
611 if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
612 errx(1, "Event read failed?");
613
614 /* We don't need to be notified again. */
615 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
616 }
658 617
659 /* Check it isn't doing very strange things with descriptor numbers. */ 618 /* Check it isn't doing very strange things with descriptor numbers. */
660 last_avail = lg_last_avail(vq);
661 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 619 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
662 errx(1, "Guest moved used index from %u to %u", 620 errx(1, "Guest moved used index from %u to %u",
663 last_avail, vq->vring.avail->idx); 621 last_avail, vq->vring.avail->idx);
664 622
665 /* If there's nothing new since last we looked, return invalid. */
666 if (vq->vring.avail->idx == last_avail)
667 return vq->vring.num;
668
669 /* Grab the next descriptor number they're advertising, and increment 623 /* Grab the next descriptor number they're advertising, and increment
670 * the index we've seen. */ 624 * the index we've seen. */
671 head = vq->vring.avail->ring[last_avail % vq->vring.num]; 625 head = vq->vring.avail->ring[last_avail % vq->vring.num];
@@ -678,15 +632,28 @@ static unsigned get_vq_desc(struct virtqueue *vq,
678 /* When we start there are none of either input nor output. */ 632 /* When we start there are none of either input nor output. */
679 *out_num = *in_num = 0; 633 *out_num = *in_num = 0;
680 634
635 max = vq->vring.num;
636 desc = vq->vring.desc;
681 i = head; 637 i = head;
638
639 /* If this is an indirect entry, then this buffer contains a descriptor
640 * table which we handle as if it's any normal descriptor chain. */
641 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
642 if (desc[i].len % sizeof(struct vring_desc))
643 errx(1, "Invalid size for indirect buffer table");
644
645 max = desc[i].len / sizeof(struct vring_desc);
646 desc = check_pointer(desc[i].addr, desc[i].len);
647 i = 0;
648 }
649
682 do { 650 do {
683 /* Grab the first descriptor, and check it's OK. */ 651 /* Grab the first descriptor, and check it's OK. */
684 iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; 652 iov[*out_num + *in_num].iov_len = desc[i].len;
685 iov[*out_num + *in_num].iov_base 653 iov[*out_num + *in_num].iov_base
686 = check_pointer(vq->vring.desc[i].addr, 654 = check_pointer(desc[i].addr, desc[i].len);
687 vq->vring.desc[i].len);
688 /* If this is an input descriptor, increment that count. */ 655 /* If this is an input descriptor, increment that count. */
689 if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) 656 if (desc[i].flags & VRING_DESC_F_WRITE)
690 (*in_num)++; 657 (*in_num)++;
691 else { 658 else {
692 /* If it's an output descriptor, they're all supposed 659 /* If it's an output descriptor, they're all supposed
@@ -697,11 +664,10 @@ static unsigned get_vq_desc(struct virtqueue *vq,
697 } 664 }
698 665
699 /* If we've got too many, that implies a descriptor loop. */ 666 /* If we've got too many, that implies a descriptor loop. */
700 if (*out_num + *in_num > vq->vring.num) 667 if (*out_num + *in_num > max)
701 errx(1, "Looped descriptor"); 668 errx(1, "Looped descriptor");
702 } while ((i = next_desc(vq, i)) != vq->vring.num); 669 } while ((i = next_desc(desc, i, max)) != max);
703 670
704 vq->inflight++;
705 return head; 671 return head;
706} 672}
707 673
@@ -719,44 +685,20 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len)
719 /* Make sure buffer is written before we update index. */ 685 /* Make sure buffer is written before we update index. */
720 wmb(); 686 wmb();
721 vq->vring.used->idx++; 687 vq->vring.used->idx++;
722 vq->inflight--; 688 vq->pending_used++;
723}
724
725/* This actually sends the interrupt for this virtqueue */
726static void trigger_irq(int fd, struct virtqueue *vq)
727{
728 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
729
730 /* If they don't want an interrupt, don't send one, unless empty. */
731 if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
732 && vq->inflight)
733 return;
734
735 /* Send the Guest an interrupt tell them we used something up. */
736 if (write(fd, buf, sizeof(buf)) != 0)
737 err(1, "Triggering irq %i", vq->config.irq);
738} 689}
739 690
740/* And here's the combo meal deal. Supersize me! */ 691/* And here's the combo meal deal. Supersize me! */
741static void add_used_and_trigger(int fd, struct virtqueue *vq, 692static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
742 unsigned int head, int len)
743{ 693{
744 add_used(vq, head, len); 694 add_used(vq, head, len);
745 trigger_irq(fd, vq); 695 trigger_irq(vq);
746} 696}
747 697
748/* 698/*
749 * The Console 699 * The Console
750 * 700 *
751 * Here is the input terminal setting we save, and the routine to restore them 701 * We associate some data with the console for our exit hack. */
752 * on exit so the user gets their terminal back. */
753static struct termios orig_term;
754static void restore_term(void)
755{
756 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
757}
758
759/* We associate some data with the console for our exit hack. */
760struct console_abort 702struct console_abort
761{ 703{
762 /* How many times have they hit ^C? */ 704 /* How many times have they hit ^C? */
@@ -766,276 +708,275 @@ struct console_abort
766}; 708};
767 709
768/* This is the routine which handles console input (ie. stdin). */ 710/* This is the routine which handles console input (ie. stdin). */
769static bool handle_console_input(int fd, struct device *dev) 711static void console_input(struct virtqueue *vq)
770{ 712{
771 int len; 713 int len;
772 unsigned int head, in_num, out_num; 714 unsigned int head, in_num, out_num;
773 struct iovec iov[dev->vq->vring.num]; 715 struct console_abort *abort = vq->dev->priv;
774 struct console_abort *abort = dev->priv; 716 struct iovec iov[vq->vring.num];
775
776 /* First we need a console buffer from the Guests's input virtqueue. */
777 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
778
779 /* If they're not ready for input, stop listening to this file
780 * descriptor. We'll start again once they add an input buffer. */
781 if (head == dev->vq->vring.num)
782 return false;
783 717
718 /* Make sure there's a descriptor waiting. */
719 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
784 if (out_num) 720 if (out_num)
785 errx(1, "Output buffers in console in queue?"); 721 errx(1, "Output buffers in console in queue?");
786 722
787 /* This is why we convert to iovecs: the readv() call uses them, and so 723 /* Read it in. */
788 * it reads straight into the Guest's buffer. */ 724 len = readv(STDIN_FILENO, iov, in_num);
789 len = readv(dev->fd, iov, in_num);
790 if (len <= 0) { 725 if (len <= 0) {
791 /* This implies that the console is closed, is /dev/null, or 726 /* Ran out of input? */
792 * something went terribly wrong. */
793 warnx("Failed to get console input, ignoring console."); 727 warnx("Failed to get console input, ignoring console.");
794 /* Put the input terminal back. */ 728 /* For simplicity, dying threads kill the whole Launcher. So
795 restore_term(); 729 * just nap here. */
796 /* Remove callback from input vq, so it doesn't restart us. */ 730 for (;;)
797 dev->vq->handle_output = NULL; 731 pause();
798 /* Stop listening to this fd: don't call us again. */
799 return false;
800 } 732 }
801 733
802 /* Tell the Guest about the new input. */ 734 add_used_and_trigger(vq, head, len);
803 add_used_and_trigger(fd, dev->vq, head, len);
804 735
805 /* Three ^C within one second? Exit. 736 /* Three ^C within one second? Exit.
806 * 737 *
807 * This is such a hack, but works surprisingly well. Each ^C has to be 738 * This is such a hack, but works surprisingly well. Each ^C has to
808 * in a buffer by itself, so they can't be too fast. But we check that 739 * be in a buffer by itself, so they can't be too fast. But we check
809 * we get three within about a second, so they can't be too slow. */ 740 * that we get three within about a second, so they can't be too
810 if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { 741 * slow. */
811 if (!abort->count++) 742 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
812 gettimeofday(&abort->start, NULL);
813 else if (abort->count == 3) {
814 struct timeval now;
815 gettimeofday(&now, NULL);
816 if (now.tv_sec <= abort->start.tv_sec+1) {
817 unsigned long args[] = { LHREQ_BREAK, 0 };
818 /* Close the fd so Waker will know it has to
819 * exit. */
820 close(waker_fds.pipe[1]);
821 /* Just in case Waker is blocked in BREAK, send
822 * unbreak now. */
823 write(fd, args, sizeof(args));
824 exit(2);
825 }
826 abort->count = 0;
827 }
828 } else
829 /* Any other key resets the abort counter. */
830 abort->count = 0; 743 abort->count = 0;
744 return;
745 }
831 746
832 /* Everything went OK! */ 747 abort->count++;
833 return true; 748 if (abort->count == 1)
749 gettimeofday(&abort->start, NULL);
750 else if (abort->count == 3) {
751 struct timeval now;
752 gettimeofday(&now, NULL);
753 /* Kill all Launcher processes with SIGINT, like normal ^C */
754 if (now.tv_sec <= abort->start.tv_sec+1)
755 kill(0, SIGINT);
756 abort->count = 0;
757 }
834} 758}
835 759
836/* Handling output for console is simple: we just get all the output buffers 760/* This is the routine which handles console output (ie. stdout). */
837 * and write them to stdout. */ 761static void console_output(struct virtqueue *vq)
838static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
839{ 762{
840 unsigned int head, out, in; 763 unsigned int head, out, in;
841 int len;
842 struct iovec iov[vq->vring.num]; 764 struct iovec iov[vq->vring.num];
843 765
844 /* Keep getting output buffers from the Guest until we run out. */ 766 head = wait_for_vq_desc(vq, iov, &out, &in);
845 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 767 if (in)
846 if (in) 768 errx(1, "Input buffers in console output queue?");
847 errx(1, "Input buffers in output queue?"); 769 while (!iov_empty(iov, out)) {
848 len = writev(STDOUT_FILENO, iov, out); 770 int len = writev(STDOUT_FILENO, iov, out);
849 add_used_and_trigger(fd, vq, head, len); 771 if (len <= 0)
772 err(1, "Write to stdout gave %i", len);
773 iov_consume(iov, out, len);
850 } 774 }
851} 775 add_used(vq, head, 0);
852
853/* This is called when we no longer want to hear about Guest changes to a
854 * virtqueue. This is more efficient in high-traffic cases, but it means we
855 * have to set a timer to check if any more changes have occurred. */
856static void block_vq(struct virtqueue *vq)
857{
858 struct itimerval itm;
859
860 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
861 vq->blocked = true;
862
863 itm.it_interval.tv_sec = 0;
864 itm.it_interval.tv_usec = 0;
865 itm.it_value.tv_sec = 0;
866 itm.it_value.tv_usec = timeout_usec;
867
868 setitimer(ITIMER_REAL, &itm, NULL);
869} 776}
870 777
871/* 778/*
872 * The Network 779 * The Network
873 * 780 *
874 * Handling output for network is also simple: we get all the output buffers 781 * Handling output for network is also simple: we get all the output buffers
875 * and write them (ignoring the first element) to this device's file descriptor 782 * and write them to /dev/net/tun.
876 * (/dev/net/tun).
877 */ 783 */
878static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) 784struct net_info {
785 int tunfd;
786};
787
788static void net_output(struct virtqueue *vq)
879{ 789{
880 unsigned int head, out, in, num = 0; 790 struct net_info *net_info = vq->dev->priv;
881 int len; 791 unsigned int head, out, in;
882 struct iovec iov[vq->vring.num]; 792 struct iovec iov[vq->vring.num];
883 static int last_timeout_num;
884
885 /* Keep getting output buffers from the Guest until we run out. */
886 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
887 if (in)
888 errx(1, "Input buffers in output queue?");
889 len = writev(vq->dev->fd, iov, out);
890 if (len < 0)
891 err(1, "Writing network packet to tun");
892 add_used_and_trigger(fd, vq, head, len);
893 num++;
894 }
895 793
896 /* Block further kicks and set up a timer if we saw anything. */ 794 head = wait_for_vq_desc(vq, iov, &out, &in);
897 if (!timeout && num) 795 if (in)
898 block_vq(vq); 796 errx(1, "Input buffers in net output queue?");
899 797 if (writev(net_info->tunfd, iov, out) < 0)
900 /* We never quite know how long should we wait before we check the 798 errx(1, "Write to tun failed?");
901 * queue again for more packets. We start at 500 microseconds, and if 799 add_used(vq, head, 0);
902 * we get fewer packets than last time, we assume we made the timeout 800}
903 * too small and increase it by 10 microseconds. Otherwise, we drop it 801
904 * by one microsecond every time. It seems to work well enough. */ 802/* Will reading from this file descriptor block? */
905 if (timeout) { 803static bool will_block(int fd)
906 if (num < last_timeout_num) 804{
907 timeout_usec += 10; 805 fd_set fdset;
908 else if (timeout_usec > 1) 806 struct timeval zero = { 0, 0 };
909 timeout_usec--; 807 FD_ZERO(&fdset);
910 last_timeout_num = num; 808 FD_SET(fd, &fdset);
911 } 809 return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
912} 810}
913 811
914/* This is where we handle a packet coming in from the tun device to our 812/* This is where we handle packets coming in from the tun device to our
915 * Guest. */ 813 * Guest. */
916static bool handle_tun_input(int fd, struct device *dev) 814static void net_input(struct virtqueue *vq)
917{ 815{
918 unsigned int head, in_num, out_num;
919 int len; 816 int len;
920 struct iovec iov[dev->vq->vring.num]; 817 unsigned int head, out, in;
921 818 struct iovec iov[vq->vring.num];
922 /* First we need a network buffer from the Guests's recv virtqueue. */ 819 struct net_info *net_info = vq->dev->priv;
923 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
924 if (head == dev->vq->vring.num) {
925 /* Now, it's expected that if we try to send a packet too
926 * early, the Guest won't be ready yet. Wait until the device
927 * status says it's ready. */
928 /* FIXME: Actually want DRIVER_ACTIVE here. */
929
930 /* Now tell it we want to know if new things appear. */
931 dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
932 wmb();
933
934 /* We'll turn this back on if input buffers are registered. */
935 return false;
936 } else if (out_num)
937 errx(1, "Output buffers in network recv queue?");
938
939 /* Read the packet from the device directly into the Guest's buffer. */
940 len = readv(dev->fd, iov, in_num);
941 if (len <= 0)
942 err(1, "reading network");
943 820
944 /* Tell the Guest about the new packet. */ 821 head = wait_for_vq_desc(vq, iov, &out, &in);
945 add_used_and_trigger(fd, dev->vq, head, len); 822 if (out)
823 errx(1, "Output buffers in net input queue?");
946 824
947 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 825 /* Deliver interrupt now, since we're about to sleep. */
948 ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], 826 if (vq->pending_used && will_block(net_info->tunfd))
949 head != dev->vq->vring.num ? "sent" : "discarded"); 827 trigger_irq(vq);
950 828
951 /* All good. */ 829 len = readv(net_info->tunfd, iov, in);
952 return true; 830 if (len <= 0)
831 err(1, "Failed to read from tun.");
832 add_used(vq, head, len);
953} 833}
954 834
955/*L:215 This is the callback attached to the network and console input 835/* This is the helper to create threads. */
956 * virtqueues: it ensures we try again, in case we stopped console or net 836static int do_thread(void *_vq)
957 * delivery because Guest didn't have any buffers. */
958static void enable_fd(int fd, struct virtqueue *vq, bool timeout)
959{ 837{
960 add_device_fd(vq->dev->fd); 838 struct virtqueue *vq = _vq;
961 /* Snap the Waker out of its select loop. */ 839
962 write(waker_fds.pipe[1], "", 1); 840 for (;;)
841 vq->service(vq);
842 return 0;
963} 843}
964 844
965static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) 845/* When a child dies, we kill our entire process group with SIGTERM. This
846 * also has the side effect that the shell restores the console for us! */
847static void kill_launcher(int signal)
966{ 848{
967 /* We don't need to know again when Guest refills receive buffer. */ 849 kill(0, SIGTERM);
968 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
969 enable_fd(fd, vq, timeout);
970} 850}
971 851
972/* When the Guest tells us they updated the status field, we handle it. */ 852static void reset_device(struct device *dev)
973static void update_device_status(struct device *dev)
974{ 853{
975 struct virtqueue *vq; 854 struct virtqueue *vq;
976 855
977 /* This is a reset. */ 856 verbose("Resetting device %s\n", dev->name);
978 if (dev->desc->status == 0) {
979 verbose("Resetting device %s\n", dev->name);
980 857
981 /* Clear any features they've acked. */ 858 /* Clear any features they've acked. */
982 memset(get_feature_bits(dev) + dev->desc->feature_len, 0, 859 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
983 dev->desc->feature_len);
984 860
985 /* Zero out the virtqueues. */ 861 /* We're going to be explicitly killing threads, so ignore them. */
986 for (vq = dev->vq; vq; vq = vq->next) { 862 signal(SIGCHLD, SIG_IGN);
987 memset(vq->vring.desc, 0, 863
988 vring_size(vq->config.num, LGUEST_VRING_ALIGN)); 864 /* Zero out the virtqueues, get rid of their threads */
989 lg_last_avail(vq) = 0; 865 for (vq = dev->vq; vq; vq = vq->next) {
866 if (vq->thread != (pid_t)-1) {
867 kill(vq->thread, SIGTERM);
868 waitpid(vq->thread, NULL, 0);
869 vq->thread = (pid_t)-1;
990 } 870 }
991 } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { 871 memset(vq->vring.desc, 0,
872 vring_size(vq->config.num, LGUEST_VRING_ALIGN));
873 lg_last_avail(vq) = 0;
874 }
875 dev->running = false;
876
877 /* Now we care if threads die. */
878 signal(SIGCHLD, (void *)kill_launcher);
879}
880
881static void create_thread(struct virtqueue *vq)
882{
883 /* Create stack for thread and run it. Since stack grows
884 * upwards, we point the stack pointer to the end of this
885 * region. */
886 char *stack = malloc(32768);
887 unsigned long args[] = { LHREQ_EVENTFD,
888 vq->config.pfn*getpagesize(), 0 };
889
890 /* Create a zero-initialized eventfd. */
891 vq->eventfd = eventfd(0, 0);
892 if (vq->eventfd < 0)
893 err(1, "Creating eventfd");
894 args[2] = vq->eventfd;
895
896 /* Attach an eventfd to this virtqueue: it will go off
897 * when the Guest does an LHCALL_NOTIFY for this vq. */
898 if (write(lguest_fd, &args, sizeof(args)) != 0)
899 err(1, "Attaching eventfd");
900
901 /* CLONE_VM: because it has to access the Guest memory, and
902 * SIGCHLD so we get a signal if it dies. */
903 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
904 if (vq->thread == (pid_t)-1)
905 err(1, "Creating clone");
906 /* We close our local copy, now the child has it. */
907 close(vq->eventfd);
908}
909
910static void start_device(struct device *dev)
911{
912 unsigned int i;
913 struct virtqueue *vq;
914
915 verbose("Device %s OK: offered", dev->name);
916 for (i = 0; i < dev->feature_len; i++)
917 verbose(" %02x", get_feature_bits(dev)[i]);
918 verbose(", accepted");
919 for (i = 0; i < dev->feature_len; i++)
920 verbose(" %02x", get_feature_bits(dev)
921 [dev->feature_len+i]);
922
923 for (vq = dev->vq; vq; vq = vq->next) {
924 if (vq->service)
925 create_thread(vq);
926 }
927 dev->running = true;
928}
929
930static void cleanup_devices(void)
931{
932 struct device *dev;
933
934 for (dev = devices.dev; dev; dev = dev->next)
935 reset_device(dev);
936
937 /* If we saved off the original terminal settings, restore them now. */
938 if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
939 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
940}
941
942/* When the Guest tells us they updated the status field, we handle it. */
943static void update_device_status(struct device *dev)
944{
945 /* A zero status is a reset, otherwise it's a set of flags. */
946 if (dev->desc->status == 0)
947 reset_device(dev);
948 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
992 warnx("Device %s configuration FAILED", dev->name); 949 warnx("Device %s configuration FAILED", dev->name);
950 if (dev->running)
951 reset_device(dev);
993 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { 952 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
994 unsigned int i; 953 if (!dev->running)
995 954 start_device(dev);
996 verbose("Device %s OK: offered", dev->name);
997 for (i = 0; i < dev->desc->feature_len; i++)
998 verbose(" %02x", get_feature_bits(dev)[i]);
999 verbose(", accepted");
1000 for (i = 0; i < dev->desc->feature_len; i++)
1001 verbose(" %02x", get_feature_bits(dev)
1002 [dev->desc->feature_len+i]);
1003
1004 if (dev->ready)
1005 dev->ready(dev);
1006 } 955 }
1007} 956}
1008 957
1009/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 958/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
1010static void handle_output(int fd, unsigned long addr) 959static void handle_output(unsigned long addr)
1011{ 960{
1012 struct device *i; 961 struct device *i;
1013 struct virtqueue *vq;
1014 962
1015 /* Check each device and virtqueue. */ 963 /* Check each device. */
1016 for (i = devices.dev; i; i = i->next) { 964 for (i = devices.dev; i; i = i->next) {
965 struct virtqueue *vq;
966
1017 /* Notifications to device descriptors update device status. */ 967 /* Notifications to device descriptors update device status. */
1018 if (from_guest_phys(addr) == i->desc) { 968 if (from_guest_phys(addr) == i->desc) {
1019 update_device_status(i); 969 update_device_status(i);
1020 return; 970 return;
1021 } 971 }
1022 972
1023 /* Notifications to virtqueues mean output has occurred. */ 973 /* Devices *can* be used before status is set to DRIVER_OK. */
1024 for (vq = i->vq; vq; vq = vq->next) { 974 for (vq = i->vq; vq; vq = vq->next) {
1025 if (vq->config.pfn != addr/getpagesize()) 975 if (addr != vq->config.pfn*getpagesize())
1026 continue; 976 continue;
1027 977 if (i->running)
1028 /* Guest should acknowledge (and set features!) before 978 errx(1, "Notification on running %s", i->name);
1029 * using the device. */ 979 start_device(i);
1030 if (i->desc->status == 0) {
1031 warnx("%s gave early output", i->name);
1032 return;
1033 }
1034
1035 if (strcmp(vq->dev->name, "console") != 0)
1036 verbose("Output to %s\n", vq->dev->name);
1037 if (vq->handle_output)
1038 vq->handle_output(fd, vq, false);
1039 return; 980 return;
1040 } 981 }
1041 } 982 }
@@ -1049,71 +990,6 @@ static void handle_output(int fd, unsigned long addr)
1049 strnlen(from_guest_phys(addr), guest_limit - addr)); 990 strnlen(from_guest_phys(addr), guest_limit - addr));
1050} 991}
1051 992
1052static void handle_timeout(int fd)
1053{
1054 char buf[32];
1055 struct device *i;
1056 struct virtqueue *vq;
1057
1058 /* Clear the pipe */
1059 read(timeoutpipe[0], buf, sizeof(buf));
1060
1061 /* Check each device and virtqueue: flush blocked ones. */
1062 for (i = devices.dev; i; i = i->next) {
1063 for (vq = i->vq; vq; vq = vq->next) {
1064 if (!vq->blocked)
1065 continue;
1066
1067 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
1068 vq->blocked = false;
1069 if (vq->handle_output)
1070 vq->handle_output(fd, vq, true);
1071 }
1072 }
1073}
1074
1075/* This is called when the Waker wakes us up: check for incoming file
1076 * descriptors. */
1077static void handle_input(int fd)
1078{
1079 /* select() wants a zeroed timeval to mean "don't wait". */
1080 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
1081
1082 for (;;) {
1083 struct device *i;
1084 fd_set fds = devices.infds;
1085 int num;
1086
1087 num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
1088 /* Could get interrupted */
1089 if (num < 0)
1090 continue;
1091 /* If nothing is ready, we're done. */
1092 if (num == 0)
1093 break;
1094
1095 /* Otherwise, call the device(s) which have readable file
1096 * descriptors and a method of handling them. */
1097 for (i = devices.dev; i; i = i->next) {
1098 if (i->handle_input && FD_ISSET(i->fd, &fds)) {
1099 if (i->handle_input(fd, i))
1100 continue;
1101
1102 /* If handle_input() returns false, it means we
1103 * should no longer service it. Networking and
1104 * console do this when there's no input
1105 * buffers to deliver into. Console also uses
1106 * it when it discovers that stdin is closed. */
1107 FD_CLR(i->fd, &devices.infds);
1108 }
1109 }
1110
1111 /* Is this the timeout fd? */
1112 if (FD_ISSET(timeoutpipe[0], &fds))
1113 handle_timeout(fd);
1114 }
1115}
1116
1117/*L:190 993/*L:190
1118 * Device Setup 994 * Device Setup
1119 * 995 *
@@ -1129,8 +1005,8 @@ static void handle_input(int fd)
1129static u8 *device_config(const struct device *dev) 1005static u8 *device_config(const struct device *dev)
1130{ 1006{
1131 return (void *)(dev->desc + 1) 1007 return (void *)(dev->desc + 1)
1132 + dev->desc->num_vq * sizeof(struct lguest_vqconfig) 1008 + dev->num_vq * sizeof(struct lguest_vqconfig)
1133 + dev->desc->feature_len * 2; 1009 + dev->feature_len * 2;
1134} 1010}
1135 1011
1136/* This routine allocates a new "struct lguest_device_desc" from descriptor 1012/* This routine allocates a new "struct lguest_device_desc" from descriptor
@@ -1159,7 +1035,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
1159/* Each device descriptor is followed by the description of its virtqueues. We 1035/* Each device descriptor is followed by the description of its virtqueues. We
1160 * specify how many descriptors the virtqueue is to have. */ 1036 * specify how many descriptors the virtqueue is to have. */
1161static void add_virtqueue(struct device *dev, unsigned int num_descs, 1037static void add_virtqueue(struct device *dev, unsigned int num_descs,
1162 void (*handle_output)(int, struct virtqueue *, bool)) 1038 void (*service)(struct virtqueue *))
1163{ 1039{
1164 unsigned int pages; 1040 unsigned int pages;
1165 struct virtqueue **i, *vq = malloc(sizeof(*vq)); 1041 struct virtqueue **i, *vq = malloc(sizeof(*vq));
@@ -1174,8 +1050,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1174 vq->next = NULL; 1050 vq->next = NULL;
1175 vq->last_avail_idx = 0; 1051 vq->last_avail_idx = 0;
1176 vq->dev = dev; 1052 vq->dev = dev;
1177 vq->inflight = 0; 1053 vq->service = service;
1178 vq->blocked = false; 1054 vq->thread = (pid_t)-1;
1179 1055
1180 /* Initialize the configuration. */ 1056 /* Initialize the configuration. */
1181 vq->config.num = num_descs; 1057 vq->config.num = num_descs;
@@ -1191,6 +1067,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1191 * yet, otherwise we'd be overwriting them. */ 1067 * yet, otherwise we'd be overwriting them. */
1192 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); 1068 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1193 memcpy(device_config(dev), &vq->config, sizeof(vq->config)); 1069 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1070 dev->num_vq++;
1194 dev->desc->num_vq++; 1071 dev->desc->num_vq++;
1195 1072
1196 verbose("Virtqueue page %#lx\n", to_guest_phys(p)); 1073 verbose("Virtqueue page %#lx\n", to_guest_phys(p));
@@ -1199,15 +1076,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1199 * second. */ 1076 * second. */
1200 for (i = &dev->vq; *i; i = &(*i)->next); 1077 for (i = &dev->vq; *i; i = &(*i)->next);
1201 *i = vq; 1078 *i = vq;
1202
1203 /* Set the routine to call when the Guest does something to this
1204 * virtqueue. */
1205 vq->handle_output = handle_output;
1206
1207 /* As an optimization, set the advisory "Don't Notify Me" flag if we
1208 * don't have a handler */
1209 if (!handle_output)
1210 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1211} 1079}
1212 1080
1213/* The first half of the feature bitmask is for us to advertise features. The 1081/* The first half of the feature bitmask is for us to advertise features. The
@@ -1219,7 +1087,7 @@ static void add_feature(struct device *dev, unsigned bit)
1219 /* We can't extend the feature bits once we've added config bytes */ 1087 /* We can't extend the feature bits once we've added config bytes */
1220 if (dev->desc->feature_len <= bit / CHAR_BIT) { 1088 if (dev->desc->feature_len <= bit / CHAR_BIT) {
1221 assert(dev->desc->config_len == 0); 1089 assert(dev->desc->config_len == 0);
1222 dev->desc->feature_len = (bit / CHAR_BIT) + 1; 1090 dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
1223 } 1091 }
1224 1092
1225 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); 1093 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
@@ -1243,22 +1111,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
1243 * calling new_dev_desc() to allocate the descriptor and device memory. 1111 * calling new_dev_desc() to allocate the descriptor and device memory.
1244 * 1112 *
1245 * See what I mean about userspace being boring? */ 1113 * See what I mean about userspace being boring? */
1246static struct device *new_device(const char *name, u16 type, int fd, 1114static struct device *new_device(const char *name, u16 type)
1247 bool (*handle_input)(int, struct device *))
1248{ 1115{
1249 struct device *dev = malloc(sizeof(*dev)); 1116 struct device *dev = malloc(sizeof(*dev));
1250 1117
1251 /* Now we populate the fields one at a time. */ 1118 /* Now we populate the fields one at a time. */
1252 dev->fd = fd;
1253 /* If we have an input handler for this file descriptor, then we add it
1254 * to the device_list's fdset and maxfd. */
1255 if (handle_input)
1256 add_device_fd(dev->fd);
1257 dev->desc = new_dev_desc(type); 1119 dev->desc = new_dev_desc(type);
1258 dev->handle_input = handle_input;
1259 dev->name = name; 1120 dev->name = name;
1260 dev->vq = NULL; 1121 dev->vq = NULL;
1261 dev->ready = NULL; 1122 dev->feature_len = 0;
1123 dev->num_vq = 0;
1124 dev->running = false;
1262 1125
1263 /* Append to device list. Prepending to a single-linked list is 1126 /* Append to device list. Prepending to a single-linked list is
1264 * easier, but the user expects the devices to be arranged on the bus 1127 * easier, but the user expects the devices to be arranged on the bus
@@ -1286,13 +1149,10 @@ static void setup_console(void)
1286 * raw input stream to the Guest. */ 1149 * raw input stream to the Guest. */
1287 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1150 term.c_lflag &= ~(ISIG|ICANON|ECHO);
1288 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1151 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1289 /* If we exit gracefully, the original settings will be
1290 * restored so the user can see what they're typing. */
1291 atexit(restore_term);
1292 } 1152 }
1293 1153
1294 dev = new_device("console", VIRTIO_ID_CONSOLE, 1154 dev = new_device("console", VIRTIO_ID_CONSOLE);
1295 STDIN_FILENO, handle_console_input); 1155
1296 /* We store the console state in dev->priv, and initialize it. */ 1156 /* We store the console state in dev->priv, and initialize it. */
1297 dev->priv = malloc(sizeof(struct console_abort)); 1157 dev->priv = malloc(sizeof(struct console_abort));
1298 ((struct console_abort *)dev->priv)->count = 0; 1158 ((struct console_abort *)dev->priv)->count = 0;
@@ -1301,31 +1161,13 @@ static void setup_console(void)
1301 * they put something the input queue, we make sure we're listening to 1161 * they put something the input queue, we make sure we're listening to
1302 * stdin. When they put something in the output queue, we write it to 1162 * stdin. When they put something in the output queue, we write it to
1303 * stdout. */ 1163 * stdout. */
1304 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1164 add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
1305 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); 1165 add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
1306 1166
1307 verbose("device %u: console\n", devices.device_num++); 1167 verbose("device %u: console\n", ++devices.device_num);
1308} 1168}
1309/*:*/ 1169/*:*/
1310 1170
1311static void timeout_alarm(int sig)
1312{
1313 write(timeoutpipe[1], "", 1);
1314}
1315
1316static void setup_timeout(void)
1317{
1318 if (pipe(timeoutpipe) != 0)
1319 err(1, "Creating timeout pipe");
1320
1321 if (fcntl(timeoutpipe[1], F_SETFL,
1322 fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
1323 err(1, "Making timeout pipe nonblocking");
1324
1325 add_device_fd(timeoutpipe[0]);
1326 signal(SIGALRM, timeout_alarm);
1327}
1328
1329/*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1171/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
1330 * --sharenet=<name> option which opens or creates a named pipe. This can be 1172 * --sharenet=<name> option which opens or creates a named pipe. This can be
1331 * used to send packets to another guest in a 1:1 manner. 1173 * used to send packets to another guest in a 1:1 manner.
@@ -1447,21 +1289,23 @@ static int get_tun_device(char tapif[IFNAMSIZ])
1447static void setup_tun_net(char *arg) 1289static void setup_tun_net(char *arg)
1448{ 1290{
1449 struct device *dev; 1291 struct device *dev;
1450 int netfd, ipfd; 1292 struct net_info *net_info = malloc(sizeof(*net_info));
1293 int ipfd;
1451 u32 ip = INADDR_ANY; 1294 u32 ip = INADDR_ANY;
1452 bool bridging = false; 1295 bool bridging = false;
1453 char tapif[IFNAMSIZ], *p; 1296 char tapif[IFNAMSIZ], *p;
1454 struct virtio_net_config conf; 1297 struct virtio_net_config conf;
1455 1298
1456 netfd = get_tun_device(tapif); 1299 net_info->tunfd = get_tun_device(tapif);
1457 1300
1458 /* First we create a new network device. */ 1301 /* First we create a new network device. */
1459 dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); 1302 dev = new_device("net", VIRTIO_ID_NET);
1303 dev->priv = net_info;
1460 1304
1461 /* Network devices need a receive and a send queue, just like 1305 /* Network devices need a receive and a send queue, just like
1462 * console. */ 1306 * console. */
1463 add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); 1307 add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
1464 add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); 1308 add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
1465 1309
1466 /* We need a socket to perform the magic network ioctls to bring up the 1310 /* We need a socket to perform the magic network ioctls to bring up the
1467 * tap interface, connect to the bridge etc. Any socket will do! */ 1311 * tap interface, connect to the bridge etc. Any socket will do! */
@@ -1502,6 +1346,8 @@ static void setup_tun_net(char *arg)
1502 add_feature(dev, VIRTIO_NET_F_HOST_TSO4); 1346 add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
1503 add_feature(dev, VIRTIO_NET_F_HOST_TSO6); 1347 add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
1504 add_feature(dev, VIRTIO_NET_F_HOST_ECN); 1348 add_feature(dev, VIRTIO_NET_F_HOST_ECN);
1349 /* We handle indirect ring entries */
1350 add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
1505 set_config(dev, sizeof(conf), &conf); 1351 set_config(dev, sizeof(conf), &conf);
1506 1352
1507 /* We don't need the socket any more; setup is done. */ 1353 /* We don't need the socket any more; setup is done. */
@@ -1550,20 +1396,18 @@ struct vblk_info
1550 * Remember that the block device is handled by a separate I/O thread. We head 1396 * Remember that the block device is handled by a separate I/O thread. We head
1551 * straight into the core of that thread here: 1397 * straight into the core of that thread here:
1552 */ 1398 */
1553static bool service_io(struct device *dev) 1399static void blk_request(struct virtqueue *vq)
1554{ 1400{
1555 struct vblk_info *vblk = dev->priv; 1401 struct vblk_info *vblk = vq->dev->priv;
1556 unsigned int head, out_num, in_num, wlen; 1402 unsigned int head, out_num, in_num, wlen;
1557 int ret; 1403 int ret;
1558 u8 *in; 1404 u8 *in;
1559 struct virtio_blk_outhdr *out; 1405 struct virtio_blk_outhdr *out;
1560 struct iovec iov[dev->vq->vring.num]; 1406 struct iovec iov[vq->vring.num];
1561 off64_t off; 1407 off64_t off;
1562 1408
1563 /* See if there's a request waiting. If not, nothing to do. */ 1409 /* Get the next request. */
1564 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1410 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1565 if (head == dev->vq->vring.num)
1566 return false;
1567 1411
1568 /* Every block request should contain at least one output buffer 1412 /* Every block request should contain at least one output buffer
1569 * (detailing the location on disk and the type of request) and one 1413 * (detailing the location on disk and the type of request) and one
@@ -1637,83 +1481,21 @@ static bool service_io(struct device *dev)
1637 if (out->type & VIRTIO_BLK_T_BARRIER) 1481 if (out->type & VIRTIO_BLK_T_BARRIER)
1638 fdatasync(vblk->fd); 1482 fdatasync(vblk->fd);
1639 1483
1640 /* We can't trigger an IRQ, because we're not the Launcher. It does 1484 add_used(vq, head, wlen);
1641 * that when we tell it we're done. */
1642 add_used(dev->vq, head, wlen);
1643 return true;
1644}
1645
1646/* This is the thread which actually services the I/O. */
1647static int io_thread(void *_dev)
1648{
1649 struct device *dev = _dev;
1650 struct vblk_info *vblk = dev->priv;
1651 char c;
1652
1653 /* Close other side of workpipe so we get 0 read when main dies. */
1654 close(vblk->workpipe[1]);
1655 /* Close the other side of the done_fd pipe. */
1656 close(dev->fd);
1657
1658 /* When this read fails, it means Launcher died, so we follow. */
1659 while (read(vblk->workpipe[0], &c, 1) == 1) {
1660 /* We acknowledge each request immediately to reduce latency,
1661 * rather than waiting until we've done them all. I haven't
1662 * measured to see if it makes any difference.
1663 *
1664 * That would be an interesting test, wouldn't it? You could
1665 * also try having more than one I/O thread. */
1666 while (service_io(dev))
1667 write(vblk->done_fd, &c, 1);
1668 }
1669 return 0;
1670}
1671
1672/* Now we've seen the I/O thread, we return to the Launcher to see what happens
1673 * when that thread tells us it's completed some I/O. */
1674static bool handle_io_finish(int fd, struct device *dev)
1675{
1676 char c;
1677
1678 /* If the I/O thread died, presumably it printed the error, so we
1679 * simply exit. */
1680 if (read(dev->fd, &c, 1) != 1)
1681 exit(1);
1682
1683 /* It did some work, so trigger the irq. */
1684 trigger_irq(fd, dev->vq);
1685 return true;
1686}
1687
1688/* When the Guest submits some I/O, we just need to wake the I/O thread. */
1689static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout)
1690{
1691 struct vblk_info *vblk = vq->dev->priv;
1692 char c = 0;
1693
1694 /* Wake up I/O thread and tell it to go to work! */
1695 if (write(vblk->workpipe[1], &c, 1) != 1)
1696 /* Presumably it indicated why it died. */
1697 exit(1);
1698} 1485}
1699 1486
1700/*L:198 This actually sets up a virtual block device. */ 1487/*L:198 This actually sets up a virtual block device. */
1701static void setup_block_file(const char *filename) 1488static void setup_block_file(const char *filename)
1702{ 1489{
1703 int p[2];
1704 struct device *dev; 1490 struct device *dev;
1705 struct vblk_info *vblk; 1491 struct vblk_info *vblk;
1706 void *stack;
1707 struct virtio_blk_config conf; 1492 struct virtio_blk_config conf;
1708 1493
1709 /* This is the pipe the I/O thread will use to tell us I/O is done. */
1710 pipe(p);
1711
1712 /* The device responds to return from I/O thread. */ 1494 /* The device responds to return from I/O thread. */
1713 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); 1495 dev = new_device("block", VIRTIO_ID_BLOCK);
1714 1496
1715 /* The device has one virtqueue, where the Guest places requests. */ 1497 /* The device has one virtqueue, where the Guest places requests. */
1716 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); 1498 add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
1717 1499
1718 /* Allocate the room for our own bookkeeping */ 1500 /* Allocate the room for our own bookkeeping */
1719 vblk = dev->priv = malloc(sizeof(*vblk)); 1501 vblk = dev->priv = malloc(sizeof(*vblk));
@@ -1735,49 +1517,29 @@ static void setup_block_file(const char *filename)
1735 1517
1736 set_config(dev, sizeof(conf), &conf); 1518 set_config(dev, sizeof(conf), &conf);
1737 1519
1738 /* The I/O thread writes to this end of the pipe when done. */
1739 vblk->done_fd = p[1];
1740
1741 /* This is the second pipe, which is how we tell the I/O thread about
1742 * more work. */
1743 pipe(vblk->workpipe);
1744
1745 /* Create stack for thread and run it. Since stack grows upwards, we
1746 * point the stack pointer to the end of this region. */
1747 stack = malloc(32768);
1748 /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
1749 * becoming a zombie. */
1750 if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
1751 err(1, "Creating clone");
1752
1753 /* We don't need to keep the I/O thread's end of the pipes open. */
1754 close(vblk->done_fd);
1755 close(vblk->workpipe[0]);
1756
1757 verbose("device %u: virtblock %llu sectors\n", 1520 verbose("device %u: virtblock %llu sectors\n",
1758 devices.device_num, le64_to_cpu(conf.capacity)); 1521 ++devices.device_num, le64_to_cpu(conf.capacity));
1759} 1522}
1760 1523
1524struct rng_info {
1525 int rfd;
1526};
1527
1761/* Our random number generator device reads from /dev/random into the Guest's 1528/* Our random number generator device reads from /dev/random into the Guest's
1762 * input buffers. The usual case is that the Guest doesn't want random numbers 1529 * input buffers. The usual case is that the Guest doesn't want random numbers
1763 * and so has no buffers although /dev/random is still readable, whereas 1530 * and so has no buffers although /dev/random is still readable, whereas
1764 * console is the reverse. 1531 * console is the reverse.
1765 * 1532 *
1766 * The same logic applies, however. */ 1533 * The same logic applies, however. */
1767static bool handle_rng_input(int fd, struct device *dev) 1534static void rng_input(struct virtqueue *vq)
1768{ 1535{
1769 int len; 1536 int len;
1770 unsigned int head, in_num, out_num, totlen = 0; 1537 unsigned int head, in_num, out_num, totlen = 0;
1771 struct iovec iov[dev->vq->vring.num]; 1538 struct rng_info *rng_info = vq->dev->priv;
1539 struct iovec iov[vq->vring.num];
1772 1540
1773 /* First we need a buffer from the Guests's virtqueue. */ 1541 /* First we need a buffer from the Guests's virtqueue. */
1774 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1542 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1775
1776 /* If they're not ready for input, stop listening to this file
1777 * descriptor. We'll start again once they add an input buffer. */
1778 if (head == dev->vq->vring.num)
1779 return false;
1780
1781 if (out_num) 1543 if (out_num)
1782 errx(1, "Output buffers in rng?"); 1544 errx(1, "Output buffers in rng?");
1783 1545
@@ -1785,7 +1547,7 @@ static bool handle_rng_input(int fd, struct device *dev)
1785 * it reads straight into the Guest's buffer. We loop to make sure we 1547 * it reads straight into the Guest's buffer. We loop to make sure we
1786 * fill it. */ 1548 * fill it. */
1787 while (!iov_empty(iov, in_num)) { 1549 while (!iov_empty(iov, in_num)) {
1788 len = readv(dev->fd, iov, in_num); 1550 len = readv(rng_info->rfd, iov, in_num);
1789 if (len <= 0) 1551 if (len <= 0)
1790 err(1, "Read from /dev/random gave %i", len); 1552 err(1, "Read from /dev/random gave %i", len);
1791 iov_consume(iov, in_num, len); 1553 iov_consume(iov, in_num, len);
@@ -1793,25 +1555,23 @@ static bool handle_rng_input(int fd, struct device *dev)
1793 } 1555 }
1794 1556
1795 /* Tell the Guest about the new input. */ 1557 /* Tell the Guest about the new input. */
1796 add_used_and_trigger(fd, dev->vq, head, totlen); 1558 add_used(vq, head, totlen);
1797
1798 /* Everything went OK! */
1799 return true;
1800} 1559}
1801 1560
1802/* And this creates a "hardware" random number device for the Guest. */ 1561/* And this creates a "hardware" random number device for the Guest. */
1803static void setup_rng(void) 1562static void setup_rng(void)
1804{ 1563{
1805 struct device *dev; 1564 struct device *dev;
1806 int fd; 1565 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1807 1566
1808 fd = open_or_die("/dev/random", O_RDONLY); 1567 rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
1809 1568
1810 /* The device responds to return from I/O thread. */ 1569 /* The device responds to return from I/O thread. */
1811 dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); 1570 dev = new_device("rng", VIRTIO_ID_RNG);
1571 dev->priv = rng_info;
1812 1572
1813 /* The device has one virtqueue, where the Guest places inbufs. */ 1573 /* The device has one virtqueue, where the Guest places inbufs. */
1814 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1574 add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
1815 1575
1816 verbose("device %u: rng\n", devices.device_num++); 1576 verbose("device %u: rng\n", devices.device_num++);
1817} 1577}
@@ -1827,17 +1587,18 @@ static void __attribute__((noreturn)) restart_guest(void)
1827 for (i = 3; i < FD_SETSIZE; i++) 1587 for (i = 3; i < FD_SETSIZE; i++)
1828 close(i); 1588 close(i);
1829 1589
1830 /* The exec automatically gets rid of the I/O and Waker threads. */ 1590 /* Reset all the devices (kills all threads). */
1591 cleanup_devices();
1592
1831 execv(main_args[0], main_args); 1593 execv(main_args[0], main_args);
1832 err(1, "Could not exec %s", main_args[0]); 1594 err(1, "Could not exec %s", main_args[0]);
1833} 1595}
1834 1596
1835/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves 1597/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves
1836 * its input and output, and finally, lays it to rest. */ 1598 * its input and output, and finally, lays it to rest. */
1837static void __attribute__((noreturn)) run_guest(int lguest_fd) 1599static void __attribute__((noreturn)) run_guest(void)
1838{ 1600{
1839 for (;;) { 1601 for (;;) {
1840 unsigned long args[] = { LHREQ_BREAK, 0 };
1841 unsigned long notify_addr; 1602 unsigned long notify_addr;
1842 int readval; 1603 int readval;
1843 1604
@@ -1848,8 +1609,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1848 /* One unsigned long means the Guest did HCALL_NOTIFY */ 1609 /* One unsigned long means the Guest did HCALL_NOTIFY */
1849 if (readval == sizeof(notify_addr)) { 1610 if (readval == sizeof(notify_addr)) {
1850 verbose("Notify on address %#lx\n", notify_addr); 1611 verbose("Notify on address %#lx\n", notify_addr);
1851 handle_output(lguest_fd, notify_addr); 1612 handle_output(notify_addr);
1852 continue;
1853 /* ENOENT means the Guest died. Reading tells us why. */ 1613 /* ENOENT means the Guest died. Reading tells us why. */
1854 } else if (errno == ENOENT) { 1614 } else if (errno == ENOENT) {
1855 char reason[1024] = { 0 }; 1615 char reason[1024] = { 0 };
@@ -1858,19 +1618,9 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1858 /* ERESTART means that we need to reboot the guest */ 1618 /* ERESTART means that we need to reboot the guest */
1859 } else if (errno == ERESTART) { 1619 } else if (errno == ERESTART) {
1860 restart_guest(); 1620 restart_guest();
1861 /* EAGAIN means a signal (timeout). 1621 /* Anything else means a bug or incompatible change. */
1862 * Anything else means a bug or incompatible change. */ 1622 } else
1863 } else if (errno != EAGAIN)
1864 err(1, "Running guest failed"); 1623 err(1, "Running guest failed");
1865
1866 /* Only service input on thread for CPU 0. */
1867 if (cpu_id != 0)
1868 continue;
1869
1870 /* Service input, then unset the BREAK to release the Waker. */
1871 handle_input(lguest_fd);
1872 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1873 err(1, "Resetting break");
1874 } 1624 }
1875} 1625}
1876/*L:240 1626/*L:240
@@ -1904,8 +1654,8 @@ int main(int argc, char *argv[])
1904 /* Memory, top-level pagetable, code startpoint and size of the 1654 /* Memory, top-level pagetable, code startpoint and size of the
1905 * (optional) initrd. */ 1655 * (optional) initrd. */
1906 unsigned long mem = 0, start, initrd_size = 0; 1656 unsigned long mem = 0, start, initrd_size = 0;
1907 /* Two temporaries and the /dev/lguest file descriptor. */ 1657 /* Two temporaries. */
1908 int i, c, lguest_fd; 1658 int i, c;
1909 /* The boot information for the Guest. */ 1659 /* The boot information for the Guest. */
1910 struct boot_params *boot; 1660 struct boot_params *boot;
1911 /* If they specify an initrd file to load. */ 1661 /* If they specify an initrd file to load. */
@@ -1913,18 +1663,10 @@ int main(int argc, char *argv[])
1913 1663
1914 /* Save the args: we "reboot" by execing ourselves again. */ 1664 /* Save the args: we "reboot" by execing ourselves again. */
1915 main_args = argv; 1665 main_args = argv;
1916 /* We don't "wait" for the children, so prevent them from becoming
1917 * zombies. */
1918 signal(SIGCHLD, SIG_IGN);
1919 1666
1920 /* First we initialize the device list. Since console and network 1667 /* First we initialize the device list. We keep a pointer to the last
1921 * device receive input from a file descriptor, we keep an fdset 1668 * device, and the next interrupt number to use for devices (1:
1922 * (infds) and the maximum fd number (max_infd) with the head of the 1669 * remember that 0 is used by the timer). */
1923 * list. We also keep a pointer to the last device. Finally, we keep
1924 * the next interrupt number to use for devices (1: remember that 0 is
1925 * used by the timer). */
1926 FD_ZERO(&devices.infds);
1927 devices.max_infd = -1;
1928 devices.lastdev = NULL; 1670 devices.lastdev = NULL;
1929 devices.next_irq = 1; 1671 devices.next_irq = 1;
1930 1672
@@ -1982,9 +1724,6 @@ int main(int argc, char *argv[])
1982 /* We always have a console device */ 1724 /* We always have a console device */
1983 setup_console(); 1725 setup_console();
1984 1726
1985 /* We can timeout waiting for Guest network transmit. */
1986 setup_timeout();
1987
1988 /* Now we load the kernel */ 1727 /* Now we load the kernel */
1989 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 1728 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1990 1729
@@ -2023,15 +1762,16 @@ int main(int argc, char *argv[])
2023 1762
2024 /* We tell the kernel to initialize the Guest: this returns the open 1763 /* We tell the kernel to initialize the Guest: this returns the open
2025 * /dev/lguest file descriptor. */ 1764 * /dev/lguest file descriptor. */
2026 lguest_fd = tell_kernel(start); 1765 tell_kernel(start);
1766
1767 /* Ensure that we terminate if a child dies. */
1768 signal(SIGCHLD, kill_launcher);
2027 1769
2028 /* We clone off a thread, which wakes the Launcher whenever one of the 1770 /* If we exit via err(), this kills all the threads, restores tty. */
2029 * input file descriptors needs attention. We call this the Waker, and 1771 atexit(cleanup_devices);
2030 * we'll cover it in a moment. */
2031 setup_waker(lguest_fd);
2032 1772
2033 /* Finally, run the Guest. This doesn't return. */ 1773 /* Finally, run the Guest. This doesn't return. */
2034 run_guest(lguest_fd); 1774 run_guest();
2035} 1775}
2036/*:*/ 1776/*:*/
2037 1777
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 28c747362f95..efb3a6a045a2 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -37,7 +37,6 @@ Running Lguest:
37 "Paravirtualized guest support" = Y 37 "Paravirtualized guest support" = Y
38 "Lguest guest support" = Y 38 "Lguest guest support" = Y
39 "High Memory Support" = off/4GB 39 "High Memory Support" = off/4GB
40 "PAE (Physical Address Extension) Support" = N
41 "Alignment value to which kernel should be aligned" = 0x100000 40 "Alignment value to which kernel should be aligned" = 0x100000
42 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and 41 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
43 CONFIG_PHYSICAL_ALIGN=0x100000) 42 CONFIG_PHYSICAL_ALIGN=0x100000)
diff --git a/Documentation/local_ops.txt b/Documentation/local_ops.txt
index 23045b8b50f0..300da4bdfdbd 100644
--- a/Documentation/local_ops.txt
+++ b/Documentation/local_ops.txt
@@ -34,7 +34,7 @@ out of order wrt other memory writes by the owner CPU.
34 34
35It can be done by slightly modifying the standard atomic operations : only 35It can be done by slightly modifying the standard atomic operations : only
36their UP variant must be kept. It typically means removing LOCK prefix (on 36their UP variant must be kept. It typically means removing LOCK prefix (on
37i386 and x86_64) and any SMP sychronization barrier. If the architecture does 37i386 and x86_64) and any SMP synchronization barrier. If the architecture does
38not have a different behavior between SMP and UP, including asm-generic/local.h 38not have a different behavior between SMP and UP, including asm-generic/local.h
39in your architecture's local.h is sufficient. 39in your architecture's local.h is sufficient.
40 40
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f5b7127f54ac..7f5809eddee6 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -31,6 +31,7 @@ Contents:
31 31
32 - Locking functions. 32 - Locking functions.
33 - Interrupt disabling functions. 33 - Interrupt disabling functions.
34 - Sleep and wake-up functions.
34 - Miscellaneous functions. 35 - Miscellaneous functions.
35 36
36 (*) Inter-CPU locking barrier effects. 37 (*) Inter-CPU locking barrier effects.
@@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some
1217other means. 1218other means.
1218 1219
1219 1220
1221SLEEP AND WAKE-UP FUNCTIONS
1222---------------------------
1223
1224Sleeping and waking on an event flagged in global data can be viewed as an
1225interaction between two pieces of data: the task state of the task waiting for
1226the event and the global data used to indicate the event. To make sure that
1227these appear to happen in the right order, the primitives to begin the process
1228of going to sleep, and the primitives to initiate a wake up imply certain
1229barriers.
1230
1231Firstly, the sleeper normally follows something like this sequence of events:
1232
1233 for (;;) {
1234 set_current_state(TASK_UNINTERRUPTIBLE);
1235 if (event_indicated)
1236 break;
1237 schedule();
1238 }
1239
1240A general memory barrier is interpolated automatically by set_current_state()
1241after it has altered the task state:
1242
1243 CPU 1
1244 ===============================
1245 set_current_state();
1246 set_mb();
1247 STORE current->state
1248 <general barrier>
1249 LOAD event_indicated
1250
1251set_current_state() may be wrapped by:
1252
1253 prepare_to_wait();
1254 prepare_to_wait_exclusive();
1255
1256which therefore also imply a general memory barrier after setting the state.
1257The whole sequence above is available in various canned forms, all of which
1258interpolate the memory barrier in the right place:
1259
1260 wait_event();
1261 wait_event_interruptible();
1262 wait_event_interruptible_exclusive();
1263 wait_event_interruptible_timeout();
1264 wait_event_killable();
1265 wait_event_timeout();
1266 wait_on_bit();
1267 wait_on_bit_lock();
1268
1269
1270Secondly, code that performs a wake up normally follows something like this:
1271
1272 event_indicated = 1;
1273 wake_up(&event_wait_queue);
1274
1275or:
1276
1277 event_indicated = 1;
1278 wake_up_process(event_daemon);
1279
1280A write memory barrier is implied by wake_up() and co. if and only if they wake
1281something up. The barrier occurs before the task state is cleared, and so sits
1282between the STORE to indicate the event and the STORE to set TASK_RUNNING:
1283
1284 CPU 1 CPU 2
1285 =============================== ===============================
1286 set_current_state(); STORE event_indicated
1287 set_mb(); wake_up();
1288 STORE current->state <write barrier>
1289 <general barrier> STORE current->state
1290 LOAD event_indicated
1291
1292The available waker functions include:
1293
1294 complete();
1295 wake_up();
1296 wake_up_all();
1297 wake_up_bit();
1298 wake_up_interruptible();
1299 wake_up_interruptible_all();
1300 wake_up_interruptible_nr();
1301 wake_up_interruptible_poll();
1302 wake_up_interruptible_sync();
1303 wake_up_interruptible_sync_poll();
1304 wake_up_locked();
1305 wake_up_locked_poll();
1306 wake_up_nr();
1307 wake_up_poll();
1308 wake_up_process();
1309
1310
1311[!] Note that the memory barriers implied by the sleeper and the waker do _not_
1312order multiple stores before the wake-up with respect to loads of those stored
1313values after the sleeper has called set_current_state(). For instance, if the
1314sleeper does:
1315
1316 set_current_state(TASK_INTERRUPTIBLE);
1317 if (event_indicated)
1318 break;
1319 __set_current_state(TASK_RUNNING);
1320 do_something(my_data);
1321
1322and the waker does:
1323
1324 my_data = value;
1325 event_indicated = 1;
1326 wake_up(&event_wait_queue);
1327
1328there's no guarantee that the change to event_indicated will be perceived by
1329the sleeper as coming after the change to my_data. In such a circumstance, the
1330code on both sides must interpolate its own memory barriers between the
1331separate data accesses. Thus the above sleeper ought to do:
1332
1333 set_current_state(TASK_INTERRUPTIBLE);
1334 if (event_indicated) {
1335 smp_rmb();
1336 do_something(my_data);
1337 }
1338
1339and the waker should do:
1340
1341 my_data = value;
1342 smp_wmb();
1343 event_indicated = 1;
1344 wake_up(&event_wait_queue);
1345
1346
1220MISCELLANEOUS FUNCTIONS 1347MISCELLANEOUS FUNCTIONS
1221----------------------- 1348-----------------------
1222 1349
@@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED?
1366 1493
1367Under normal operation, memory operation reordering is generally not going to 1494Under normal operation, memory operation reordering is generally not going to
1368be a problem as a single-threaded linear piece of code will still appear to 1495be a problem as a single-threaded linear piece of code will still appear to
1369work correctly, even if it's in an SMP kernel. There are, however, three 1496work correctly, even if it's in an SMP kernel. There are, however, four
1370circumstances in which reordering definitely _could_ be a problem: 1497circumstances in which reordering definitely _could_ be a problem:
1371 1498
1372 (*) Interprocessor interaction. 1499 (*) Interprocessor interaction.
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 4c2ecf537a4a..bbc8a6a36921 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -73,13 +73,13 @@ this phase is triggered automatically. ACPI can notify this event. If not,
73(see Section 4.). 73(see Section 4.).
74 74
75Logical Memory Hotplug phase is to change memory state into 75Logical Memory Hotplug phase is to change memory state into
76avaiable/unavailable for users. Amount of memory from user's view is 76available/unavailable for users. Amount of memory from user's view is
77changed by this phase. The kernel makes all memory in it as free pages 77changed by this phase. The kernel makes all memory in it as free pages
78when a memory range is available. 78when a memory range is available.
79 79
80In this document, this phase is described as online/offline. 80In this document, this phase is described as online/offline.
81 81
82Logical Memory Hotplug phase is triggred by write of sysfs file by system 82Logical Memory Hotplug phase is triggered by write of sysfs file by system
83administrator. For the hot-add case, it must be executed after Physical Hotplug 83administrator. For the hot-add case, it must be executed after Physical Hotplug
84phase by hand. 84phase by hand.
85(However, if you write udev's hotplug scripts for memory hotplug, these 85(However, if you write udev's hotplug scripts for memory hotplug, these
@@ -334,7 +334,7 @@ MEMORY_CANCEL_ONLINE
334 Generated if MEMORY_GOING_ONLINE fails. 334 Generated if MEMORY_GOING_ONLINE fails.
335 335
336MEMORY_ONLINE 336MEMORY_ONLINE
337 Generated when memory has succesfully brought online. The callback may 337 Generated when memory has successfully brought online. The callback may
338 allocate pages from the new memory. 338 allocate pages from the new memory.
339 339
340MEMORY_GOING_OFFLINE 340MEMORY_GOING_OFFLINE
@@ -359,7 +359,7 @@ The third argument is passed by pointer of struct memory_notify.
359struct memory_notify { 359struct memory_notify {
360 unsigned long start_pfn; 360 unsigned long start_pfn;
361 unsigned long nr_pages; 361 unsigned long nr_pages;
362 int status_cahnge_nid; 362 int status_change_nid;
363} 363}
364 364
365start_pfn is start_pfn of online/offline memory. 365start_pfn is start_pfn of online/offline memory.
diff --git a/Documentation/mn10300/ABI.txt b/Documentation/mn10300/ABI.txt
index 1fef1f06dfd2..d3507bad428d 100644
--- a/Documentation/mn10300/ABI.txt
+++ b/Documentation/mn10300/ABI.txt
@@ -26,7 +26,7 @@ registers and the stack. If the first argument is a 64-bit value, it will be
26passed in D0:D1. If the first argument is not a 64-bit value, but the second 26passed in D0:D1. If the first argument is not a 64-bit value, but the second
27is, the second will be passed entirely on the stack and D1 will be unused. 27is, the second will be passed entirely on the stack and D1 will be unused.
28 28
29Arguments smaller than 32-bits are not coelesced within a register or a stack 29Arguments smaller than 32-bits are not coalesced within a register or a stack
30word. For example, two byte-sized arguments will always be passed in separate 30word. For example, two byte-sized arguments will always be passed in separate
31registers or word-sized stack slots. 31registers or word-sized stack slots.
32 32
diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt
index bdf93b7f0f24..274821b35a7f 100644
--- a/Documentation/mtd/nand_ecc.txt
+++ b/Documentation/mtd/nand_ecc.txt
@@ -50,7 +50,7 @@ byte 255: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp5 ... rp15
50 cp5 cp5 cp5 cp5 cp4 cp4 cp4 cp4 50 cp5 cp5 cp5 cp5 cp4 cp4 cp4 cp4
51 51
52This figure represents a sector of 256 bytes. 52This figure represents a sector of 256 bytes.
53cp is my abbreviaton for column parity, rp for row parity. 53cp is my abbreviation for column parity, rp for row parity.
54 54
55Let's start to explain column parity. 55Let's start to explain column parity.
56cp0 is the parity that belongs to all bit0, bit2, bit4, bit6. 56cp0 is the parity that belongs to all bit0, bit2, bit4, bit6.
@@ -560,7 +560,7 @@ Measuring this code again showed big gain. When executing the original
560linux code 1 million times, this took about 1 second on my system. 560linux code 1 million times, this took about 1 second on my system.
561(using time to measure the performance). After this iteration I was back 561(using time to measure the performance). After this iteration I was back
562to 0.075 sec. Actually I had to decide to start measuring over 10 562to 0.075 sec. Actually I had to decide to start measuring over 10
563million interations in order not to loose too much accuracy. This one 563million iterations in order not to lose too much accuracy. This one
564definitely seemed to be the jackpot! 564definitely seemed to be the jackpot!
565 565
566There is a little bit more room for improvement though. There are three 566There is a little bit more room for improvement though. There are three
@@ -571,8 +571,8 @@ loop; This eliminates 3 statements per loop. Of course after the loop we
571need to correct by adding: 571need to correct by adding:
572 rp4 ^= rp4_6; 572 rp4 ^= rp4_6;
573 rp6 ^= rp4_6 573 rp6 ^= rp4_6
574Furthermore there are 4 sequential assingments to rp8. This can be 574Furthermore there are 4 sequential assignments to rp8. This can be
575encoded slightly more efficient by saving tmppar before those 4 lines 575encoded slightly more efficiently by saving tmppar before those 4 lines
576and later do rp8 = rp8 ^ tmppar ^ notrp8; 576and later do rp8 = rp8 ^ tmppar ^ notrp8;
577(where notrp8 is the value of rp8 before those 4 lines). 577(where notrp8 is the value of rp8 before those 4 lines).
578Again a use of the commutative property of xor. 578Again a use of the commutative property of xor.
@@ -622,7 +622,7 @@ Not a big change, but every penny counts :-)
622Analysis 7 622Analysis 7
623========== 623==========
624 624
625Acutally this made things worse. Not very much, but I don't want to move 625Actually this made things worse. Not very much, but I don't want to move
626into the wrong direction. Maybe something to investigate later. Could 626into the wrong direction. Maybe something to investigate later. Could
627have to do with caching again. 627have to do with caching again.
628 628
@@ -642,7 +642,7 @@ Analysis 8
642This makes things worse. Let's stick with attempt 6 and continue from there. 642This makes things worse. Let's stick with attempt 6 and continue from there.
643Although it seems that the code within the loop cannot be optimised 643Although it seems that the code within the loop cannot be optimised
644further there is still room to optimize the generation of the ecc codes. 644further there is still room to optimize the generation of the ecc codes.
645We can simply calcualate the total parity. If this is 0 then rp4 = rp5 645We can simply calculate the total parity. If this is 0 then rp4 = rp5
646etc. If the parity is 1, then rp4 = !rp5; 646etc. If the parity is 1, then rp4 = !rp5;
647But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits 647But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits
648in the result byte and then do something like 648in the result byte and then do something like
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 08762750f121..d5181ce9ff62 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -221,7 +221,7 @@ ad_select
221 221
222 - Any slave's 802.3ad association state changes 222 - Any slave's 802.3ad association state changes
223 223
224 - The bond's adminstrative state changes to up 224 - The bond's administrative state changes to up
225 225
226 count or 2 226 count or 2
227 227
@@ -369,7 +369,7 @@ fail_over_mac
369 When this policy is used in conjuction with the mii 369 When this policy is used in conjuction with the mii
370 monitor, devices which assert link up prior to being 370 monitor, devices which assert link up prior to being
371 able to actually transmit and receive are particularly 371 able to actually transmit and receive are particularly
372 susecptible to loss of the gratuitous ARP, and an 372 susceptible to loss of the gratuitous ARP, and an
373 appropriate updelay setting may be required. 373 appropriate updelay setting may be required.
374 374
375 follow or 2 375 follow or 2
@@ -1794,7 +1794,7 @@ target to query.
1794generally referred to as "trunk failover." This is a feature of the 1794generally referred to as "trunk failover." This is a feature of the
1795switch that causes the link state of a particular switch port to be set 1795switch that causes the link state of a particular switch port to be set
1796down (or up) when the state of another switch port goes down (or up). 1796down (or up) when the state of another switch port goes down (or up).
1797It's purpose is to propogate link failures from logically "exterior" ports 1797Its purpose is to propagate link failures from logically "exterior" ports
1798to the logically "interior" ports that bonding is able to monitor via 1798to the logically "interior" ports that bonding is able to monitor via
1799miimon. Availability and configuration for trunk failover varies by 1799miimon. Availability and configuration for trunk failover varies by
1800switch, but this can be a viable alternative to the ARP monitor when using 1800switch, but this can be a viable alternative to the ARP monitor when using
diff --git a/Documentation/networking/can.txt b/Documentation/networking/can.txt
index 2035bc4932f2..cd79735013f9 100644
--- a/Documentation/networking/can.txt
+++ b/Documentation/networking/can.txt
@@ -36,10 +36,15 @@ This file contains
36 6.2 local loopback of sent frames 36 6.2 local loopback of sent frames
37 6.3 CAN controller hardware filters 37 6.3 CAN controller hardware filters
38 6.4 The virtual CAN driver (vcan) 38 6.4 The virtual CAN driver (vcan)
39 6.5 currently supported CAN hardware 39 6.5 The CAN network device driver interface
40 6.6 todo 40 6.5.1 Netlink interface to set/get devices properties
41 6.5.2 Setting the CAN bit-timing
42 6.5.3 Starting and stopping the CAN network device
43 6.6 supported CAN hardware
41 44
42 7 Credits 45 7 Socket CAN resources
46
47 8 Credits
43 48
44============================================================================ 49============================================================================
45 50
@@ -234,6 +239,8 @@ solution for a couple of reasons:
234 the user application using the common CAN filter mechanisms. Inside 239 the user application using the common CAN filter mechanisms. Inside
235 this filter definition the (interested) type of errors may be 240 this filter definition the (interested) type of errors may be
236 selected. The reception of error frames is disabled by default. 241 selected. The reception of error frames is disabled by default.
242 The format of the CAN error frame is briefly described in the Linux
243 header file "include/linux/can/error.h".
237 244
2384. How to use Socket CAN 2454. How to use Socket CAN
239------------------------ 246------------------------
@@ -327,7 +334,7 @@ solution for a couple of reasons:
327 return 1; 334 return 1;
328 } 335 }
329 336
330 /* paraniod check ... */ 337 /* paranoid check ... */
331 if (nbytes < sizeof(struct can_frame)) { 338 if (nbytes < sizeof(struct can_frame)) {
332 fprintf(stderr, "read: incomplete CAN frame\n"); 339 fprintf(stderr, "read: incomplete CAN frame\n");
333 return 1; 340 return 1;
@@ -605,61 +612,213 @@ solution for a couple of reasons:
605 removal of vcan network devices can be managed with the ip(8) tool: 612 removal of vcan network devices can be managed with the ip(8) tool:
606 613
607 - Create a virtual CAN network interface: 614 - Create a virtual CAN network interface:
608 ip link add type vcan 615 $ ip link add type vcan
609 616
610 - Create a virtual CAN network interface with a specific name 'vcan42': 617 - Create a virtual CAN network interface with a specific name 'vcan42':
611 ip link add dev vcan42 type vcan 618 $ ip link add dev vcan42 type vcan
612 619
613 - Remove a (virtual CAN) network interface 'vcan42': 620 - Remove a (virtual CAN) network interface 'vcan42':
614 ip link del vcan42 621 $ ip link del vcan42
615 622
616 The tool 'vcan' from the SocketCAN SVN repository on BerliOS is obsolete. 623 6.5 The CAN network device driver interface
617 624
618 Virtual CAN network device creation in older Kernels: 625 The CAN network device driver interface provides a generic interface
619 In Linux Kernel versions < 2.6.24 the vcan driver creates 4 vcan 626 to setup, configure and monitor CAN network devices. The user can then
620 netdevices at module load time by default. This value can be changed 627 configure the CAN device, like setting the bit-timing parameters, via
621 with the module parameter 'numdev'. E.g. 'modprobe vcan numdev=8' 628 the netlink interface using the program "ip" from the "IPROUTE2"
622 629 utility suite. The following chapter describes briefly how to use it.
623 6.5 currently supported CAN hardware 630 Furthermore, the interface uses a common data structure and exports a
631 set of common functions, which all real CAN network device drivers
632 should use. Please have a look at the SJA1000 or MSCAN driver to
633 understand how to use them. The name of the module is can-dev.ko.
634
635 6.5.1 Netlink interface to set/get devices properties
636
637 The CAN device must be configured via netlink interface. The supported
638 netlink message types are defined and briefly described in
639 "include/linux/can/netlink.h". CAN link support for the program "ip"
640 of the IPROUTE2 utility suite is available and it can be used as shown
641 below:
642
643 - Setting CAN device properties:
644
645 $ ip link set can0 type can help
646 Usage: ip link set DEVICE type can
647 [ bitrate BITRATE [ sample-point SAMPLE-POINT] ] |
648 [ tq TQ prop-seg PROP_SEG phase-seg1 PHASE-SEG1
649 phase-seg2 PHASE-SEG2 [ sjw SJW ] ]
650
651 [ loopback { on | off } ]
652 [ listen-only { on | off } ]
653 [ triple-sampling { on | off } ]
654
655 [ restart-ms TIME-MS ]
656 [ restart ]
657
658 Where: BITRATE := { 1..1000000 }
659 SAMPLE-POINT := { 0.000..0.999 }
660 TQ := { NUMBER }
661 PROP-SEG := { 1..8 }
662 PHASE-SEG1 := { 1..8 }
663 PHASE-SEG2 := { 1..8 }
664 SJW := { 1..4 }
665 RESTART-MS := { 0 | NUMBER }
666
667 - Display CAN device details and statistics:
668
669 $ ip -details -statistics link show can0
670 2: can0: <NOARP,UP,LOWER_UP,ECHO> mtu 16 qdisc pfifo_fast state UP qlen 10
671 link/can
672 can <TRIPLE-SAMPLING> state ERROR-ACTIVE restart-ms 100
673 bitrate 125000 sample_point 0.875
674 tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1
675 sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
676 clock 8000000
677 re-started bus-errors arbit-lost error-warn error-pass bus-off
678 41 17457 0 41 42 41
679 RX: bytes packets errors dropped overrun mcast
680 140859 17608 17457 0 0 0
681 TX: bytes packets errors dropped carrier collsns
682 861 112 0 41 0 0
683
684 More info to the above output:
685
686 "<TRIPLE-SAMPLING>"
687 Shows the list of selected CAN controller modes: LOOPBACK,
688 LISTEN-ONLY, or TRIPLE-SAMPLING.
689
690 "state ERROR-ACTIVE"
691 The current state of the CAN controller: "ERROR-ACTIVE",
692 "ERROR-WARNING", "ERROR-PASSIVE", "BUS-OFF" or "STOPPED"
693
694 "restart-ms 100"
695 Automatic restart delay time. If set to a non-zero value, a
696 restart of the CAN controller will be triggered automatically
697 in case of a bus-off condition after the specified delay time
698 in milliseconds. By default it's off.
699
700 "bitrate 125000 sample_point 0.875"
701 Shows the real bit-rate in bits/sec and the sample-point in the
702 range 0.000..0.999. If the calculation of bit-timing parameters
703 is enabled in the kernel (CONFIG_CAN_CALC_BITTIMING=y), the
704 bit-timing can be defined by setting the "bitrate" argument.
705 Optionally the "sample-point" can be specified. By default it's
706 0.000 assuming CIA-recommended sample-points.
707
708 "tq 125 prop-seg 6 phase-seg1 7 phase-seg2 2 sjw 1"
709 Shows the time quanta in ns, propagation segment, phase buffer
710 segment 1 and 2 and the synchronisation jump width in units of
711 tq. They allow to define the CAN bit-timing in a hardware
712 independent format as proposed by the Bosch CAN 2.0 spec (see
713 chapter 8 of http://www.semiconductors.bosch.de/pdf/can2spec.pdf).
714
715 "sja1000: tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
716 clock 8000000"
717 Shows the bit-timing constants of the CAN controller, here the
718 "sja1000". The minimum and maximum values of the time segment 1
719 and 2, the synchronisation jump width in units of tq, the
720 bitrate pre-scaler and the CAN system clock frequency in Hz.
721 These constants could be used for user-defined (non-standard)
722 bit-timing calculation algorithms in user-space.
723
724 "re-started bus-errors arbit-lost error-warn error-pass bus-off"
725 Shows the number of restarts, bus and arbitration lost errors,
726 and the state changes to the error-warning, error-passive and
727 bus-off state. RX overrun errors are listed in the "overrun"
728 field of the standard network statistics.
729
730 6.5.2 Setting the CAN bit-timing
731
732 The CAN bit-timing parameters can always be defined in a hardware
733 independent format as proposed in the Bosch CAN 2.0 specification
734 specifying the arguments "tq", "prop_seg", "phase_seg1", "phase_seg2"
735 and "sjw":
736
737 $ ip link set canX type can tq 125 prop-seg 6 \
738 phase-seg1 7 phase-seg2 2 sjw 1
739
740 If the kernel option CONFIG_CAN_CALC_BITTIMING is enabled, CIA
741 recommended CAN bit-timing parameters will be calculated if the bit-
742 rate is specified with the argument "bitrate":
743
744 $ ip link set canX type can bitrate 125000
745
746 Note that this works fine for the most common CAN controllers with
747 standard bit-rates but may *fail* for exotic bit-rates or CAN system
748 clock frequencies. Disabling CONFIG_CAN_CALC_BITTIMING saves some
749 space and allows user-space tools to solely determine and set the
750 bit-timing parameters. The CAN controller specific bit-timing
751 constants can be used for that purpose. They are listed by the
752 following command:
753
754 $ ip -details link show can0
755 ...
756 sja1000: clock 8000000 tseg1 1..16 tseg2 1..8 sjw 1..4 brp 1..64 brp-inc 1
757
758 6.5.3 Starting and stopping the CAN network device
759
760 A CAN network device is started or stopped as usual with the command
761 "ifconfig canX up/down" or "ip link set canX up/down". Be aware that
762 you *must* define proper bit-timing parameters for real CAN devices
763 before you can start it to avoid error-prone default settings:
764
765 $ ip link set canX up type can bitrate 125000
766
767 A device may enter the "bus-off" state if too many errors occurred on
768 the CAN bus. Then no more messages are received or sent. An automatic
769 bus-off recovery can be enabled by setting the "restart-ms" to a
770 non-zero value, e.g.:
771
772 $ ip link set canX type can restart-ms 100
773
774 Alternatively, the application may realize the "bus-off" condition
775 by monitoring CAN error frames and do a restart when appropriate with
776 the command:
777
778 $ ip link set canX type can restart
779
780 Note that a restart will also create a CAN error frame (see also
781 chapter 3.4).
624 782
625 On the project website http://developer.berlios.de/projects/socketcan 783 6.6 Supported CAN hardware
626 there are different drivers available:
627 784
628 vcan: Virtual CAN interface driver (if no real hardware is available) 785 Please check the "Kconfig" file in "drivers/net/can" to get an actual
629 sja1000: Philips SJA1000 CAN controller (recommended) 786 list of the supported CAN hardware. On the Socket CAN project website
630 i82527: Intel i82527 CAN controller 787 (see chapter 7) there might be further drivers available, also for
631 mscan: Motorola/Freescale CAN controller (e.g. inside SOC MPC5200) 788 older kernel versions.
632 ccan: CCAN controller core (e.g. inside SOC h7202)
633 slcan: For a bunch of CAN adaptors that are attached via a
634 serial line ASCII protocol (for serial / USB adaptors)
635 789
636 Additionally the different CAN adaptors (ISA/PCI/PCMCIA/USB/Parport) 7907. Socket CAN resources
637 from PEAK Systemtechnik support the CAN netdevice driver model 791-----------------------
638 since Linux driver v6.0: http://www.peak-system.com/linux/index.htm
639 792
640 Please check the Mailing Lists on the berlios OSS project website. 793 You can find further resources for Socket CAN like user space tools,
794 support for old kernel versions, more drivers, mailing lists, etc.
795 at the BerliOS OSS project website for Socket CAN:
641 796
642 6.6 todo 797 http://developer.berlios.de/projects/socketcan
643 798
644 The configuration interface for CAN network drivers is still an open 799 If you have questions, bug fixes, etc., don't hesitate to post them to
645 issue that has not been finalized in the socketcan project. Also the 800 the Socketcan-Users mailing list. But please search the archives first.
646 idea of having a library module (candev.ko) that holds functions
647 that are needed by all CAN netdevices is not ready to ship.
648 Your contribution is welcome.
649 801
6507. Credits 8028. Credits
651---------- 803----------
652 804
653 Oliver Hartkopp (PF_CAN core, filters, drivers, bcm) 805 Oliver Hartkopp (PF_CAN core, filters, drivers, bcm, SJA1000 driver)
654 Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan) 806 Urs Thuermann (PF_CAN core, kernel integration, socket interfaces, raw, vcan)
655 Jan Kizka (RT-SocketCAN core, Socket-API reconciliation) 807 Jan Kizka (RT-SocketCAN core, Socket-API reconciliation)
656 Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews) 808 Wolfgang Grandegger (RT-SocketCAN core & drivers, Raw Socket-API reviews,
809 CAN device driver interface, MSCAN driver)
657 Robert Schwebel (design reviews, PTXdist integration) 810 Robert Schwebel (design reviews, PTXdist integration)
658 Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers) 811 Marc Kleine-Budde (design reviews, Kernel 2.6 cleanups, drivers)
659 Benedikt Spranger (reviews) 812 Benedikt Spranger (reviews)
660 Thomas Gleixner (LKML reviews, coding style, posting hints) 813 Thomas Gleixner (LKML reviews, coding style, posting hints)
661 Andrey Volkov (kernel subtree structure, ioctls, mscan driver) 814 Andrey Volkov (kernel subtree structure, ioctls, MSCAN driver)
662 Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003) 815 Matthias Brukner (first SJA1000 CAN netdevice implementation Q2/2003)
663 Klaus Hitschler (PEAK driver integration) 816 Klaus Hitschler (PEAK driver integration)
664 Uwe Koppe (CAN netdevices with PF_PACKET approach) 817 Uwe Koppe (CAN netdevices with PF_PACKET approach)
665 Michael Schulze (driver layer loopback requirement, RT CAN drivers review) 818 Michael Schulze (driver layer loopback requirement, RT CAN drivers review)
819 Pavel Pisa (Bit-timing calculation)
820 Sascha Hauer (SJA1000 platform driver)
821 Sebastian Haas (SJA1000 EMS PCI driver)
822 Markus Plessing (SJA1000 EMS PCI driver)
823 Per Dalen (SJA1000 Kvaser PCI driver)
824 Sam Ravnborg (reviews, coding style, kbuild help)
diff --git a/Documentation/networking/dm9000.txt b/Documentation/networking/dm9000.txt
index 65df3dea5561..5552e2e575c5 100644
--- a/Documentation/networking/dm9000.txt
+++ b/Documentation/networking/dm9000.txt
@@ -129,7 +129,7 @@ PHY Link state polling
129---------------------- 129----------------------
130 130
131The driver keeps track of the link state and informs the network core 131The driver keeps track of the link state and informs the network core
132about link (carrier) availablilty. This is managed by several methods 132about link (carrier) availability. This is managed by several methods
133depending on the version of the chip and on which PHY is being used. 133depending on the version of the chip and on which PHY is being used.
134 134
135For the internal PHY, the original (and currently default) method is 135For the internal PHY, the original (and currently default) method is
diff --git a/Documentation/networking/ieee802154.txt b/Documentation/networking/ieee802154.txt
new file mode 100644
index 000000000000..a0280ad2edc9
--- /dev/null
+++ b/Documentation/networking/ieee802154.txt
@@ -0,0 +1,76 @@
1
2 Linux IEEE 802.15.4 implementation
3
4
5Introduction
6============
7
8The Linux-ZigBee project goal is to provide complete implementation
9of IEEE 802.15.4 / ZigBee / 6LoWPAN protocols. IEEE 802.15.4 is a stack
10of protocols for organizing Low-Rate Wireless Personal Area Networks.
11
12Currently only IEEE 802.15.4 layer is implemented. We have chosen
13to use plain Berkeley socket API, the generic Linux networking stack
14to transfer IEEE 802.15.4 messages and a special protocol over genetlink
15for configuration/management.
16
17
18Socket API
19==========
20
21int sd = socket(PF_IEEE802154, SOCK_DGRAM, 0);
22.....
23
24The address family, socket addresses etc. are defined in the
25include/net/ieee802154/af_ieee802154.h header or in the special header
26in our userspace package (see either linux-zigbee sourceforge download page
27or git tree at git://linux-zigbee.git.sourceforge.net/gitroot/linux-zigbee).
28
29One can use SOCK_RAW for passing raw data towards device xmit function. YMMV.
30
31
32MLME - MAC Level Management
33============================
34
35Most of IEEE 802.15.4 MLME interfaces are directly mapped on netlink commands.
36See the include/net/ieee802154/nl802154.h header. Our userspace tools package
37(see above) provides CLI configuration utility for radio interfaces and simple
38coordinator for IEEE 802.15.4 networks as example users of the MLME protocol.
39
40
41Kernel side
42=============
43
44Like with WiFi, there are several types of devices implementing IEEE 802.15.4.
451) 'HardMAC'. The MAC layer is implemented in the device itself, the device
46 exports MLME and data API.
472) 'SoftMAC' or just radio. These types of devices are just radio transceivers
48 possibly with some kinds of acceleration like automatic CRC computation and
49 comparison, automagic ACK handling, address matching, etc.
50
51Those types of devices require different approaches to be hooked into the Linux kernel.
52
53
54HardMAC
55=======
56
57See the header include/net/ieee802154/netdevice.h. You have to implement Linux
58net_device, with .type = ARPHRD_IEEE802154. Data is exchanged with socket family
59code via plain sk_buffs. The control block of sk_buffs will contain additional
60info as described in the struct ieee802154_mac_cb.
61
62To hook the MLME interface you have to populate the ml_priv field of your
63net_device with a pointer to struct ieee802154_mlme_ops instance. All fields are
64required.
65
66We provide an example of simple HardMAC driver at drivers/ieee802154/fakehard.c
67
68
69SoftMAC
70=======
71
72We are going to provide intermediate layer implementing IEEE 802.15.4 MAC
73in software. This is currently WIP.
74
75See header include/net/ieee802154/mac802154.h and several drivers in
76drivers/ieee802154/
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index b121c5db707f..8be76235fe67 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -168,7 +168,16 @@ tcp_dsack - BOOLEAN
168 Allows TCP to send "duplicate" SACKs. 168 Allows TCP to send "duplicate" SACKs.
169 169
170tcp_ecn - BOOLEAN 170tcp_ecn - BOOLEAN
171 Enable Explicit Congestion Notification in TCP. 171 Enable Explicit Congestion Notification (ECN) in TCP. ECN is only
172 used when both ends of the TCP flow support it. It is useful to
173 avoid losses due to congestion (when the bottleneck router supports
174 ECN).
175 Possible values are:
176 0 disable ECN
177 1 ECN enabled
178 2 Only server-side ECN enabled. If the other end does
179 not support ECN, behavior is like with ECN disabled.
180 Default: 2
172 181
173tcp_fack - BOOLEAN 182tcp_fack - BOOLEAN
174 Enable FACK congestion avoidance and fast retransmission. 183 Enable FACK congestion avoidance and fast retransmission.
@@ -1048,6 +1057,13 @@ disable_ipv6 - BOOLEAN
1048 address. 1057 address.
1049 Default: FALSE (enable IPv6 operation) 1058 Default: FALSE (enable IPv6 operation)
1050 1059
1060 When this value is changed from 1 to 0 (IPv6 is being enabled),
1061 it will dynamically create a link-local address on the given
1062 interface and start Duplicate Address Detection, if necessary.
1063
1064 When this value is changed from 0 to 1 (IPv6 is being disabled),
1065 it will dynamically delete all addresses on the given interface.
1066
1051accept_dad - INTEGER 1067accept_dad - INTEGER
1052 Whether to accept DAD (Duplicate Address Detection). 1068 Whether to accept DAD (Duplicate Address Detection).
1053 0: Disable DAD 1069 0: Disable DAD
diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.txt
index 268e5c103dd8..9fd7e21296c8 100644
--- a/Documentation/networking/ipv6.txt
+++ b/Documentation/networking/ipv6.txt
@@ -33,3 +33,40 @@ disable
33 33
34 A reboot is required to enable IPv6. 34 A reboot is required to enable IPv6.
35 35
36autoconf
37
38 Specifies whether to enable IPv6 address autoconfiguration
39 on all interfaces. This might be used when one does not wish
40 for addresses to be automatically generated from prefixes
41 received in Router Advertisements.
42
43 The possible values and their effects are:
44
45 0
46 IPv6 address autoconfiguration is disabled on all interfaces.
47
48 Only the IPv6 loopback address (::1) and link-local addresses
49 will be added to interfaces.
50
51 1
52 IPv6 address autoconfiguration is enabled on all interfaces.
53
54 This is the default value.
55
56disable_ipv6
57
58 Specifies whether to disable IPv6 on all interfaces.
59 This might be used when no IPv6 addresses are desired.
60
61 The possible values and their effects are:
62
63 0
64 IPv6 is enabled on all interfaces.
65
66 This is the default value.
67
68 1
69 IPv6 is disabled on all interfaces.
70
71 No IPv6 addresses will be added to interfaces.
72
diff --git a/Documentation/networking/l2tp.txt b/Documentation/networking/l2tp.txt
index 2451f551c505..63214b280e00 100644
--- a/Documentation/networking/l2tp.txt
+++ b/Documentation/networking/l2tp.txt
@@ -158,7 +158,7 @@ Sample Userspace Code
158 } 158 }
159 return 0; 159 return 0;
160 160
161Miscellanous 161Miscellaneous
162============ 162============
163 163
164The PPPoL2TP driver was developed as part of the OpenL2TP project by 164The PPPoL2TP driver was developed as part of the OpenL2TP project by
diff --git a/Documentation/networking/mac80211-injection.txt b/Documentation/networking/mac80211-injection.txt
index 84906ef3ed6e..b30e81ad5307 100644
--- a/Documentation/networking/mac80211-injection.txt
+++ b/Documentation/networking/mac80211-injection.txt
@@ -12,38 +12,22 @@ following format:
12The radiotap format is discussed in 12The radiotap format is discussed in
13./Documentation/networking/radiotap-headers.txt. 13./Documentation/networking/radiotap-headers.txt.
14 14
15Despite 13 radiotap argument types are currently defined, most only make sense 15Despite many radiotap parameters being currently defined, most only make sense
16to appear on received packets. The following information is parsed from the 16to appear on received packets. The following information is parsed from the
17radiotap headers and used to control injection: 17radiotap headers and used to control injection:
18 18
19 * IEEE80211_RADIOTAP_RATE
20
21 rate in 500kbps units, automatic if invalid or not present
22
23
24 * IEEE80211_RADIOTAP_ANTENNA
25
26 antenna to use, automatic if not present
27
28
29 * IEEE80211_RADIOTAP_DBM_TX_POWER
30
31 transmit power in dBm, automatic if not present
32
33
34 * IEEE80211_RADIOTAP_FLAGS 19 * IEEE80211_RADIOTAP_FLAGS
35 20
36 IEEE80211_RADIOTAP_F_FCS: FCS will be removed and recalculated 21 IEEE80211_RADIOTAP_F_FCS: FCS will be removed and recalculated
37 IEEE80211_RADIOTAP_F_WEP: frame will be encrypted if key available 22 IEEE80211_RADIOTAP_F_WEP: frame will be encrypted if key available
38 IEEE80211_RADIOTAP_F_FRAG: frame will be fragmented if longer than the 23 IEEE80211_RADIOTAP_F_FRAG: frame will be fragmented if longer than the
39 current fragmentation threshold. Note that 24 current fragmentation threshold.
40 this flag is only reliable when software 25
41 fragmentation is enabled)
42 26
43The injection code can also skip all other currently defined radiotap fields 27The injection code can also skip all other currently defined radiotap fields
44facilitating replay of captured radiotap headers directly. 28facilitating replay of captured radiotap headers directly.
45 29
46Here is an example valid radiotap header defining these three parameters 30Here is an example valid radiotap header defining some parameters
47 31
48 0x00, 0x00, // <-- radiotap version 32 0x00, 0x00, // <-- radiotap version
49 0x0b, 0x00, // <- radiotap header length 33 0x0b, 0x00, // <- radiotap header length
@@ -72,8 +56,8 @@ interface), along the following lines:
72... 56...
73 r = pcap_inject(ppcap, u8aSendBuffer, nLength); 57 r = pcap_inject(ppcap, u8aSendBuffer, nLength);
74 58
75You can also find sources for a complete inject test applet here: 59You can also find a link to a complete inject application here:
76 60
77http://penumbra.warmcat.com/_twk/tiki-index.php?page=packetspammer 61http://wireless.kernel.org/en/users/Documentation/packetspammer
78 62
79Andy Green <andy@warmcat.com> 63Andy Green <andy@warmcat.com>
diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt
index a2ab6a0b116d..87b3d15f523a 100644
--- a/Documentation/networking/netdevices.txt
+++ b/Documentation/networking/netdevices.txt
@@ -74,7 +74,7 @@ dev->hard_start_xmit:
74 for this and return NETDEV_TX_LOCKED when the spin lock fails. 74 for this and return NETDEV_TX_LOCKED when the spin lock fails.
75 The locking there should also properly protect against 75 The locking there should also properly protect against
76 set_multicast_list. Note that the use of NETIF_F_LLTX is deprecated. 76 set_multicast_list. Note that the use of NETIF_F_LLTX is deprecated.
77 Dont use it for new drivers. 77 Don't use it for new drivers.
78 78
79 Context: Process with BHs disabled or BH (timer), 79 Context: Process with BHs disabled or BH (timer),
80 will be called with interrupts disabled by netconsole. 80 will be called with interrupts disabled by netconsole.
diff --git a/Documentation/networking/operstates.txt b/Documentation/networking/operstates.txt
index c9074f9b78bb..1a77a3cfae54 100644
--- a/Documentation/networking/operstates.txt
+++ b/Documentation/networking/operstates.txt
@@ -38,9 +38,6 @@ ifinfomsg::if_flags & IFF_LOWER_UP:
38ifinfomsg::if_flags & IFF_DORMANT: 38ifinfomsg::if_flags & IFF_DORMANT:
39 Driver has signaled netif_dormant_on() 39 Driver has signaled netif_dormant_on()
40 40
41These interface flags can also be queried without netlink using the
42SIOCGIFFLAGS ioctl.
43
44TLV IFLA_OPERSTATE 41TLV IFLA_OPERSTATE
45 42
46contains RFC2863 state of the interface in numeric representation: 43contains RFC2863 state of the interface in numeric representation:
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 07c53d596035..a22fd85e3796 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -4,16 +4,18 @@
4 4
5This file documents the CONFIG_PACKET_MMAP option available with the PACKET 5This file documents the CONFIG_PACKET_MMAP option available with the PACKET
6socket interface on 2.4 and 2.6 kernels. This type of sockets is used for 6socket interface on 2.4 and 2.6 kernels. This type of sockets is used for
7capture network traffic with utilities like tcpdump or any other that uses 7capture network traffic with utilities like tcpdump or any other that needs
8the libpcap library. 8raw access to network interface.
9
10You can find the latest version of this document at
11 9
10You can find the latest version of this document at:
12 http://pusa.uv.es/~ulisses/packet_mmap/ 11 http://pusa.uv.es/~ulisses/packet_mmap/
13 12
14Please send me your comments to 13Howto can be found at:
14 http://wiki.gnu-log.net (packet_mmap)
15 15
16Please send your comments to
16 Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es> 17 Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es>
18 Johann Baudy <johann.baudy@gnu-log.net>
17 19
18------------------------------------------------------------------------------- 20-------------------------------------------------------------------------------
19+ Why use PACKET_MMAP 21+ Why use PACKET_MMAP
@@ -25,19 +27,24 @@ to capture each packet, it requires two if you want to get packet's
25timestamp (like libpcap always does). 27timestamp (like libpcap always does).
26 28
27In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size 29In the other hand PACKET_MMAP is very efficient. PACKET_MMAP provides a size
28configurable circular buffer mapped in user space. This way reading packets just 30configurable circular buffer mapped in user space that can be used to either
29needs to wait for them, most of the time there is no need to issue a single 31send or receive packets. This way reading packets just needs to wait for them,
30system call. By using a shared buffer between the kernel and the user 32most of the time there is no need to issue a single system call. Concerning
31also has the benefit of minimizing packet copies. 33transmission, multiple packets can be sent through one system call to get the
32 34highest bandwidth.
33It's fine to use PACKET_MMAP to improve the performance of the capture process, 35By using a shared buffer between the kernel and the user also has the benefit
34but it isn't everything. At least, if you are capturing at high speeds (this 36of minimizing packet copies.
35is relative to the cpu speed), you should check if the device driver of your 37
36network interface card supports some sort of interrupt load mitigation or 38It's fine to use PACKET_MMAP to improve the performance of the capture and
37(even better) if it supports NAPI, also make sure it is enabled. 39transmission process, but it isn't everything. At least, if you are capturing
40at high speeds (this is relative to the cpu speed), you should check if the
41device driver of your network interface card supports some sort of interrupt
42load mitigation or (even better) if it supports NAPI, also make sure it is
43enabled. For transmission, check the MTU (Maximum Transmission Unit) used and
44supported by devices of your network.
38 45
39-------------------------------------------------------------------------------- 46--------------------------------------------------------------------------------
40+ How to use CONFIG_PACKET_MMAP 47+ How to use CONFIG_PACKET_MMAP to improve capture process
41-------------------------------------------------------------------------------- 48--------------------------------------------------------------------------------
42 49
43From the user standpoint, you should use the higher level libpcap library, which 50From the user standpoint, you should use the higher level libpcap library, which
@@ -57,7 +64,7 @@ the low level details or want to improve libpcap by including PACKET_MMAP
57support. 64support.
58 65
59-------------------------------------------------------------------------------- 66--------------------------------------------------------------------------------
60+ How to use CONFIG_PACKET_MMAP directly 67+ How to use CONFIG_PACKET_MMAP directly to improve capture process
61-------------------------------------------------------------------------------- 68--------------------------------------------------------------------------------
62 69
63From the system calls stand point, the use of PACKET_MMAP involves 70From the system calls stand point, the use of PACKET_MMAP involves
@@ -66,6 +73,7 @@ the following process:
66 73
67[setup] socket() -------> creation of the capture socket 74[setup] socket() -------> creation of the capture socket
68 setsockopt() ---> allocation of the circular buffer (ring) 75 setsockopt() ---> allocation of the circular buffer (ring)
76 option: PACKET_RX_RING
69 mmap() ---------> mapping of the allocated buffer to the 77 mmap() ---------> mapping of the allocated buffer to the
70 user process 78 user process
71 79
@@ -97,13 +105,75 @@ also the mapping of the circular buffer in the user process and
97the use of this buffer. 105the use of this buffer.
98 106
99-------------------------------------------------------------------------------- 107--------------------------------------------------------------------------------
108+ How to use CONFIG_PACKET_MMAP directly to improve transmission process
109--------------------------------------------------------------------------------
110Transmission process is similar to capture as shown below.
111
112[setup] socket() -------> creation of the transmission socket
113 setsockopt() ---> allocation of the circular buffer (ring)
114 option: PACKET_TX_RING
115 bind() ---------> bind transmission socket with a network interface
116 mmap() ---------> mapping of the allocated buffer to the
117 user process
118
119[transmission] poll() ---------> wait for free packets (optional)
120 send() ---------> send all packets that are set as ready in
121 the ring
122 The flag MSG_DONTWAIT can be used to return
123 before end of transfer.
124
125[shutdown] close() --------> destruction of the transmission socket and
126 deallocation of all associated resources.
127
128Binding the socket to your network interface is mandatory (with zero copy) to
129know the header size of frames used in the circular buffer.
130
131As in capture, each frame contains two parts:
132
133 --------------------
134| struct tpacket_hdr | Header. It contains the status of
135| | this frame
136|--------------------|
137| data buffer |
138. . Data that will be sent over the network interface.
139. .
140 --------------------
141
142 bind() associates the socket to your network interface thanks to
143 sll_ifindex parameter of struct sockaddr_ll.
144
145 Initialization example:
146
147 struct sockaddr_ll my_addr;
148 struct ifreq s_ifr;
149 ...
150
151 strncpy (s_ifr.ifr_name, "eth0", sizeof(s_ifr.ifr_name));
152
153 /* get interface index of eth0 */
154 ioctl(this->socket, SIOCGIFINDEX, &s_ifr);
155
156 /* fill sockaddr_ll struct to prepare binding */
157 my_addr.sll_family = AF_PACKET;
158 my_addr.sll_protocol = ETH_P_ALL;
159 my_addr.sll_ifindex = s_ifr.ifr_ifindex;
160
161 /* bind socket to eth0 */
162 bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll));
163
164 A complete tutorial is available at: http://wiki.gnu-log.net/
165
166--------------------------------------------------------------------------------
100+ PACKET_MMAP settings 167+ PACKET_MMAP settings
101-------------------------------------------------------------------------------- 168--------------------------------------------------------------------------------
102 169
103 170
104To setup PACKET_MMAP from user level code is done with a call like 171To setup PACKET_MMAP from user level code is done with a call like
105 172
173 - Capture process
106 setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req)) 174 setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req))
175 - Transmission process
176 setsockopt(fd, SOL_PACKET, PACKET_TX_RING, (void *) &req, sizeof(req))
107 177
108The most significant argument in the previous call is the req parameter, 178The most significant argument in the previous call is the req parameter,
109this parameter must to have the following structure: 179this parameter must to have the following structure:
@@ -117,11 +187,11 @@ this parameter must to have the following structure:
117 }; 187 };
118 188
119This structure is defined in /usr/include/linux/if_packet.h and establishes a 189This structure is defined in /usr/include/linux/if_packet.h and establishes a
120circular buffer (ring) of unswappable memory mapped in the capture process. 190circular buffer (ring) of unswappable memory.
121Being mapped in the capture process allows reading the captured frames and 191Being mapped in the capture process allows reading the captured frames and
122related meta-information like timestamps without requiring a system call. 192related meta-information like timestamps without requiring a system call.
123 193
124Captured frames are grouped in blocks. Each block is a physically contiguous 194Frames are grouped in blocks. Each block is a physically contiguous
125region of memory and holds tp_block_size/tp_frame_size frames. The total number 195region of memory and holds tp_block_size/tp_frame_size frames. The total number
126of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because 196of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because
127 197
@@ -336,6 +406,7 @@ struct tpacket_hdr). If this field is 0 means that the frame is ready
336to be used for the kernel, If not, there is a frame the user can read 406to be used for the kernel, If not, there is a frame the user can read
337and the following flags apply: 407and the following flags apply:
338 408
409+++ Capture process:
339 from include/linux/if_packet.h 410 from include/linux/if_packet.h
340 411
341 #define TP_STATUS_COPY 2 412 #define TP_STATUS_COPY 2
@@ -391,6 +462,37 @@ packets are in the ring:
391It doesn't incur in a race condition to first check the status value and 462It doesn't incur in a race condition to first check the status value and
392then poll for frames. 463then poll for frames.
393 464
465
466++ Transmission process
467Those defines are also used for transmission:
468
469 #define TP_STATUS_AVAILABLE 0 // Frame is available
470 #define TP_STATUS_SEND_REQUEST 1 // Frame will be sent on next send()
471 #define TP_STATUS_SENDING 2 // Frame is currently in transmission
472 #define TP_STATUS_WRONG_FORMAT 4 // Frame format is not correct
473
474First, the kernel initializes all frames to TP_STATUS_AVAILABLE. To send a
475packet, the user fills a data buffer of an available frame, sets tp_len to
476current data buffer size and sets its status field to TP_STATUS_SEND_REQUEST.
477This can be done on multiple frames. Once the user is ready to transmit, it
478calls send(). Then all buffers with status equal to TP_STATUS_SEND_REQUEST are
479forwarded to the network device. The kernel updates each status of sent
480frames with TP_STATUS_SENDING until the end of transfer.
481At the end of each transfer, buffer status returns to TP_STATUS_AVAILABLE.
482
483 header->tp_len = in_i_size;
484 header->tp_status = TP_STATUS_SEND_REQUEST;
485 retval = send(this->socket, NULL, 0, 0);
486
487The user can also use poll() to check if a buffer is available:
488(status == TP_STATUS_SENDING)
489
490 struct pollfd pfd;
491 pfd.fd = fd;
492 pfd.revents = 0;
493 pfd.events = POLLOUT;
494 retval = poll(&pfd, 1, timeout);
495
394-------------------------------------------------------------------------------- 496--------------------------------------------------------------------------------
395+ THANKS 497+ THANKS
396-------------------------------------------------------------------------------- 498--------------------------------------------------------------------------------
diff --git a/Documentation/networking/phonet.txt b/Documentation/networking/phonet.txt
index 6a07e45d4a93..6e8ce09f9c73 100644
--- a/Documentation/networking/phonet.txt
+++ b/Documentation/networking/phonet.txt
@@ -36,7 +36,7 @@ Phonet packets have a common header as follows:
36On Linux, the link-layer header includes the pn_media byte (see below). 36On Linux, the link-layer header includes the pn_media byte (see below).
37The next 7 bytes are part of the network-layer header. 37The next 7 bytes are part of the network-layer header.
38 38
39The device ID is split: the 6 higher-order bits consitute the device 39The device ID is split: the 6 higher-order bits constitute the device
40address, while the 2 lower-order bits are used for multiplexing, as are 40address, while the 2 lower-order bits are used for multiplexing, as are
41the 8-bit object identifiers. As such, Phonet can be considered as a 41the 8-bit object identifiers. As such, Phonet can be considered as a
42network layer with 6 bits of address space and 10 bits for transport 42network layer with 6 bits of address space and 10 bits for transport
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt
index dcf31648414a..eaa1a25946c1 100644
--- a/Documentation/networking/regulatory.txt
+++ b/Documentation/networking/regulatory.txt
@@ -89,7 +89,7 @@ added to this document when its support is enabled.
89Device drivers who provide their own built regulatory domain 89Device drivers who provide their own built regulatory domain
90do not need a callback as the channels registered by them are 90do not need a callback as the channels registered by them are
91the only ones that will be allowed and therefore *additional* 91the only ones that will be allowed and therefore *additional*
92cannels cannot be enabled. 92channels cannot be enabled.
93 93
94Example code - drivers hinting an alpha2: 94Example code - drivers hinting an alpha2:
95------------------------------------------ 95------------------------------------------
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 421e7d00ffd0..c9abbd86bc18 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -75,9 +75,6 @@ may need to apply in domain-specific ways to their devices:
75struct bus_type { 75struct bus_type {
76 ... 76 ...
77 int (*suspend)(struct device *dev, pm_message_t state); 77 int (*suspend)(struct device *dev, pm_message_t state);
78 int (*suspend_late)(struct device *dev, pm_message_t state);
79
80 int (*resume_early)(struct device *dev);
81 int (*resume)(struct device *dev); 78 int (*resume)(struct device *dev);
82}; 79};
83 80
@@ -226,20 +223,7 @@ The phases are seen by driver notifications issued in this order:
226 223
227 This call should handle parts of device suspend logic that require 224 This call should handle parts of device suspend logic that require
228 sleeping. It probably does work to quiesce the device which hasn't 225 sleeping. It probably does work to quiesce the device which hasn't
229 been abstracted into class.suspend() or bus.suspend_late(). 226 been abstracted into class.suspend().
230
231 3 bus.suspend_late(dev, message) is called with IRQs disabled, and
232 with only one CPU active. Until the bus.resume_early() phase
233 completes (see later), IRQs are not enabled again. This method
234 won't be exposed by all busses; for message based busses like USB,
235 I2C, or SPI, device interactions normally require IRQs. This bus
236 call may be morphed into a driver call with bus-specific parameters.
237
238 This call might save low level hardware state that might otherwise
239 be lost in the upcoming low power state, and actually put the
240 device into a low power state ... so that in some cases the device
241 may stay partly usable until this late. This "late" call may also
242 help when coping with hardware that behaves badly.
243 227
244The pm_message_t parameter is currently used to refine those semantics 228The pm_message_t parameter is currently used to refine those semantics
245(described later). 229(described later).
@@ -351,19 +335,11 @@ devices processing each phase's calls before the next phase begins.
351 335
352The phases are seen by driver notifications issued in this order: 336The phases are seen by driver notifications issued in this order:
353 337
354 1 bus.resume_early(dev) is called with IRQs disabled, and with 338 1 bus.resume(dev) reverses the effects of bus.suspend(). This may
355 only one CPU active. As with bus.suspend_late(), this method 339 be morphed into a device driver call with bus-specific parameters;
356 won't be supported on busses that require IRQs in order to 340 implementations may sleep.
357 interact with devices.
358
359 This reverses the effects of bus.suspend_late().
360
361 2 bus.resume(dev) is called next. This may be morphed into a device
362 driver call with bus-specific parameters; implementations may sleep.
363
364 This reverses the effects of bus.suspend().
365 341
366 3 class.resume(dev) is called for devices associated with a class 342 2 class.resume(dev) is called for devices associated with a class
367 that has such a method. Implementations may sleep. 343 that has such a method. Implementations may sleep.
368 344
369 This reverses the effects of class.suspend(), and would usually 345 This reverses the effects of class.suspend(), and would usually
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
index 82b7a43aadba..5f83fd24ea84 100644
--- a/Documentation/power/regulator/consumer.txt
+++ b/Documentation/power/regulator/consumer.txt
@@ -178,5 +178,5 @@ Consumers can uregister interest by calling :-
178int regulator_unregister_notifier(struct regulator *regulator, 178int regulator_unregister_notifier(struct regulator *regulator,
179 struct notifier_block *nb); 179 struct notifier_block *nb);
180 180
181Regulators use the kernel notifier framework to send event to thier interested 181Regulators use the kernel notifier framework to send event to their interested
182consumers. 182consumers.
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
index bdcb332bd7fb..0cded696ca01 100644
--- a/Documentation/power/regulator/overview.txt
+++ b/Documentation/power/regulator/overview.txt
@@ -119,7 +119,7 @@ Some terms used in this document:-
119 battery power, USB power) 119 battery power, USB power)
120 120
121 Regulator Domains: is the new current limit within the 121 Regulator Domains: is the new current limit within the
122 regulator operating parameters for input/ouput voltage. 122 regulator operating parameters for input/output voltage.
123 123
124 If the regulator request passes all the constraint tests 124 If the regulator request passes all the constraint tests
125 then the new regulator value is applied. 125 then the new regulator value is applied.
diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt
index 2ebdc6091ce1..514b94fc931e 100644
--- a/Documentation/power/s2ram.txt
+++ b/Documentation/power/s2ram.txt
@@ -63,7 +63,7 @@ hardware during resume operations where a value can be set that will
63survive a reboot. 63survive a reboot.
64 64
65Consequence is that after a resume (even if it is successful) your system 65Consequence is that after a resume (even if it is successful) your system
66clock will have a value corresponding to the magic mumber instead of the 66clock will have a value corresponding to the magic number instead of the
67correct date/time! It is therefore advisable to use a program like ntp-date 67correct date/time! It is therefore advisable to use a program like ntp-date
68or rdate to reset the correct date/time from an external time source when 68or rdate to reset the correct date/time from an external time source when
69using this trace option. 69using this trace option.
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
index 7b99636564c8..b967cd9137d6 100644
--- a/Documentation/power/userland-swsusp.txt
+++ b/Documentation/power/userland-swsusp.txt
@@ -109,7 +109,7 @@ unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
109still frozen when the device is being closed). 109still frozen when the device is being closed).
110 110
111Currently it is assumed that the userland utilities reading/writing the 111Currently it is assumed that the userland utilities reading/writing the
112snapshot image from/to the kernel will use a swap parition, called the resume 112snapshot image from/to the kernel will use a swap partition, called the resume
113partition, or a swap file as storage space (if a swap file is used, the resume 113partition, or a swap file as storage space (if a swap file is used, the resume
114partition is the partition that holds this file). However, this is not really 114partition is the partition that holds this file). However, this is not really
115required, as they can use, for example, a special (blank) suspend partition or 115required, as they can use, for example, a special (blank) suspend partition or
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index d16b7a1c3793..8d999d862d0e 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -1356,7 +1356,7 @@ platforms are moved over to use the flattened-device-tree model.
1356 - phy-map : 1 cell, optional, bitmap of addresses to probe the PHY 1356 - phy-map : 1 cell, optional, bitmap of addresses to probe the PHY
1357 for, used if phy-address is absent. bit 0x00000001 is 1357 for, used if phy-address is absent. bit 0x00000001 is
1358 MDIO address 0. 1358 MDIO address 0.
1359 For Axon it can be absent, thouugh my current driver 1359 For Axon it can be absent, though my current driver
1360 doesn't handle phy-address yet so for now, keep 1360 doesn't handle phy-address yet so for now, keep
1361 0x00ffffff in it. 1361 0x00ffffff in it.
1362 - rx-fifo-size-gige : 1 cell, Rx fifo size in bytes for 1000 Mb/sec 1362 - rx-fifo-size-gige : 1 cell, Rx fifo size in bytes for 1000 Mb/sec
@@ -1438,7 +1438,7 @@ platforms are moved over to use the flattened-device-tree model.
1438 1438
1439 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use 1439 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use
1440 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range 1440 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range
1441 of standard device types (network, serial, etc.) and miscellanious 1441 of standard device types (network, serial, etc.) and miscellaneous
1442 devices (gpio, LCD, spi, etc). Also, since these devices are 1442 devices (gpio, LCD, spi, etc). Also, since these devices are
1443 implemented within the fpga fabric every instance of the device can be 1443 implemented within the fpga fabric every instance of the device can be
1444 synthesised with different options that change the behaviour. 1444 synthesised with different options that change the behaviour.
diff --git a/Documentation/powerpc/dts-bindings/can/sja1000.txt b/Documentation/powerpc/dts-bindings/can/sja1000.txt
new file mode 100644
index 000000000000..d6d209ded937
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/can/sja1000.txt
@@ -0,0 +1,53 @@
1Memory mapped SJA1000 CAN controller from NXP (formerly Philips)
2
3Required properties:
4
5- compatible : should be "nxp,sja1000".
6
7- reg : should specify the chip select, address offset and size required
8 to map the registers of the SJA1000. The size is usually 0x80.
9
10- interrupts: property with a value describing the interrupt source
11 (number and sensitivity) required for the SJA1000.
12
13Optional properties:
14
15- nxp,external-clock-frequency : Frequency of the external oscillator
16 clock in Hz. Note that the internal clock frequency used by the
17 SJA1000 is half of that value. If not specified, a default value
18 of 16000000 (16 MHz) is used.
19
20- nxp,tx-output-mode : operation mode of the TX output control logic:
21 <0x0> : bi-phase output mode
22 <0x1> : normal output mode (default)
23 <0x2> : test output mode
24 <0x3> : clock output mode
25
26- nxp,tx-output-config : TX output pin configuration:
27 <0x01> : TX0 invert
28 <0x02> : TX0 pull-down (default)
29 <0x04> : TX0 pull-up
30 <0x06> : TX0 push-pull
31 <0x08> : TX1 invert
32 <0x10> : TX1 pull-down
33 <0x20> : TX1 pull-up
34 <0x30> : TX1 push-pull
35
36- nxp,clock-out-frequency : clock frequency in Hz on the CLKOUT pin.
37 If not specified or if the specified value is 0, the CLKOUT pin
38 will be disabled.
39
40- nxp,no-comparator-bypass : Allows disabling the CAN input comparator.
41
42For further information, please have a look at the SJA1000 data sheet.
43
44Examples:
45
46can@3,100 {
47 compatible = "nxp,sja1000";
48 reg = <3 0x100 0x80>;
49 interrupts = <2 0>;
50 interrupt-parent = <&mpic>;
51 nxp,external-clock-frequency = <16000000>;
52};
53
diff --git a/Documentation/powerpc/dts-bindings/ecm.txt b/Documentation/powerpc/dts-bindings/ecm.txt
new file mode 100644
index 000000000000..f514f29c67d6
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/ecm.txt
@@ -0,0 +1,64 @@
1=====================================================================
2E500 LAW & Coherency Module Device Tree Binding
3Copyright (C) 2009 Freescale Semiconductor Inc.
4=====================================================================
5
6Local Access Window (LAW) Node
7
8The LAW node represents the region of CCSR space where local access
9windows are configured. For ECM based devices this is the first 4k
10of CCSR space that includes CCSRBAR, ALTCBAR, ALTCAR, BPTR, and some
11number of local access windows as specified by fsl,num-laws.
12
13PROPERTIES
14
15 - compatible
16 Usage: required
17 Value type: <string>
18 Definition: Must include "fsl,ecm-law"
19
20 - reg
21 Usage: required
22 Value type: <prop-encoded-array>
23 Definition: A standard property. The value specifies the
24 physical address offset and length of the CCSR space
25 registers.
26
27 - fsl,num-laws
28 Usage: required
29 Value type: <u32>
30 Definition: The value specifies the number of local access
31 windows for this device.
32
33=====================================================================
34
35E500 Coherency Module Node
36
37The E500 Coherency Module node represents the region of CCSR space where ECM
38config and error reporting registers exist; this is the second 4k (0x1000)
39of CCSR space.
40
41PROPERTIES
42
43 - compatible
44 Usage: required
45 Value type: <string>
46 Definition: Must include "fsl,CHIP-ecm", "fsl,ecm" where
47 CHIP is the processor (mpc8572, mpc8544, etc.)
48
49 - reg
50 Usage: required
51 Value type: <prop-encoded-array>
52 Definition: A standard property. The value specifies the
53 physical address offset and length of the CCSR space
54 registers.
55
56 - interrupts
57 Usage: required
58 Value type: <prop-encoded-array>
59
60 - interrupt-parent
61 Usage: required
62 Value type: <phandle>
63
64=====================================================================
diff --git a/Documentation/powerpc/dts-bindings/fsl/board.txt b/Documentation/powerpc/dts-bindings/fsl/board.txt
index 6c974d28eeb4..e8b5bc24d0ac 100644
--- a/Documentation/powerpc/dts-bindings/fsl/board.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/board.txt
@@ -38,7 +38,7 @@ Required properities:
38- reg : Should contain the address and the length of the GPIO bank 38- reg : Should contain the address and the length of the GPIO bank
39 register. 39 register.
40- #gpio-cells : Should be two. The first cell is the pin number and the 40- #gpio-cells : Should be two. The first cell is the pin number and the
41 second cell is used to specify optional paramters (currently unused). 41 second cell is used to specify optional parameters (currently unused).
42- gpio-controller : Marks the port as GPIO controller. 42- gpio-controller : Marks the port as GPIO controller.
43 43
44Example: 44Example:
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt
index 088fc471e03a..160c752484b4 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/cpm.txt
@@ -19,7 +19,7 @@ Example:
19 reg = <119c0 30>; 19 reg = <119c0 30>;
20 } 20 }
21 21
22* Properties common to mulitple CPM/QE devices 22* Properties common to multiple CPM/QE devices
23 23
24- fsl,cpm-command : This value is ORed with the opcode and command flag 24- fsl,cpm-command : This value is ORed with the opcode and command flag
25 to specify the device on which a CPM command operates. 25 to specify the device on which a CPM command operates.
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt
index 1815dfede1bc..349f79fd7076 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/gpio.txt
@@ -11,7 +11,7 @@ Required properties:
11 "fsl,cpm1-pario-bank-c", "fsl,cpm1-pario-bank-d", 11 "fsl,cpm1-pario-bank-c", "fsl,cpm1-pario-bank-d",
12 "fsl,cpm1-pario-bank-e", "fsl,cpm2-pario-bank" 12 "fsl,cpm1-pario-bank-e", "fsl,cpm2-pario-bank"
13- #gpio-cells : Should be two. The first cell is the pin number and the 13- #gpio-cells : Should be two. The first cell is the pin number and the
14 second cell is used to specify optional paramters (currently unused). 14 second cell is used to specify optional parameters (currently unused).
15- gpio-controller : Marks the port as GPIO controller. 15- gpio-controller : Marks the port as GPIO controller.
16 16
17Example of three SOC GPIO banks defined as gpio-controller nodes: 17Example of three SOC GPIO banks defined as gpio-controller nodes:
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
index 78790d58dc2c..6e37be1eeb2d 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe.txt
@@ -17,6 +17,9 @@ Required properties:
17- model : precise model of the QE, Can be "QE", "CPM", or "CPM2" 17- model : precise model of the QE, Can be "QE", "CPM", or "CPM2"
18- reg : offset and length of the device registers. 18- reg : offset and length of the device registers.
19- bus-frequency : the clock frequency for QUICC Engine. 19- bus-frequency : the clock frequency for QUICC Engine.
20- fsl,qe-num-riscs: define how many RISC engines the QE has.
21- fsl,qe-num-snums: define how many serial numbers (SNUM) the QE can use for the
22 threads.
20 23
21Recommended properties 24Recommended properties
22- brg-frequency : the internal clock source frequency for baud-rate 25- brg-frequency : the internal clock source frequency for baud-rate
diff --git a/Documentation/powerpc/dts-bindings/fsl/esdhc.txt b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
index 600846557763..3ed3797b5086 100644
--- a/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/esdhc.txt
@@ -5,17 +5,18 @@ for MMC, SD, and SDIO types of memory cards.
5 5
6Required properties: 6Required properties:
7 - compatible : should be 7 - compatible : should be
8 "fsl,<chip>-esdhc", "fsl,mpc8379-esdhc" for MPC83xx processors. 8 "fsl,<chip>-esdhc", "fsl,esdhc"
9 "fsl,<chip>-esdhc", "fsl,mpc8536-esdhc" for MPC85xx processors.
10 - reg : should contain eSDHC registers location and length. 9 - reg : should contain eSDHC registers location and length.
11 - interrupts : should contain eSDHC interrupt. 10 - interrupts : should contain eSDHC interrupt.
12 - interrupt-parent : interrupt source phandle. 11 - interrupt-parent : interrupt source phandle.
13 - clock-frequency : specifies eSDHC base clock frequency. 12 - clock-frequency : specifies eSDHC base clock frequency.
13 - sdhci,1-bit-only : (optional) specifies that a controller can
14 only handle 1-bit data transfers.
14 15
15Example: 16Example:
16 17
17sdhci@2e000 { 18sdhci@2e000 {
18 compatible = "fsl,mpc8378-esdhc", "fsl,mpc8379-esdhc"; 19 compatible = "fsl,mpc8378-esdhc", "fsl,esdhc";
19 reg = <0x2e000 0x1000>; 20 reg = <0x2e000 0x1000>;
20 interrupts = <42 0x8>; 21 interrupts = <42 0x8>;
21 interrupt-parent = <&ipic>; 22 interrupt-parent = <&ipic>;
diff --git a/Documentation/powerpc/dts-bindings/fsl/mcm.txt b/Documentation/powerpc/dts-bindings/fsl/mcm.txt
new file mode 100644
index 000000000000..4ceda9b3b413
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/fsl/mcm.txt
@@ -0,0 +1,64 @@
1=====================================================================
2MPX LAW & Coherency Module Device Tree Binding
3Copyright (C) 2009 Freescale Semiconductor Inc.
4=====================================================================
5
6Local Access Window (LAW) Node
7
8The LAW node represents the region of CCSR space where local access
9windows are configured. For MCM based devices this is the first 4k
10of CCSR space that includes CCSRBAR, ALTCBAR, ALTCAR, BPTR, and some
11number of local access windows as specified by fsl,num-laws.
12
13PROPERTIES
14
15 - compatible
16 Usage: required
17 Value type: <string>
18 Definition: Must include "fsl,mcm-law"
19
20 - reg
21 Usage: required
22 Value type: <prop-encoded-array>
23 Definition: A standard property. The value specifies the
24 physical address offset and length of the CCSR space
25 registers.
26
27 - fsl,num-laws
28 Usage: required
29 Value type: <u32>
30 Definition: The value specifies the number of local access
31 windows for this device.
32
33=====================================================================
34
35MPX Coherency Module Node
36
37The MPX Coherency Module node represents the region of CCSR space where MCM
38config and error reporting registers exist; this is the second 4k (0x1000)
39of CCSR space.
40
41PROPERTIES
42
43 - compatible
44 Usage: required
45 Value type: <string>
46 Definition: Must include "fsl,CHIP-mcm", "fsl,mcm" where
47 CHIP is the processor (mpc8641, mpc8610, etc.)
48
49 - reg
50 Usage: required
51 Value type: <prop-encoded-array>
52 Definition: A standard property. The value specifies the
53 physical address offset and length of the CCSR space
54 registers.
55
56 - interrupts
57 Usage: required
58 Value type: <prop-encoded-array>
59
60 - interrupt-parent
61 Usage: required
62 Value type: <phandle>
63
64=====================================================================
diff --git a/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt b/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt
index b26b91992c55..bcc30bac6831 100644
--- a/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/msi-pic.txt
@@ -1,6 +1,6 @@
1* Freescale MSI interrupt controller 1* Freescale MSI interrupt controller
2 2
3Reguired properities: 3Required properties:
4- compatible : compatible list, contains 2 entries, 4- compatible : compatible list, contains 2 entries,
5 first is "fsl,CHIP-msi", where CHIP is the processor(mpc8610, mpc8572, 5 first is "fsl,CHIP-msi", where CHIP is the processor(mpc8610, mpc8572,
6 etc.) and the second is "fsl,mpic-msi" or "fsl,ipic-msi" depending on 6 etc.) and the second is "fsl,mpic-msi" or "fsl,ipic-msi" depending on
diff --git a/Documentation/powerpc/dts-bindings/fsl/pmc.txt b/Documentation/powerpc/dts-bindings/fsl/pmc.txt
index 02f6f43ee1b7..07256b7ffcaa 100644
--- a/Documentation/powerpc/dts-bindings/fsl/pmc.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/pmc.txt
@@ -15,8 +15,8 @@ Properties:
15 compatible; all statements below that apply to "fsl,mpc8548-pmc" also 15 compatible; all statements below that apply to "fsl,mpc8548-pmc" also
16 apply to "fsl,mpc8641d-pmc". 16 apply to "fsl,mpc8641d-pmc".
17 17
18 Compatibility does not include bit assigments in SCCR/PMCDR/DEVDISR; these 18 Compatibility does not include bit assignments in SCCR/PMCDR/DEVDISR; these
19 bit assigments are indicated via the sleep specifier in each device's 19 bit assignments are indicated via the sleep specifier in each device's
20 sleep property. 20 sleep property.
21 21
22- reg: For devices compatible with "fsl,mpc8349-pmc", the first resource 22- reg: For devices compatible with "fsl,mpc8349-pmc", the first resource
diff --git a/Documentation/powerpc/qe_firmware.txt b/Documentation/powerpc/qe_firmware.txt
index 06da4d4b44f9..2031ddb33d09 100644
--- a/Documentation/powerpc/qe_firmware.txt
+++ b/Documentation/powerpc/qe_firmware.txt
@@ -225,7 +225,7 @@ For example, to match the 8323, revision 1.0:
225 soc.major = 1 225 soc.major = 1
226 soc.minor = 0 226 soc.minor = 0
227 227
228'padding' is neccessary for structure alignment. This field ensures that the 228'padding' is necessary for structure alignment. This field ensures that the
229'extended_modes' field is aligned on a 64-bit boundary. 229'extended_modes' field is aligned on a 64-bit boundary.
230 230
231'extended_modes' is a bitfield that defines special functionality which has an 231'extended_modes' is a bitfield that defines special functionality which has an
diff --git a/Documentation/pps/pps.txt b/Documentation/pps/pps.txt
new file mode 100644
index 000000000000..125f4ab48998
--- /dev/null
+++ b/Documentation/pps/pps.txt
@@ -0,0 +1,172 @@
1
2 PPS - Pulse Per Second
3 ----------------------
4
5(C) Copyright 2007 Rodolfo Giometti <giometti@enneenne.com>
6
7This program is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 2 of the License, or
10(at your option) any later version.
11
12This program is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
17
18
19Overview
20--------
21
22LinuxPPS provides a programming interface (API) to define in the
23system several PPS sources.
24
25PPS means "pulse per second" and a PPS source is just a device which
26provides a high precision signal each second so that an application
27can use it to adjust system clock time.
28
29A PPS source can be connected to a serial port (usually to the Data
30Carrier Detect pin) or to a parallel port (ACK-pin) or to a special
31CPU's GPIOs (this is the common case in embedded systems) but in each
32case when a new pulse arrives the system must apply to it a timestamp
33and record it for userland.
34
35Common use is the combination of the NTPD as userland program, with a
36GPS receiver as PPS source, to obtain a wallclock-time with
37sub-millisecond synchronisation to UTC.
38
39
40RFC considerations
41------------------
42
43While implementing a PPS API as RFC 2783 defines and using an embedded
44CPU GPIO-Pin as physical link to the signal, I encountered a deeper
45problem:
46
47 At startup it needs a file descriptor as argument for the function
48 time_pps_create().
49
50This implies that the source has a /dev/... entry. This assumption is
51ok for the serial and parallel port, where you can do something
52useful besides(!) the gathering of timestamps as it is the central
53task for a PPS-API. But this assumption does not work for a single
54purpose GPIO line. In this case even basic file-related functionality
55(like read() and write()) makes no sense at all and should not be a
56precondition for the use of a PPS-API.
57
58The problem can be simply solved if you consider that a PPS source is
59not always connected with a GPS data source.
60
61So your programs should check if the GPS data source (the serial port
62for instance) is a PPS source too, and if not they should provide the
63possibility to open another device as PPS source.
64
65In LinuxPPS the PPS sources are simply char devices usually mapped
66into files /dev/pps0, /dev/pps1, etc..
67
68
69Coding example
70--------------
71
72To register a PPS source into the kernel you should define a struct
73pps_source_info as follows:
74
75 static struct pps_source_info pps_ktimer_info = {
76 .name = "ktimer",
77 .path = "",
78 .mode = PPS_CAPTUREASSERT | PPS_OFFSETASSERT | \
79 PPS_ECHOASSERT | \
80 PPS_CANWAIT | PPS_TSFMT_TSPEC,
81 .echo = pps_ktimer_echo,
82 .owner = THIS_MODULE,
83 };
84
85and then calling the function pps_register_source() in your
86intialization routine as follows:
87
88 source = pps_register_source(&pps_ktimer_info,
89 PPS_CAPTUREASSERT | PPS_OFFSETASSERT);
90
91The pps_register_source() prototype is:
92
93 int pps_register_source(struct pps_source_info *info, int default_params)
94
95where "info" is a pointer to a structure that describes a particular
96PPS source, "default_params" tells the system what the initial default
97parameters for the device should be (it is obvious that these parameters
98must be a subset of ones defined in the struct
99pps_source_info which describe the capabilities of the driver).
100
101Once you have registered a new PPS source into the system you can
102signal an assert event (for example in the interrupt handler routine)
103just using:
104
105 pps_event(source, &ts, PPS_CAPTUREASSERT, ptr)
106
107where "ts" is the event's timestamp.
108
109The same function may also run the defined echo function
110(pps_ktimer_echo(), passing to it the "ptr" pointer) if the user
111asked for that... etc..
112
113Please see the file drivers/pps/clients/ktimer.c for example code.
114
115
116SYSFS support
117-------------
118
119If the SYSFS filesystem is enabled in the kernel it provides a new class:
120
121 $ ls /sys/class/pps/
122 pps0/ pps1/ pps2/
123
124Every directory is the ID of a PPS source defined in the system and
125inside you find several files:
126
127 $ ls /sys/class/pps/pps0/
128 assert clear echo mode name path subsystem@ uevent
129
130Inside each "assert" and "clear" file you can find the timestamp and a
131sequence number:
132
133 $ cat /sys/class/pps/pps0/assert
134 1170026870.983207967#8
135
136Where before the "#" is the timestamp in seconds; after it is the
137sequence number. Other files are:
138
139* echo: reports if the PPS source has an echo function or not;
140
141* mode: reports available PPS functioning modes;
142
143* name: reports the PPS source's name;
144
145* path: reports the PPS source's device path, that is the device the
146 PPS source is connected to (if it exists).
147
148
149Testing the PPS support
150-----------------------
151
152In order to test the PPS support even without specific hardware you can use
153the ktimer driver (see the client subsection in the PPS configuration menu)
154and the userland tools provided in the Documentation/pps/ directory.
155
156Once you have enabled the compilation of ktimer just modprobe it (if
157not statically compiled):
158
159 # modprobe ktimer
160
161and then run ppstest as follows:
162
163 $ ./ppstest /dev/pps0
164 trying PPS source "/dev/pps0"
165 found PPS source "/dev/pps0"
166 ok, found 1 source(s), now start fetching data...
167 source 0 - assert 1186592699.388832443, sequence: 364 - clear 0.000000000, sequence: 0
168 source 0 - assert 1186592700.388931295, sequence: 365 - clear 0.000000000, sequence: 0
169 source 0 - assert 1186592701.389032765, sequence: 366 - clear 0.000000000, sequence: 0
170
171Please, note that to compile userland programs you need the file timepps.h
172(see Documentation/pps/).
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index 7224459b469e..aae8355d3166 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -131,8 +131,8 @@ Example:
131 } 131 }
132 132
133 /* Add new node and rebalance tree. */ 133 /* Add new node and rebalance tree. */
134 rb_link_node(data->node, parent, new); 134 rb_link_node(&data->node, parent, new);
135 rb_insert_color(data->node, root); 135 rb_insert_color(&data->node, root);
136 136
137 return TRUE; 137 return TRUE;
138 } 138 }
@@ -146,10 +146,10 @@ To remove an existing node from a tree, call:
146 146
147Example: 147Example:
148 148
149 struct mytype *data = mysearch(mytree, "walrus"); 149 struct mytype *data = mysearch(&mytree, "walrus");
150 150
151 if (data) { 151 if (data) {
152 rb_erase(data->node, mytree); 152 rb_erase(&data->node, &mytree);
153 myfree(data); 153 myfree(data);
154 } 154 }
155 155
@@ -188,5 +188,5 @@ Example:
188 188
189 struct rb_node *node; 189 struct rb_node *node;
190 for (node = rb_first(&mytree); node; node = rb_next(node)) 190 for (node = rb_first(&mytree); node; node = rb_next(node))
191 printk("key=%s\n", rb_entry(node, int, keystring)); 191 printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
192 192
diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
index 4d3ee317a4a3..b4860509c319 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/rfkill.txt
@@ -1,575 +1,139 @@
1rfkill - RF switch subsystem support 1rfkill - RF kill switch support
2==================================== 2===============================
3 3
41 Introduction 41. Introduction
52 Implementation details 52. Implementation details
63 Kernel driver guidelines 63. Kernel API
73.1 wireless device drivers 74. Userspace support
83.2 platform/switch drivers
93.3 input device drivers
104 Kernel API
115 Userspace support
12 8
13 9
141. Introduction: 101. Introduction
15 11
16The rfkill switch subsystem exists to add a generic interface to circuitry that 12The rfkill subsystem provides a generic interface to disabling any radio
17can enable or disable the signal output of a wireless *transmitter* of any 13transmitter in the system. When a transmitter is blocked, it shall not
18type. By far, the most common use is to disable radio-frequency transmitters. 14radiate any power.
19 15
20Note that disabling the signal output means that the transmitter is to be 16The subsystem also provides the ability to react to button presses and
21made to not emit any energy when "blocked". rfkill is not about blocking data 17disable all transmitters of a certain type (or all). This is intended for
22transmissions, it is about blocking energy emission. 18situations where transmitters need to be turned off, for example on
19aircraft.
23 20
24The rfkill subsystem offers support for keys and switches often found on 21The rfkill subsystem has a concept of "hard" and "soft" block, which
25laptops to enable wireless devices like WiFi and Bluetooth, so that these keys 22differ little in their meaning (block == transmitters off) but rather in
26and switches actually perform an action in all wireless devices of a given type 23whether they can be changed or not:
27attached to the system. 24 - hard block: read-only radio block that cannot be overridden by software
25 - soft block: writable radio block (need not be readable) that is set by
26 the system software.
28 27
29The buttons to enable and disable the wireless transmitters are important in
30situations where the user is for example using his laptop on a location where
31radio-frequency transmitters _must_ be disabled (e.g. airplanes).
32 28
33Because of this requirement, userspace support for the keys should not be made 292. Implementation details
34mandatory. Because userspace might want to perform some additional smarter
35tasks when the key is pressed, rfkill provides userspace the possibility to
36take over the task to handle the key events.
37 30
38=============================================================================== 31The rfkill subsystem is composed of three main components:
392: Implementation details 32 * the rfkill core,
33 * the deprecated rfkill-input module (an input layer handler, being
34 replaced by userspace policy code) and
35 * the rfkill drivers.
40 36
41The rfkill subsystem is composed of various components: the rfkill class, the 37The rfkill core provides API for kernel drivers to register their radio
42rfkill-input module (an input layer handler), and some specific input layer 38transmitter with the kernel, methods for turning it on and off, and letting
43events. 39the system know about hardware-disabled states that may be implemented on
40the device.
44 41
45The rfkill class provides kernel drivers with an interface that allows them to 42The rfkill core code also notifies userspace of state changes, and provides
46know when they should enable or disable a wireless network device transmitter. 43ways for userspace to query the current states. See the "Userspace support"
47This is enabled by the CONFIG_RFKILL Kconfig option. 44section below.
48 45
49The rfkill class support makes sure userspace will be notified of all state 46When the device is hard-blocked (either by a call to rfkill_set_hw_state()
50changes on rfkill devices through uevents. It provides a notification chain 47or from query_hw_block) set_block() will be invoked for additional software
51for interested parties in the kernel to also get notified of rfkill state 48block, but drivers can ignore the method call since they can use the return
52changes in other drivers. It creates several sysfs entries which can be used 49value of the function rfkill_set_hw_state() to sync the software state
53by userspace. See section "Userspace support". 50instead of keeping track of calls to set_block(). In fact, drivers should
54 51use the return value of rfkill_set_hw_state() unless the hardware actually
55The rfkill-input module provides the kernel with the ability to implement a 52keeps track of soft and hard block separately.
56basic response when the user presses a key or button (or toggles a switch)
57related to rfkill functionality. It is an in-kernel implementation of default
58policy of reacting to rfkill-related input events and neither mandatory nor
59required for wireless drivers to operate. It is enabled by the
60CONFIG_RFKILL_INPUT Kconfig option.
61
62rfkill-input is a rfkill-related events input layer handler. This handler will
63listen to all rfkill key events and will change the rfkill state of the
64wireless devices accordingly. With this option enabled userspace could either
65do nothing or simply perform monitoring tasks.
66
67The rfkill-input module also provides EPO (emergency power-off) functionality
68for all wireless transmitters. This function cannot be overridden, and it is
69always active. rfkill EPO is related to *_RFKILL_ALL input layer events.
70
71
72Important terms for the rfkill subsystem:
73
74In order to avoid confusion, we avoid the term "switch" in rfkill when it is
75referring to an electronic control circuit that enables or disables a
76transmitter. We reserve it for the physical device a human manipulates
77(which is an input device, by the way):
78
79rfkill switch:
80
81 A physical device a human manipulates. Its state can be perceived by
82 the kernel either directly (through a GPIO pin, ACPI GPE) or by its
83 effect on a rfkill line of a wireless device.
84
85rfkill controller:
86
87 A hardware circuit that controls the state of a rfkill line, which a
88 kernel driver can interact with *to modify* that state (i.e. it has
89 either write-only or read/write access).
90
91rfkill line:
92
93 An input channel (hardware or software) of a wireless device, which
94 causes a wireless transmitter to stop emitting energy (BLOCK) when it
95 is active. Point of view is extremely important here: rfkill lines are
96 always seen from the PoV of a wireless device (and its driver).
97
98soft rfkill line/software rfkill line:
99
100 A rfkill line the wireless device driver can directly change the state
101 of. Related to rfkill_state RFKILL_STATE_SOFT_BLOCKED.
102
103hard rfkill line/hardware rfkill line:
104
105 A rfkill line that works fully in hardware or firmware, and that cannot
106 be overridden by the kernel driver. The hardware device or the
107 firmware just exports its status to the driver, but it is read-only.
108 Related to rfkill_state RFKILL_STATE_HARD_BLOCKED.
109
110The enum rfkill_state describes the rfkill state of a transmitter:
111
112When a rfkill line or rfkill controller is in the RFKILL_STATE_UNBLOCKED state,
113the wireless transmitter (radio TX circuit for example) is *enabled*. When
114it is in the RFKILL_STATE_SOFT_BLOCKED or RFKILL_STATE_HARD_BLOCKED state, the
115wireless transmitter is to be *blocked* from operating.
116
117RFKILL_STATE_SOFT_BLOCKED indicates that a call to toggle_radio() can change
118that state. RFKILL_STATE_HARD_BLOCKED indicates that a call to toggle_radio()
119will not be able to change the state and will return with a suitable error if
120attempts are made to set the state to RFKILL_STATE_UNBLOCKED.
121
122RFKILL_STATE_HARD_BLOCKED is used by drivers to signal that the device is
123locked in the BLOCKED state by a hardwire rfkill line (typically an input pin
124that, when active, forces the transmitter to be disabled) which the driver
125CANNOT override.
126
127Full rfkill functionality requires two different subsystems to cooperate: the
128input layer and the rfkill class. The input layer issues *commands* to the
129entire system requesting that devices registered to the rfkill class change
130state. The way this interaction happens is not complex, but it is not obvious
131either:
132
133Kernel Input layer:
134
135 * Generates KEY_WWAN, KEY_WLAN, KEY_BLUETOOTH, SW_RFKILL_ALL, and
136 other such events when the user presses certain keys, buttons, or
137 toggles certain physical switches.
138
139 THE INPUT LAYER IS NEVER USED TO PROPAGATE STATUS, NOTIFICATIONS OR THE
140 KIND OF STUFF AN ON-SCREEN-DISPLAY APPLICATION WOULD REPORT. It is
141 used to issue *commands* for the system to change behaviour, and these
142 commands may or may not be carried out by some kernel driver or
143 userspace application. It follows that doing user feedback based only
144 on input events is broken, as there is no guarantee that an input event
145 will be acted upon.
146
147 Most wireless communication device drivers implementing rfkill
148 functionality MUST NOT generate these events, and have no reason to
149 register themselves with the input layer. Doing otherwise is a common
150 misconception. There is an API to propagate rfkill status change
151 information, and it is NOT the input layer.
152
153rfkill class:
154
155 * Calls a hook in a driver to effectively change the wireless
156 transmitter state;
157 * Keeps track of the wireless transmitter state (with help from
158 the driver);
159 * Generates userspace notifications (uevents) and a call to a
160 notification chain (kernel) when there is a wireless transmitter
161 state change;
162 * Connects a wireless communications driver with the common rfkill
163 control system, which, for example, allows actions such as
164 "switch all bluetooth devices offline" to be carried out by
165 userspace or by rfkill-input.
166
167 THE RFKILL CLASS NEVER ISSUES INPUT EVENTS. THE RFKILL CLASS DOES
168 NOT LISTEN TO INPUT EVENTS. NO DRIVER USING THE RFKILL CLASS SHALL
169 EVER LISTEN TO, OR ACT ON RFKILL INPUT EVENTS. Doing otherwise is
170 a layering violation.
171
172 Most wireless data communication drivers in the kernel have just to
173 implement the rfkill class API to work properly. Interfacing to the
174 input layer is not often required (and is very often a *bug*) on
175 wireless drivers.
176
177 Platform drivers often have to attach to the input layer to *issue*
178 (but never to listen to) rfkill events for rfkill switches, and also to
179 the rfkill class to export a control interface for the platform rfkill
180 controllers to the rfkill subsystem. This does NOT mean the rfkill
181 switch is attached to a rfkill class (doing so is almost always wrong).
182 It just means the same kernel module is the driver for different
183 devices (rfkill switches and rfkill controllers).
184
185
186Userspace input handlers (uevents) or kernel input handlers (rfkill-input):
187
188 * Implements the policy of what should happen when one of the input
189 layer events related to rfkill operation is received.
190 * Uses the sysfs interface (userspace) or private rfkill API calls
191 to tell the devices registered with the rfkill class to change
192 their state (i.e. translates the input layer event into real
193 action).
194
195 * rfkill-input implements EPO by handling EV_SW SW_RFKILL_ALL 0
196 (power off all transmitters) in a special way: it ignores any
197 overrides and local state cache and forces all transmitters to the
198 RFKILL_STATE_SOFT_BLOCKED state (including those which are already
199 supposed to be BLOCKED).
200 * rfkill EPO will remain active until rfkill-input receives an
201 EV_SW SW_RFKILL_ALL 1 event. While the EPO is active, transmitters
202 are locked in the blocked state (rfkill will refuse to unblock them).
203 * rfkill-input implements different policies that the user can
204 select for handling EV_SW SW_RFKILL_ALL 1. It will unlock rfkill,
205 and either do nothing (leave transmitters blocked, but now unlocked),
206 restore the transmitters to their state before the EPO, or unblock
207 them all.
208
209Userspace uevent handler or kernel platform-specific drivers hooked to the
210rfkill notifier chain:
211
212 * Taps into the rfkill notifier chain or to KOBJ_CHANGE uevents,
213 in order to know when a device that is registered with the rfkill
214 class changes state;
215 * Issues feedback notifications to the user;
216 * In the rare platforms where this is required, synthesizes an input
217 event to command all *OTHER* rfkill devices to also change their
218 states when a specific rfkill device changes state.
219
220
221===============================================================================
2223: Kernel driver guidelines
223
224Remember: point-of-view is everything for a driver that connects to the rfkill
225subsystem. All the details below must be measured/perceived from the point of
226view of the specific driver being modified.
227
228The first thing one needs to know is whether his driver should be talking to
229the rfkill class or to the input layer. In rare cases (platform drivers), it
230could happen that you need to do both, as platform drivers often handle a
231variety of devices in the same driver.
232
233Do not mistake input devices for rfkill controllers. The only type of "rfkill
234switch" device that is to be registered with the rfkill class are those
235directly controlling the circuits that cause a wireless transmitter to stop
236working (or the software equivalent of them), i.e. what we call a rfkill
237controller. Every other kind of "rfkill switch" is just an input device and
238MUST NOT be registered with the rfkill class.
239
240A driver should register a device with the rfkill class when ALL of the
241following conditions are met (they define a rfkill controller):
242
2431. The device is/controls a data communications wireless transmitter;
244
2452. The kernel can interact with the hardware/firmware to CHANGE the wireless
246 transmitter state (block/unblock TX operation);
247
2483. The transmitter can be made to not emit any energy when "blocked":
249 rfkill is not about blocking data transmissions, it is about blocking
250 energy emission;
251
252A driver should register a device with the input subsystem to issue
253rfkill-related events (KEY_WLAN, KEY_BLUETOOTH, KEY_WWAN, KEY_WIMAX,
254SW_RFKILL_ALL, etc) when ALL of the following conditions are met:
255
2561. It is directly related to some physical device the user interacts with, to
257 command the O.S./firmware/hardware to enable/disable a data communications
258 wireless transmitter.
259
260 Examples of the physical device are: buttons, keys and switches the user
261 will press/touch/slide/switch to enable or disable the wireless
262 communication device.
263
2642. It is NOT slaved to another device, i.e. there is no other device that
265 issues rfkill-related input events in preference to this one.
266
267 Please refer to the corner cases and examples section for more details.
268
269When in doubt, do not issue input events. For drivers that should generate
270input events in some platforms, but not in others (e.g. b43), the best solution
271is to NEVER generate input events in the first place. That work should be
272deferred to a platform-specific kernel module (which will know when to generate
273events through the rfkill notifier chain) or to userspace. This avoids the
274usual maintenance problems with DMI whitelisting.
275 53
276 54
277Corner cases and examples: 553. Kernel API
278====================================
279
2801. If the device is an input device that, because of hardware or firmware,
281causes wireless transmitters to be blocked regardless of the kernel's will, it
282is still just an input device, and NOT to be registered with the rfkill class.
283 56
2842. If the wireless transmitter switch control is read-only, it is an input
285device and not to be registered with the rfkill class (and maybe not to be made
286an input layer event source either, see below).
287 57
2883. If there is some other device driver *closer* to the actual hardware the 58Drivers for radio transmitters normally implement an rfkill driver.
289user interacted with (the button/switch/key) to issue an input event, THAT is
290the device driver that should be issuing input events.
291
292E.g:
293 [RFKILL slider switch] -- [GPIO hardware] -- [WLAN card rf-kill input]
294 (platform driver) (wireless card driver)
295
296The user is closer to the RFKILL slide switch platform driver, so the driver
297which must issue input events is the platform driver looking at the GPIO
298hardware, and NEVER the wireless card driver (which is just a slave). It is
299very likely that there are other leaves than just the WLAN card rf-kill input
300(e.g. a bluetooth card, etc)...
301 59
302On the other hand, some embedded devices do this: 60Platform drivers might implement input devices if the rfkill button is just
61that, a button. If that button influences the hardware then you need to
62implement an rfkill driver instead. This also applies if the platform provides
63a way to turn on/off the transmitter(s).
303 64
304 [RFKILL slider switch] -- [WLAN card rf-kill input] 65For some platforms, it is possible that the hardware state changes during
305 (wireless card driver) 66suspend/hibernation, in which case it will be necessary to update the rfkill
67core with the current state at resume time.
306 68
307In this situation, the wireless card driver *could* register itself as an input 69To create an rfkill driver, driver's Kconfig needs to have
308device and issue rf-kill related input events... but in order to AVOID the need
309for DMI whitelisting, the wireless card driver does NOT do it. Userspace (HAL)
310or a platform driver (that exists only on these embedded devices) will do the
311dirty job of issuing the input events.
312 70
71 depends on RFKILL || !RFKILL
313 72
314COMMON MISTAKES in kernel drivers, related to rfkill: 73to ensure the driver cannot be built-in when rfkill is modular. The !RFKILL
315==================================== 74case allows the driver to be built when rfkill is not configured, in which
75case all rfkill API can still be used but will be provided by static inlines
76which compile to almost nothing.
316 77
3171. NEVER confuse input device keys and buttons with input device switches. 78Calling rfkill_set_hw_state() when a state change happens is required from
79rfkill drivers that control devices that can be hard-blocked unless they also
80assign the poll_hw_block() callback (then the rfkill core will poll the
81device). Don't do this unless you cannot get the event in any other way.
318 82
319 1a. Switches are always set or reset. They report the current state
320 (on position or off position).
321 83
322 1b. Keys and buttons are either in the pressed or not-pressed state, and
323 that's it. A "button" that latches down when you press it, and
324 unlatches when you press it again is in fact a switch as far as input
325 devices go.
326 84
327Add the SW_* events you need for switches, do NOT try to emulate a button using 854. Userspace support
328KEY_* events just because there is no such SW_* event yet. Do NOT try to use,
329for example, KEY_BLUETOOTH when you should be using SW_BLUETOOTH instead.
330 86
3312. Input device switches (sources of EV_SW events) DO store their current state 87The recommended userspace interface to use is /dev/rfkill, which is a misc
332(so you *must* initialize it by issuing a gratuitous input layer event on 88character device that allows userspace to obtain and set the state of rfkill
333driver start-up and also when resuming from sleep), and that state CAN be 89devices and sets of devices. It also notifies userspace about device addition
334queried from userspace through IOCTLs. There is no sysfs interface for this, 90and removal. The API is a simple read/write API that is defined in
335but that doesn't mean you should break things trying to hook it to the rfkill 91linux/rfkill.h, with one ioctl that allows turning off the deprecated input
336class to get a sysfs interface :-) 92handler in the kernel for the transition period.
337 93
3383. Do not issue *_RFKILL_ALL events by default, unless you are sure it is the 94Except for the one ioctl, communication with the kernel is done via read()
339correct event for your switch/button. These events are emergency power-off 95and write() of instances of 'struct rfkill_event'. In this structure, the
340events when they are trying to turn the transmitters off. An example of an 96soft and hard block are properly separated (unlike sysfs, see below) and
341input device which SHOULD generate *_RFKILL_ALL events is the wireless-kill 97userspace is able to get a consistent snapshot of all rfkill devices in the
342switch in a laptop which is NOT a hotkey, but a real sliding/rocker switch. 98system. Also, it is possible to switch all rfkill drivers (or all drivers of
343An example of an input device which SHOULD NOT generate *_RFKILL_ALL events by 99a specified type) into a state which also updates the default state for
344default, is any sort of hot key that is type-specific (e.g. the one for WLAN). 100hotplugged devices.
345 101
102After an application opens /dev/rfkill, it can read the current state of
103all devices, and afterwards can poll the descriptor for hotplug or state
104change events.
346 105
3473.1 Guidelines for wireless device drivers 106Applications must ignore operations (the "op" field) they do not handle,
348------------------------------------------ 107this allows the API to be extended in the future.
349 108
350(in this text, rfkill->foo means the foo field of struct rfkill). 109Additionally, each rfkill device is registered in sysfs and has the
351 110following attributes:
3521. Each independent transmitter in a wireless device (usually there is only one
353transmitter per device) should have a SINGLE rfkill class attached to it.
354
3552. If the device does not have any sort of hardware assistance to allow the
356driver to rfkill the device, the driver should emulate it by taking all actions
357required to silence the transmitter.
358
3593. If it is impossible to silence the transmitter (i.e. it still emits energy,
360even if it is just in brief pulses, when there is no data to transmit and there
361is no hardware support to turn it off) do NOT lie to the users. Do not attach
362it to a rfkill class. The rfkill subsystem does not deal with data
363transmission, it deals with energy emission. If the transmitter is emitting
364energy, it is not blocked in rfkill terms.
365
3664. It doesn't matter if the device has multiple rfkill input lines affecting
367the same transmitter, their combined state is to be exported as a single state
368per transmitter (see rule 1).
369
370This rule exists because users of the rfkill subsystem expect to get (and set,
371when possible) the overall transmitter rfkill state, not of a particular rfkill
372line.
373
3745. The wireless device driver MUST NOT leave the transmitter enabled during
375suspend and hibernation unless:
376
377 5.1. The transmitter has to be enabled for some sort of functionality
378 like wake-on-wireless-packet or autonomous packet forwarding in a mesh
379 network, and that functionality is enabled for this suspend/hibernation
380 cycle.
381
382AND
383
384 5.2. The device was not on a user-requested BLOCKED state before
385 the suspend (i.e. the driver must NOT unblock a device, not even
386 to support wake-on-wireless-packet or remain in the mesh).
387
388In other words, there is absolutely no allowed scenario where a driver can
389automatically take action to unblock a rfkill controller (obviously, this deals
390with scenarios where soft-blocking or both soft and hard blocking is happening.
391Scenarios where hardware rfkill lines are the only ones blocking the
392transmitter are outside of this rule, since the wireless device driver does not
393control its input hardware rfkill lines in the first place).
394
3956. During resume, rfkill will try to restore its previous state.
396
3977. After a rfkill class is suspended, it will *not* call rfkill->toggle_radio
398until it is resumed.
399
400
401Example of a WLAN wireless driver connected to the rfkill subsystem:
402--------------------------------------------------------------------
403
404A certain WLAN card has one input pin that causes it to block the transmitter
405and makes the status of that input pin available (only for reading!) to the
406kernel driver. This is a hard rfkill input line (it cannot be overridden by
407the kernel driver).
408
409The card also has one PCI register that, if manipulated by the driver, causes
410it to block the transmitter. This is a soft rfkill input line.
411
412It has also a thermal protection circuitry that shuts down its transmitter if
413the card overheats, and makes the status of that protection available (only for
414reading!) to the kernel driver. This is also a hard rfkill input line.
415
416If either one of these rfkill lines are active, the transmitter is blocked by
417the hardware and forced offline.
418
419The driver should allocate and attach to its struct device *ONE* instance of
420the rfkill class (there is only one transmitter).
421
422It can implement the get_state() hook, and return RFKILL_STATE_HARD_BLOCKED if
423either one of its two hard rfkill input lines are active. If the two hard
424rfkill lines are inactive, it must return RFKILL_STATE_SOFT_BLOCKED if its soft
425rfkill input line is active. Only if none of the rfkill input lines are
426active, will it return RFKILL_STATE_UNBLOCKED.
427
428Since the device has a hardware rfkill line, it IS subject to state changes
429external to rfkill. Therefore, the driver must make sure that it calls
430rfkill_force_state() to keep the status always up-to-date, and it must do a
431rfkill_force_state() on resume from sleep.
432
433Every time the driver gets a notification from the card that one of its rfkill
434lines changed state (polling might be needed on badly designed cards that don't
435generate interrupts for such events), it recomputes the rfkill state as per
436above, and calls rfkill_force_state() to update it.
437
438The driver should implement the toggle_radio() hook, that:
439
4401. Returns an error if one of the hardware rfkill lines are active, and the
441caller asked for RFKILL_STATE_UNBLOCKED.
442
4432. Activates the soft rfkill line if the caller asked for state
444RFKILL_STATE_SOFT_BLOCKED. It should do this even if one of the hard rfkill
445lines are active, effectively double-blocking the transmitter.
446
4473. Deactivates the soft rfkill line if none of the hardware rfkill lines are
448active and the caller asked for RFKILL_STATE_UNBLOCKED.
449
450===============================================================================
4514: Kernel API
452
453To build a driver with rfkill subsystem support, the driver should depend on
454(or select) the Kconfig symbol RFKILL; it should _not_ depend on RFKILL_INPUT.
455
456The hardware the driver talks to may be write-only (where the current state
457of the hardware is unknown), or read-write (where the hardware can be queried
458about its current state).
459
460The rfkill class will call the get_state hook of a device every time it needs
461to know the *real* current state of the hardware. This can happen often, but
462it does not do any polling, so it is not enough on hardware that is subject
463to state changes outside of the rfkill subsystem.
464
465Therefore, calling rfkill_force_state() when a state change happens is
466mandatory when the device has a hardware rfkill line, or when something else
467like the firmware could cause its state to be changed without going through the
468rfkill class.
469
470Some hardware provides events when its status changes. In these cases, it is
471best for the driver to not provide a get_state hook, and instead register the
472rfkill class *already* with the correct status, and keep it updated using
473rfkill_force_state() when it gets an event from the hardware.
474
475rfkill_force_state() must be used on the device resume handlers to update the
476rfkill status, should there be any chance of the device status changing during
477the sleep.
478
479There is no provision for a statically-allocated rfkill struct. You must
480use rfkill_allocate() to allocate one.
481
482You should:
483 - rfkill_allocate()
484 - modify rfkill fields (flags, name)
485 - modify state to the current hardware state (THIS IS THE ONLY TIME
486 YOU CAN ACCESS state DIRECTLY)
487 - rfkill_register()
488
489The only way to set a device to the RFKILL_STATE_HARD_BLOCKED state is through
490a suitable return of get_state() or through rfkill_force_state().
491
492When a device is in the RFKILL_STATE_HARD_BLOCKED state, the only way to switch
493it to a different state is through a suitable return of get_state() or through
494rfkill_force_state().
495
496If toggle_radio() is called to set a device to state RFKILL_STATE_SOFT_BLOCKED
497when that device is already at the RFKILL_STATE_HARD_BLOCKED state, it should
498not return an error. Instead, it should try to double-block the transmitter,
499so that its state will change from RFKILL_STATE_HARD_BLOCKED to
500RFKILL_STATE_SOFT_BLOCKED should the hardware blocking cease.
501
502Please refer to the source for more documentation.
503
504===============================================================================
5055: Userspace support
506
507rfkill devices issue uevents (with an action of "change"), with the following
508environment variables set:
509
510RFKILL_NAME
511RFKILL_STATE
512RFKILL_TYPE
513
514The ABI for these variables is defined by the sysfs attributes. It is best
515to take a quick look at the source to make sure of the possible values.
516
517It is expected that HAL will trap those, and bridge them to DBUS, etc. These
518events CAN and SHOULD be used to give feedback to the user about the rfkill
519status of the system.
520
521Input devices may issue events that are related to rfkill. These are the
522various KEY_* events and SW_* events supported by rfkill-input.c.
523
524******IMPORTANT******
525When rfkill-input is ACTIVE, userspace is NOT TO CHANGE THE STATE OF AN RFKILL
526SWITCH IN RESPONSE TO AN INPUT EVENT also handled by rfkill-input, unless it
527has set to true the user_claim attribute for that particular switch. This rule
528is *absolute*; do NOT violate it.
529******IMPORTANT******
530
531Userspace must not assume it is the only source of control for rfkill switches.
532Their state CAN and WILL change due to firmware actions, direct user actions,
533and the rfkill-input EPO override for *_RFKILL_ALL.
534
535When rfkill-input is not active, userspace must initiate a rfkill status
536change by writing to the "state" attribute in order for anything to happen.
537
538Take particular care to implement EV_SW SW_RFKILL_ALL properly. When that
539switch is set to OFF, *every* rfkill device *MUST* be immediately put into the
540RFKILL_STATE_SOFT_BLOCKED state, no questions asked.
541
542The following sysfs entries will be created:
543 111
544 name: Name assigned by driver to this key (interface or driver name). 112 name: Name assigned by driver to this key (interface or driver name).
545 type: Name of the key type ("wlan", "bluetooth", etc). 113 type: Driver type string ("wlan", "bluetooth", etc).
114 persistent: Whether the soft blocked state is initialised from
115 non-volatile storage at startup.
546 state: Current state of the transmitter 116 state: Current state of the transmitter
547 0: RFKILL_STATE_SOFT_BLOCKED 117 0: RFKILL_STATE_SOFT_BLOCKED
548 transmitter is forced off, but one can override it 118 transmitter is turned off by software
549 by a write to the state attribute;
550 1: RFKILL_STATE_UNBLOCKED 119 1: RFKILL_STATE_UNBLOCKED
551 transmitter is NOT forced off, and may operate if 120 transmitter is (potentially) active
552 all other conditions for such operation are met
553 (such as interface is up and configured, etc);
554 2: RFKILL_STATE_HARD_BLOCKED 121 2: RFKILL_STATE_HARD_BLOCKED
555 transmitter is forced off by something outside of 122 transmitter is forced off by something outside of
556 the driver's control. One cannot set a device to 123 the driver's control.
557 this state through writes to the state attribute; 124 This file is deprecated because it can only properly show
558 claim: 1: Userspace handles events, 0: Kernel handles events 125 three of the four possible states, soft-and-hard-blocked is
559 126 missing.
560Both the "state" and "claim" entries are also writable. For the "state" entry 127 claim: 0: Kernel handles events
561this means that when 1 or 0 is written, the device rfkill state (if not yet in 128 This file is deprecated because there no longer is a way to
562the requested state), will be toggled accordingly. 129 claim just control over a single rfkill instance.
563 130
564For the "claim" entry writing 1 to it means that the kernel no longer handles 131rfkill devices also issue uevents (with an action of "change"), with the
565key events even though RFKILL_INPUT input was enabled. When "claim" has been 132following environment variables set:
566set to 0, userspace should make sure that it listens for the input events or 133
567check the sysfs "state" entry regularly to correctly perform the required tasks 134RFKILL_NAME
568when the rfkill key is pressed. 135RFKILL_STATE
569 136RFKILL_TYPE
570A note about input devices and EV_SW events: 137
571 138The contents of these variables correspond to the "name", "state" and
572In order to know the current state of an input device switch (like 139"type" sysfs files explained above.
573SW_RFKILL_ALL), you will need to use an IOCTL. That information is not
574available through sysfs in a generic way at this time, and it is not available
575through the rfkill class AT ALL.
diff --git a/Documentation/robust-futex-ABI.txt b/Documentation/robust-futex-ABI.txt
index 535f69fab45f..fd1cd8aae4eb 100644
--- a/Documentation/robust-futex-ABI.txt
+++ b/Documentation/robust-futex-ABI.txt
@@ -135,7 +135,7 @@ manipulating this list), the user code must observe the following
135protocol on 'lock entry' insertion and removal: 135protocol on 'lock entry' insertion and removal:
136 136
137On insertion: 137On insertion:
138 1) set the 'list_op_pending' word to the address of the 'lock word' 138 1) set the 'list_op_pending' word to the address of the 'lock entry'
139 to be inserted, 139 to be inserted,
140 2) acquire the futex lock, 140 2) acquire the futex lock,
141 3) add the lock entry, with its thread id (TID) in the bottom 29 bits 141 3) add the lock entry, with its thread id (TID) in the bottom 29 bits
@@ -143,7 +143,7 @@ On insertion:
143 4) clear the 'list_op_pending' word. 143 4) clear the 'list_op_pending' word.
144 144
145On removal: 145On removal:
146 1) set the 'list_op_pending' word to the address of the 'lock word' 146 1) set the 'list_op_pending' word to the address of the 'lock entry'
147 to be removed, 147 to be removed,
148 2) remove the lock entry for this lock from the 'head' list, 148 2) remove the lock entry for this lock from the 'head' list,
149 3) release the futex lock, and 149 3) release the futex lock, and
diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt
index 10711d9f0788..1eb576a023bd 100644
--- a/Documentation/s390/Debugging390.txt
+++ b/Documentation/s390/Debugging390.txt
@@ -1984,7 +1984,7 @@ break *$pc
1984 1984
1985break *0x400618 1985break *0x400618
1986 1986
1987heres a really useful one for large programs 1987Here's a really useful one for large programs
1988rbr 1988rbr
1989Set a breakpoint for all functions matching REGEXP 1989Set a breakpoint for all functions matching REGEXP
1990e.g. 1990e.g.
@@ -2211,7 +2211,7 @@ Breakpoint 2 at 0x4d87a4: file top.c, line 2609.
2211#5 0x51692c in readline_internal () at readline.c:521 2211#5 0x51692c in readline_internal () at readline.c:521
2212#6 0x5164fe in readline (prompt=0x7ffff810 "\177ÿøx\177ÿ÷Ø\177ÿøxÀ") 2212#6 0x5164fe in readline (prompt=0x7ffff810 "\177ÿøx\177ÿ÷Ø\177ÿøxÀ")
2213 at readline.c:349 2213 at readline.c:349
2214#7 0x4d7a8a in command_line_input (prrompt=0x564420 "(gdb) ", repeat=1, 2214#7 0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1,
2215 annotation_suffix=0x4d6b44 "prompt") at top.c:2091 2215 annotation_suffix=0x4d6b44 "prompt") at top.c:2091
2216#8 0x4d6cf0 in command_loop () at top.c:1345 2216#8 0x4d6cf0 in command_loop () at top.c:1345
2217#9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635 2217#9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635
diff --git a/Documentation/scheduler/sched-nice-design.txt b/Documentation/scheduler/sched-nice-design.txt
index e2bae5a577e3..3ac1e46d5365 100644
--- a/Documentation/scheduler/sched-nice-design.txt
+++ b/Documentation/scheduler/sched-nice-design.txt
@@ -55,7 +55,7 @@ To sum it up: we always wanted to make nice levels more consistent, but
55within the constraints of HZ and jiffies and their nasty design level 55within the constraints of HZ and jiffies and their nasty design level
56coupling to timeslices and granularity it was not really viable. 56coupling to timeslices and granularity it was not really viable.
57 57
58The second (less frequent but still periodically occuring) complaint 58The second (less frequent but still periodically occurring) complaint
59about Linux's nice level support was its asymmetry around the origo 59about Linux's nice level support was its asymmetry around the origo
60(which you can see demonstrated in the picture above), or more 60(which you can see demonstrated in the picture above), or more
61accurately: the fact that nice level behavior depended on the _absolute_ 61accurately: the fact that nice level behavior depended on the _absolute_
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 5ba4d3fc625a..1df7f9cdab05 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -4,6 +4,7 @@
4CONTENTS 4CONTENTS
5======== 5========
6 6
70. WARNING
71. Overview 81. Overview
8 1.1 The problem 9 1.1 The problem
9 1.2 The solution 10 1.2 The solution
@@ -14,6 +15,23 @@ CONTENTS
143. Future plans 153. Future plans
15 16
16 17
180. WARNING
19==========
20
21 Fiddling with these settings can result in an unstable system, the knobs are
22 root only and assume root knows what he is doing.
23
24Most notable:
25
26 * very small values in sched_rt_period_us can result in an unstable
27 system when the period is smaller than either the available hrtimer
28 resolution, or the time it takes to handle the budget refresh itself.
29
30 * very small values in sched_rt_runtime_us can result in an unstable
31 system when the runtime is so small the system has difficulty making
32 forward progress (NOTE: the migration thread and kstopmachine both
33 are real-time processes).
34
171. Overview 351. Overview
18=========== 36===========
19 37
@@ -169,7 +187,7 @@ get their allocated time.
169 187
170Implementing SCHED_EDF might take a while to complete. Priority Inheritance is 188Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
171the biggest challenge as the current linux PI infrastructure is geared towards 189the biggest challenge as the current linux PI infrastructure is geared towards
172the limited static priority levels 0-139. With deadline scheduling you need to 190the limited static priority levels 0-99. With deadline scheduling you need to
173do deadline inheritance (since priority is inversely proportional to the 191do deadline inheritance (since priority is inversely proportional to the
174deadline delta (deadline - now)). 192deadline delta (deadline - now)).
175 193
diff --git a/Documentation/scsi/aic79xx.txt b/Documentation/scsi/aic79xx.txt
index 683ccae00ad4..c014eccaf19f 100644
--- a/Documentation/scsi/aic79xx.txt
+++ b/Documentation/scsi/aic79xx.txt
@@ -194,7 +194,7 @@ The following information is available in this file:
194 - Packetized SCSI Protocol at 160MB/s and 320MB/s 194 - Packetized SCSI Protocol at 160MB/s and 320MB/s
195 - Quick Arbitration Selection (QAS) 195 - Quick Arbitration Selection (QAS)
196 - Retained Training Information (Rev B. ASIC only) 196 - Retained Training Information (Rev B. ASIC only)
197 - Interrupt Coalessing 197 - Interrupt Coalescing
198 - Initiator Mode (target mode not currently 198 - Initiator Mode (target mode not currently
199 supported) 199 supported)
200 - Support for the PCI-X standard up to 133MHz 200 - Support for the PCI-X standard up to 133MHz
diff --git a/Documentation/scsi/ncr53c8xx.txt b/Documentation/scsi/ncr53c8xx.txt
index 230e30846ef2..08e2b4d04aab 100644
--- a/Documentation/scsi/ncr53c8xx.txt
+++ b/Documentation/scsi/ncr53c8xx.txt
@@ -206,7 +206,7 @@ of MOVE MEMORY instructions.
206The 896 and the 895A allows handling of the phase mismatch context from 206The 896 and the 895A allows handling of the phase mismatch context from
207SCRIPTS (avoids the phase mismatch interrupt that stops the SCSI processor 207SCRIPTS (avoids the phase mismatch interrupt that stops the SCSI processor
208until the C code has saved the context of the transfer). 208until the C code has saved the context of the transfer).
209Implementing this without using LOAD/STORE instructions would be painfull 209Implementing this without using LOAD/STORE instructions would be painful
210and I didn't even want to try it. 210and I didn't even want to try it.
211 211
212The 896 chip supports 64 bit PCI transactions and addressing, while the 212The 896 chip supports 64 bit PCI transactions and addressing, while the
@@ -240,7 +240,7 @@ characteristics. This feature may also reduce average command latency.
240In order to really gain advantage of this feature, devices must have 240In order to really gain advantage of this feature, devices must have
241a reasonable cache size (No miracle is to be expected for a low-end 241a reasonable cache size (No miracle is to be expected for a low-end
242hard disk with 128 KB or less). 242hard disk with 128 KB or less).
243Some kown SCSI devices do not properly support tagged command queuing. 243Some known SCSI devices do not properly support tagged command queuing.
244Generally, firmware revisions that fix this kind of problems are available 244Generally, firmware revisions that fix this kind of problems are available
245at respective vendor web/ftp sites. 245at respective vendor web/ftp sites.
246All I can say is that the hard disks I use on my machines behave well with 246All I can say is that the hard disks I use on my machines behave well with
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt
index e5b071d46619..d7f181701dc2 100644
--- a/Documentation/scsi/scsi_fc_transport.txt
+++ b/Documentation/scsi/scsi_fc_transport.txt
@@ -1,10 +1,11 @@
1 SCSI FC Transport 1 SCSI FC Transport
2 ============================================= 2 =============================================
3 3
4Date: 4/12/2007 4Date: 11/18/2008
5Kernel Revisions for features: 5Kernel Revisions for features:
6 rports : <<TBS>> 6 rports : <<TBS>>
7 vports : 2.6.22 (? TBD) 7 vports : 2.6.22
8 bsg support : 2.6.30 (?TBD?)
8 9
9 10
10Introduction 11Introduction
@@ -15,6 +16,7 @@ The FC transport can be found at:
15 drivers/scsi/scsi_transport_fc.c 16 drivers/scsi/scsi_transport_fc.c
16 include/scsi/scsi_transport_fc.h 17 include/scsi/scsi_transport_fc.h
17 include/scsi/scsi_netlink_fc.h 18 include/scsi/scsi_netlink_fc.h
19 include/scsi/scsi_bsg_fc.h
18 20
19This file is found at Documentation/scsi/scsi_fc_transport.txt 21This file is found at Documentation/scsi/scsi_fc_transport.txt
20 22
@@ -472,6 +474,14 @@ int
472fc_vport_terminate(struct fc_vport *vport) 474fc_vport_terminate(struct fc_vport *vport)
473 475
474 476
477FC BSG support (CT & ELS passthru, and more)
478========================================================================
479<< To Be Supplied >>
480
481
482
483
484
475Credits 485Credits
476======= 486=======
477The following people have contributed to this document: 487The following people have contributed to this document:
diff --git a/Documentation/scsi/scsi_mid_low_api.txt b/Documentation/scsi/scsi_mid_low_api.txt
index a6d5354639b2..de67229251d8 100644
--- a/Documentation/scsi/scsi_mid_low_api.txt
+++ b/Documentation/scsi/scsi_mid_low_api.txt
@@ -1271,6 +1271,11 @@ of interest:
1271 hostdata[0] - area reserved for LLD at end of struct Scsi_Host. Size 1271 hostdata[0] - area reserved for LLD at end of struct Scsi_Host. Size
1272 is set by the second argument (named 'xtr_bytes') to 1272 is set by the second argument (named 'xtr_bytes') to
1273 scsi_host_alloc() or scsi_register(). 1273 scsi_host_alloc() or scsi_register().
1274 vendor_id - a unique value that identifies the vendor supplying
1275 the LLD for the Scsi_Host. Used most often in validating
1276 vendor-specific message requests. Value consists of an
1277 identifier type and a vendor-specific value.
1278 See scsi_netlink.h for a description of valid formats.
1274 1279
1275The scsi_host structure is defined in include/scsi/scsi_host.h 1280The scsi_host structure is defined in include/scsi/scsi_host.h
1276 1281
diff --git a/Documentation/scsi/sym53c8xx_2.txt b/Documentation/scsi/sym53c8xx_2.txt
index 49ea5c58c6bc..eb9a7b905b64 100644
--- a/Documentation/scsi/sym53c8xx_2.txt
+++ b/Documentation/scsi/sym53c8xx_2.txt
@@ -206,7 +206,7 @@ characteristics. This feature may also reduce average command latency.
206In order to really gain advantage of this feature, devices must have 206In order to really gain advantage of this feature, devices must have
207a reasonable cache size (No miracle is to be expected for a low-end 207a reasonable cache size (No miracle is to be expected for a low-end
208hard disk with 128 KB or less). 208hard disk with 128 KB or less).
209Some kown old SCSI devices do not properly support tagged command queuing. 209Some known old SCSI devices do not properly support tagged command queuing.
210Generally, firmware revisions that fix this kind of problems are available 210Generally, firmware revisions that fix this kind of problems are available
211at respective vendor web/ftp sites. 211at respective vendor web/ftp sites.
212All I can say is that I never have had problem with tagged queuing using 212All I can say is that I never have had problem with tagged queuing using
diff --git a/Documentation/sound/alsa/ALSA-Configuration.txt b/Documentation/sound/alsa/ALSA-Configuration.txt
index 012858d2b119..4252697a95d6 100644
--- a/Documentation/sound/alsa/ALSA-Configuration.txt
+++ b/Documentation/sound/alsa/ALSA-Configuration.txt
@@ -460,6 +460,25 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
460 460
461 The power-management is supported. 461 The power-management is supported.
462 462
463 Module snd-ctxfi
464 ----------------
465
466 Module for Creative Sound Blaster X-Fi boards (20k1 / 20k2 chips)
467 * Creative Sound Blaster X-Fi Titanium Fatal1ty Champion Series
468 * Creative Sound Blaster X-Fi Titanium Fatal1ty Professional Series
469 * Creative Sound Blaster X-Fi Titanium Professional Audio
470 * Creative Sound Blaster X-Fi Titanium
471 * Creative Sound Blaster X-Fi Elite Pro
472 * Creative Sound Blaster X-Fi Platinum
473 * Creative Sound Blaster X-Fi Fatal1ty
474 * Creative Sound Blaster X-Fi XtremeGamer
475 * Creative Sound Blaster X-Fi XtremeMusic
476
477 reference_rate - reference sample rate, 44100 or 48000 (default)
478 multiple - multiple to ref. sample rate, 1 or 2 (default)
479
480 This module supports multiple cards.
481
463 Module snd-darla20 482 Module snd-darla20
464 ------------------ 483 ------------------
465 484
@@ -754,7 +773,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
754 single_cmd - Use single immediate commands to communicate with 773 single_cmd - Use single immediate commands to communicate with
755 codecs (for debugging only) 774 codecs (for debugging only)
756 enable_msi - Enable Message Signaled Interrupt (MSI) (default = off) 775 enable_msi - Enable Message Signaled Interrupt (MSI) (default = off)
757 power_save - Automatic power-saving timtout (in seconds, 0 = 776 power_save - Automatic power-saving timeout (in seconds, 0 =
758 disable) 777 disable)
759 power_save_controller - Reset HD-audio controller in power-saving mode 778 power_save_controller - Reset HD-audio controller in power-saving mode
760 (default = on) 779 (default = on)
@@ -925,6 +944,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
925 * Onkyo SE-90PCI 944 * Onkyo SE-90PCI
926 * Onkyo SE-200PCI 945 * Onkyo SE-200PCI
927 * ESI Juli@ 946 * ESI Juli@
947 * ESI Maya44
928 * Hercules Fortissimo IV 948 * Hercules Fortissimo IV
929 * EGO-SYS WaveTerminal 192M 949 * EGO-SYS WaveTerminal 192M
930 950
@@ -933,7 +953,7 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
933 prodigy71xt, prodigy71hifi, prodigyhd2, prodigy192, 953 prodigy71xt, prodigy71hifi, prodigyhd2, prodigy192,
934 juli, aureon51, aureon71, universe, ap192, k8x800, 954 juli, aureon51, aureon71, universe, ap192, k8x800,
935 phase22, phase28, ms300, av710, se200pci, se90pci, 955 phase22, phase28, ms300, av710, se200pci, se90pci,
936 fortissimo4, sn25p, WT192M 956 fortissimo4, sn25p, WT192M, maya44
937 957
938 This module supports multiple cards and autoprobe. 958 This module supports multiple cards and autoprobe.
939 959
@@ -1093,6 +1113,13 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1093 This module supports multiple cards. 1113 This module supports multiple cards.
1094 The driver requires the firmware loader support on kernel. 1114 The driver requires the firmware loader support on kernel.
1095 1115
1116 Module snd-lx6464es
1117 -------------------
1118
1119 Module for Digigram LX6464ES boards
1120
1121 This module supports multiple cards.
1122
1096 Module snd-maestro3 1123 Module snd-maestro3
1097 ------------------- 1124 -------------------
1098 1125
@@ -1543,13 +1570,15 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1543 Module snd-sc6000 1570 Module snd-sc6000
1544 ----------------- 1571 -----------------
1545 1572
1546 Module for Gallant SC-6000 soundcard. 1573 Module for Gallant SC-6000 soundcard and later models: SC-6600
1574 and SC-7000.
1547 1575
1548 port - Port # (0x220 or 0x240) 1576 port - Port # (0x220 or 0x240)
1549 mss_port - MSS Port # (0x530 or 0xe80) 1577 mss_port - MSS Port # (0x530 or 0xe80)
1550 irq - IRQ # (5,7,9,10,11) 1578 irq - IRQ # (5,7,9,10,11)
1551 mpu_irq - MPU-401 IRQ # (5,7,9,10) ,0 - no MPU-401 irq 1579 mpu_irq - MPU-401 IRQ # (5,7,9,10) ,0 - no MPU-401 irq
1552 dma - DMA # (1,3,0) 1580 dma - DMA # (1,3,0)
1581 joystick - Enable gameport - 0 = disable (default), 1 = enable
1553 1582
1554 This module supports multiple cards. 1583 This module supports multiple cards.
1555 1584
@@ -1859,7 +1888,8 @@ Prior to version 0.9.0rc4 options had a 'snd_' prefix. This was removed.
1859 ------------------- 1888 -------------------
1860 1889
1861 Module for sound cards based on the Asus AV100/AV200 chips, 1890 Module for sound cards based on the Asus AV100/AV200 chips,
1862 i.e., Xonar D1, DX, D2, D2X, HDAV1.3 (Deluxe), and Essence STX. 1891 i.e., Xonar D1, DX, D2, D2X, HDAV1.3 (Deluxe), Essence ST
1892 (Deluxe) and Essence STX.
1863 1893
1864 This module supports autoprobe and multiple cards. 1894 This module supports autoprobe and multiple cards.
1865 1895
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt
index 322869fc8a9e..0d8d23581c44 100644
--- a/Documentation/sound/alsa/HD-Audio-Models.txt
+++ b/Documentation/sound/alsa/HD-Audio-Models.txt
@@ -36,6 +36,7 @@ ALC260
36 acer Acer TravelMate 36 acer Acer TravelMate
37 will Will laptops (PB V7900) 37 will Will laptops (PB V7900)
38 replacer Replacer 672V 38 replacer Replacer 672V
39 favorit100 Maxdata Favorit 100XS
39 basic fixed pin assignment (old default model) 40 basic fixed pin assignment (old default model)
40 test for testing/debugging purpose, almost all controls can be 41 test for testing/debugging purpose, almost all controls can be
41 adjusted. Appearing only when compiled with 42 adjusted. Appearing only when compiled with
@@ -85,10 +86,11 @@ ALC269
85 eeepc-p703 ASUS Eeepc P703 P900A 86 eeepc-p703 ASUS Eeepc P703 P900A
86 eeepc-p901 ASUS Eeepc P901 S101 87 eeepc-p901 ASUS Eeepc P901 S101
87 fujitsu FSC Amilo 88 fujitsu FSC Amilo
89 lifebook Fujitsu Lifebook S6420
88 auto auto-config reading BIOS (default) 90 auto auto-config reading BIOS (default)
89 91
90ALC662/663 92ALC662/663/272
91========== 93==============
92 3stack-dig 3-stack (2-channel) with SPDIF 94 3stack-dig 3-stack (2-channel) with SPDIF
93 3stack-6ch 3-stack (6-channel) 95 3stack-6ch 3-stack (6-channel)
94 3stack-6ch-dig 3-stack (6-channel) with SPDIF 96 3stack-6ch-dig 3-stack (6-channel) with SPDIF
@@ -107,6 +109,9 @@ ALC662/663
107 asus-mode4 ASUS 109 asus-mode4 ASUS
108 asus-mode5 ASUS 110 asus-mode5 ASUS
109 asus-mode6 ASUS 111 asus-mode6 ASUS
112 dell Dell with ALC272
113 dell-zm1 Dell ZM1 with ALC272
114 samsung-nc10 Samsung NC10 mini notebook
110 auto auto-config reading BIOS (default) 115 auto auto-config reading BIOS (default)
111 116
112ALC882/885 117ALC882/885
@@ -118,6 +123,7 @@ ALC882/885
118 asus-a7j ASUS A7J 123 asus-a7j ASUS A7J
119 asus-a7m ASUS A7M 124 asus-a7m ASUS A7M
120 macpro MacPro support 125 macpro MacPro support
126 mb5 Macbook 5,1
121 mbp3 Macbook Pro rev3 127 mbp3 Macbook Pro rev3
122 imac24 iMac 24'' with jack detection 128 imac24 iMac 24'' with jack detection
123 w2jc ASUS W2JC 129 w2jc ASUS W2JC
@@ -133,10 +139,13 @@ ALC883/888
133 acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc) 139 acer Acer laptops (Travelmate 3012WTMi, Aspire 5600, etc)
134 acer-aspire Acer Aspire 9810 140 acer-aspire Acer Aspire 9810
135 acer-aspire-4930g Acer Aspire 4930G 141 acer-aspire-4930g Acer Aspire 4930G
142 acer-aspire-6530g Acer Aspire 6530G
143 acer-aspire-8930g Acer Aspire 8930G
136 medion Medion Laptops 144 medion Medion Laptops
137 medion-md2 Medion MD2 145 medion-md2 Medion MD2
138 targa-dig Targa/MSI 146 targa-dig Targa/MSI
139 targa-2ch-dig Targs/MSI with 2-channel 147 targa-2ch-dig Targa/MSI with 2-channel
148 targa-8ch-dig Targa/MSI with 8-channel (MSI GX620)
140 laptop-eapd 3-jack with SPDIF I/O and EAPD (Clevo M540JE, M550JE) 149 laptop-eapd 3-jack with SPDIF I/O and EAPD (Clevo M540JE, M550JE)
141 lenovo-101e Lenovo 101E 150 lenovo-101e Lenovo 101E
142 lenovo-nb0763 Lenovo NB0763 151 lenovo-nb0763 Lenovo NB0763
@@ -150,6 +159,9 @@ ALC883/888
150 fujitsu-pi2515 Fujitsu AMILO Pi2515 159 fujitsu-pi2515 Fujitsu AMILO Pi2515
151 fujitsu-xa3530 Fujitsu AMILO XA3530 160 fujitsu-xa3530 Fujitsu AMILO XA3530
152 3stack-6ch-intel Intel DG33* boards 161 3stack-6ch-intel Intel DG33* boards
162 asus-p5q ASUS P5Q-EM boards
163 mb31 MacBook 3,1
164 sony-vaio-tt Sony VAIO TT
153 auto auto-config reading BIOS (default) 165 auto auto-config reading BIOS (default)
154 166
155ALC861/660 167ALC861/660
@@ -348,6 +360,7 @@ STAC92HD71B*
348 hp-m4 HP mini 1000 360 hp-m4 HP mini 1000
349 hp-dv5 HP dv series 361 hp-dv5 HP dv series
350 hp-hdx HP HDX series 362 hp-hdx HP HDX series
363 hp-dv4-1222nr HP dv4-1222nr (with LED support)
351 auto BIOS setup (default) 364 auto BIOS setup (default)
352 365
353STAC92HD73* 366STAC92HD73*
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt
index 88b7433d2f11..71ac995b1915 100644
--- a/Documentation/sound/alsa/HD-Audio.txt
+++ b/Documentation/sound/alsa/HD-Audio.txt
@@ -16,7 +16,7 @@ methods for the HD-audio hardware.
16The HD-audio component consists of two parts: the controller chip and 16The HD-audio component consists of two parts: the controller chip and
17the codec chips on the HD-audio bus. Linux provides a single driver 17the codec chips on the HD-audio bus. Linux provides a single driver
18for all controllers, snd-hda-intel. Although the driver name contains 18for all controllers, snd-hda-intel. Although the driver name contains
19a word of a well-known harware vendor, it's not specific to it but for 19a word of a well-known hardware vendor, it's not specific to it but for
20all controller chips by other companies. Since the HD-audio 20all controller chips by other companies. Since the HD-audio
21controllers are supposed to be compatible, the single snd-hda-driver 21controllers are supposed to be compatible, the single snd-hda-driver
22should work in most cases. But, not surprisingly, there are known 22should work in most cases. But, not surprisingly, there are known
diff --git a/Documentation/sound/alsa/Procfile.txt b/Documentation/sound/alsa/Procfile.txt
index cfac20cf9e33..381908d8ca42 100644
--- a/Documentation/sound/alsa/Procfile.txt
+++ b/Documentation/sound/alsa/Procfile.txt
@@ -88,26 +88,34 @@ card*/pcm*/info
88 substreams, etc. 88 substreams, etc.
89 89
90card*/pcm*/xrun_debug 90card*/pcm*/xrun_debug
91 This file appears when CONFIG_SND_DEBUG=y. 91 This file appears when CONFIG_SND_DEBUG=y and
92 This shows the status of xrun (= buffer overrun/underrun) debug of 92 CONFIG_PCM_XRUN_DEBUG=y.
93 ALSA PCM middle layer, as an integer from 0 to 2. The value 93 This shows the status of xrun (= buffer overrun/underrun) and
94 can be changed by writing to this file, such as 94 invalid PCM position debug/check of ALSA PCM middle layer.
95 95 It takes an integer value, can be changed by writing to this
96 # cat 2 > /proc/asound/card0/pcm0p/xrun_debug 96 file, such as
97 97
98 When this value is greater than 0, the driver will show the 98 # echo 5 > /proc/asound/card0/pcm0p/xrun_debug
99 messages to kernel log when an xrun is detected. The debug 99
100 message is shown also when the invalid H/W pointer is detected 100 The value consists of the following bit flags:
101 at the update of periods (usually called from the interrupt 101 bit 0 = Enable XRUN/jiffies debug messages
102 bit 1 = Show stack trace at XRUN / jiffies check
103 bit 2 = Enable additional jiffies check
104
105 When the bit 0 is set, the driver will show the messages to
106 kernel log when an xrun is detected. The debug message is
107 shown also when the invalid H/W pointer is detected at the
108 update of periods (usually called from the interrupt
102 handler). 109 handler).
103 110
104 When this value is greater than 1, the driver will show the 111 When the bit 1 is set, the driver will show the stack trace
105 stack trace additionally. This may help the debugging. 112 additionally. This may help the debugging.
106 113
107 Since 2.6.30, this option also enables the hwptr check using 114 Since 2.6.30, this option can enable the hwptr check using
108 jiffies. This detects spontaneous invalid pointer callback 115 jiffies. This detects spontaneous invalid pointer callback
109 values, but can lead to too many corrections for a (mostly 116 values, but can lead to too many corrections for a (mostly
110 buggy) hardware that doesn't give smooth pointer updates. 117 buggy) hardware that doesn't give smooth pointer updates.
118 This feature is enabled via the bit 2.
111 119
112card*/pcm*/sub*/info 120card*/pcm*/sub*/info
113 The general information of this PCM sub-stream. 121 The general information of this PCM sub-stream.
diff --git a/Documentation/sound/alsa/README.maya44 b/Documentation/sound/alsa/README.maya44
new file mode 100644
index 000000000000..0e41576fa13e
--- /dev/null
+++ b/Documentation/sound/alsa/README.maya44
@@ -0,0 +1,163 @@
1NOTE: The following is the original document of Rainer's patch that the
2current maya44 code is based on. Some contents might be obsolete, but I
3keep it here as reference -- tiwai
4
5----------------------------------------------------------------
6
7STATE OF DEVELOPMENT:
8
9This driver is being developed on the initiative of Piotr Makowski (oponek@gmail.com) and financed by Lars Bergmann.
10Development is carried out by Rainer Zimmermann (mail@lightshed.de).
11
12ESI provided a sample Maya44 card for the development work.
13
14However, unfortunately it has turned out difficult to get detailed programming information, so I (Rainer Zimmermann) had to find out some card-specific information by experiment and conjecture. Some information (in particular, several GPIO bits) is still missing.
15
16This is the first testing version of the Maya44 driver released to the alsa-devel mailing list (Feb 5, 2008).
17
18
19The following functions work, as tested by Rainer Zimmermann and Piotr Makowski:
20
21- playback and capture at all sampling rates
22- input/output level
23- crossmixing
24- line/mic switch
25- phantom power switch
26- analogue monitor a.k.a bypass
27
28
29The following functions *should* work, but are not fully tested:
30
31- Channel 3+4 analogue - S/PDIF input switching
32- S/PDIF output
33- all inputs/outputs on the M/IO/DIO extension card
34- internal/external clock selection
35
36
37*In particular, we would appreciate testing of these functions by anyone who has access to an M/IO/DIO extension card.*
38
39
40Things that do not seem to work:
41
42- The level meters ("multi track") in 'alsamixer' do not seem to react to signals in (if this is a bug, it would probably be in the existing ICE1724 code).
43
44- Ardour 2.1 seems to work only via JACK, not using ALSA directly or via OSS. This still needs to be tracked down.
45
46
47DRIVER DETAILS:
48
49the following files were added:
50
51pci/ice1724/maya44.c - Maya44 specific code
52pci/ice1724/maya44.h
53pci/ice1724/ice1724.patch
54pci/ice1724/ice1724.h.patch - PROPOSED patch to ice1724.h (see SAMPLING RATES)
55i2c/other/wm8776.c - low-level access routines for Wolfson WM8776 codecs
56include/wm8776.h
57
58
59Note that the wm8776.c code is meant to be card-independent and does not actually register the codec with the ALSA infrastructure.
60This is done in maya44.c, mainly because some of the WM8776 controls are used in Maya44-specific ways, and should be named appropriately.
61
62
63the following files were created in pci/ice1724, simply #including the corresponding file from the alsa-kernel tree:
64
65wtm.h
66vt1720_mobo.h
67revo.h
68prodigy192.h
69pontis.h
70phase.h
71maya44.h
72juli.h
73aureon.h
74amp.h
75envy24ht.h
76se.h
77prodigy_hifi.h
78
79
80*I hope this is the correct way to do things.*
81
82
83SAMPLING RATES:
84
85The Maya44 card (or more exactly, the Wolfson WM8776 codecs) allow a maximum sampling rate of 192 kHz for playback and 96 kHz for capture.
86
87As the ICE1724 chip only allows one global sampling rate, this is handled as follows:
88
89* setting the sampling rate on any open PCM device on the maya44 card will always set the *global* sampling rate for all playback and capture channels.
90
91* In the current state of the driver, setting rates of up to 192 kHz is permitted even for capture devices.
92
93*AVOID CAPTURING AT RATES ABOVE 96kHz*, even though it may appear to work. The codec cannot actually capture at such rates, meaning poor quality.
94
95
96I propose some additional code for limiting the sampling rate when setting on a capture pcm device. However because of the global sampling rate, this logic would be somewhat problematic.
97
98The proposed code (currently deactivated) is in ice1712.h.patch, ice1724.c and maya44.c (in pci/ice1712).
99
100
101SOUND DEVICES:
102
103PCM devices correspond to inputs/outputs as follows (assuming Maya44 is card #0):
104
105hw:0,0 input - stereo, analog input 1+2
106hw:0,0 output - stereo, analog output 1+2
107hw:0,1 input - stereo, analog input 3+4 OR S/PDIF input
108hw:0,1 output - stereo, analog output 3+4 (and SPDIF out)
109
110
111NAMING OF MIXER CONTROLS:
112
113(for more information about the signal flow, please refer to the block diagram on p.24 of the ESI Maya44 manual, or in the ESI windows software).
114
115
116PCM: (digital) output level for channel 1+2
117PCM 1: same for channel 3+4
118
119Mic Phantom+48V: switch for +48V phantom power for electrostatic microphones on input 1/2.
120 Make sure this is not turned on while any other source is connected to input 1/2.
121 It might damage the source and/or the maya44 card.
122
123Mic/Line input: if switch is on, input jack 1/2 is microphone input (mono), otherwise line input (stereo).
124
125Bypass: analogue bypass from ADC input to output for channel 1+2. Same as "Monitor" in the windows driver.
126Bypass 1: same for channel 3+4.
127
128Crossmix: cross-mixer from channels 1+2 to channels 3+4
129Crossmix 1: cross-mixer from channels 3+4 to channels 1+2
130
131IEC958 Output: switch for S/PDIF output.
132 This is not supported by the ESI windows driver.
133 S/PDIF should output the same signal as channel 3+4. [untested!]
134
135
136Digital output selectors:
137
138 These switches allow a direct digital routing from the ADCs to the DACs.
139 Each switch determines where the digital input data to one of the DACs comes from.
140 They are not supported by the ESI windows driver.
141 For normal operation, they should all be set to "PCM out".
142
143H/W: Output source channel 1
144H/W 1: Output source channel 2
145H/W 2: Output source channel 3
146H/W 3: Output source channel 4
147
148H/W 4 ... H/W 9: unknown function, left in to enable testing.
149 Possibly some of these control S/PDIF output(s).
150 If these turn out to be unused, they will go away in later driver versions.
151
152Selectable values for each of the digital output selectors are:
153 "PCM out" -> DAC output of the corresponding channel (default setting)
154 "Input 1"...
155 "Input 4" -> direct routing from ADC output of the selected input channel
156
157
158--------
159
160Feb 14, 2008
161Rainer Zimmermann
162mail@lightshed.de
163
diff --git a/Documentation/sound/alsa/hda_codec.txt b/Documentation/sound/alsa/hda_codec.txt
index 34e87ec1379c..de8efbc7e4bd 100644
--- a/Documentation/sound/alsa/hda_codec.txt
+++ b/Documentation/sound/alsa/hda_codec.txt
@@ -114,7 +114,7 @@ For writing a sequence of verbs, use snd_hda_sequence_write().
114 114
115There are variants of cached read/write, snd_hda_codec_write_cache(), 115There are variants of cached read/write, snd_hda_codec_write_cache(),
116snd_hda_sequence_write_cache(). These are used for recording the 116snd_hda_sequence_write_cache(). These are used for recording the
117register states for the power-mangement resume. When no PM is needed, 117register states for the power-management resume. When no PM is needed,
118these are equivalent with non-cached version. 118these are equivalent with non-cached version.
119 119
120To retrieve the number of sub nodes connected to the given node, use 120To retrieve the number of sub nodes connected to the given node, use
diff --git a/Documentation/sound/alsa/soc/dapm.txt b/Documentation/sound/alsa/soc/dapm.txt
index 9e6763264a2e..9ac842be9b4f 100644
--- a/Documentation/sound/alsa/soc/dapm.txt
+++ b/Documentation/sound/alsa/soc/dapm.txt
@@ -62,6 +62,7 @@ Audio DAPM widgets fall into a number of types:-
62 o Mic - Mic (and optional Jack) 62 o Mic - Mic (and optional Jack)
63 o Line - Line Input/Output (and optional Jack) 63 o Line - Line Input/Output (and optional Jack)
64 o Speaker - Speaker 64 o Speaker - Speaker
65 o Supply - Power or clock supply widget used by other widgets.
65 o Pre - Special PRE widget (exec before all others) 66 o Pre - Special PRE widget (exec before all others)
66 o Post - Special POST widget (exec after all others) 67 o Post - Special POST widget (exec after all others)
67 68
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index f11ca7979fa6..322a00bb99d9 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -32,6 +32,7 @@ show up in /proc/sys/kernel:
32- kstack_depth_to_print [ X86 only ] 32- kstack_depth_to_print [ X86 only ]
33- l2cr [ PPC only ] 33- l2cr [ PPC only ]
34- modprobe ==> Documentation/debugging-modules.txt 34- modprobe ==> Documentation/debugging-modules.txt
35- modules_disabled
35- msgmax 36- msgmax
36- msgmnb 37- msgmnb
37- msgmni 38- msgmni
@@ -184,6 +185,16 @@ kernel stack.
184 185
185============================================================== 186==============================================================
186 187
188modules_disabled:
189
190A toggle value indicating if modules are allowed to be loaded
191in an otherwise modular kernel. This toggle defaults to off
192(0), but can be set true (1). Once true, modules can be
193neither loaded nor unloaded, and the toggle cannot be set back
194to false.
195
196==============================================================
197
187osrelease, ostype & version: 198osrelease, ostype & version:
188 199
189# cat osrelease 200# cat osrelease
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index c302ddf629a0..c4de6359d440 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -233,8 +233,8 @@ These protections are added to score to judge whether this zone should be used
233for page allocation or should be reclaimed. 233for page allocation or should be reclaimed.
234 234
235In this example, if normal pages (index=2) are required to this DMA zone and 235In this example, if normal pages (index=2) are required to this DMA zone and
236pages_high is used for watermark, the kernel judges this zone should not be 236watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should
237used because pages_free(1355) is smaller than watermark + protection[2] 237not be used because pages_free(1355) is smaller than watermark + protection[2]
238(4 + 2004 = 2008). If this protection value is 0, this zone would be used for 238(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
239normal page requirement. If requirement is DMA zone(index=0), protection[0] 239normal page requirement. If requirement is DMA zone(index=0), protection[0]
240(=0) is used. 240(=0) is used.
@@ -280,9 +280,10 @@ The default value is 65536.
280min_free_kbytes: 280min_free_kbytes:
281 281
282This is used to force the Linux VM to keep a minimum number 282This is used to force the Linux VM to keep a minimum number
283of kilobytes free. The VM uses this number to compute a pages_min 283of kilobytes free. The VM uses this number to compute a
284value for each lowmem zone in the system. Each lowmem zone gets 284watermark[WMARK_MIN] value for each lowmem zone in the system.
285a number of reserved free pages based proportionally on its size. 285Each lowmem zone gets a number of reserved free pages based
286proportionally on its size.
286 287
287Some minimal amount of memory is needed to satisfy PF_MEMALLOC 288Some minimal amount of memory is needed to satisfy PF_MEMALLOC
288allocations; if you set this to lower than 1024KB, your system will 289allocations; if you set this to lower than 1024KB, your system will
@@ -314,10 +315,14 @@ min_unmapped_ratio:
314 315
315This is available only on NUMA kernels. 316This is available only on NUMA kernels.
316 317
317A percentage of the total pages in each zone. Zone reclaim will only 318This is a percentage of the total pages in each zone. Zone reclaim will
318occur if more than this percentage of pages are file backed and unmapped. 319only occur if more than this percentage of pages are in a state that
319This is to insure that a minimal amount of local pages is still available for 320zone_reclaim_mode allows to be reclaimed.
320file I/O even if the node is overallocated. 321
322If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
323against all file-backed unmapped pages including swapcache pages and tmpfs
324files. Otherwise, only unmapped pages backed by normal files but not tmpfs
325files and similar are considered.
321 326
322The default is 1 percent. 327The default is 1 percent.
323 328
@@ -358,7 +363,7 @@ nr_pdflush_threads
358The current number of pdflush threads. This value is read-only. 363The current number of pdflush threads. This value is read-only.
359The value changes according to the number of dirty pages in the system. 364The value changes according to the number of dirty pages in the system.
360 365
361When neccessary, additional pdflush threads are created, one per second, up to 366When necessary, additional pdflush threads are created, one per second, up to
362nr_pdflush_threads_max. 367nr_pdflush_threads_max.
363 368
364============================================================== 369==============================================================
@@ -565,7 +570,7 @@ swappiness
565 570
566This control is used to define how aggressive the kernel will swap 571This control is used to define how aggressive the kernel will swap
567memory pages. Higher values will increase aggressiveness, lower values 572memory pages. Higher values will increase aggressiveness, lower values
568descrease the amount of swap. 573decrease the amount of swap.
569 574
570The default value is 60. 575The default value is 60.
571 576
diff --git a/Documentation/timers/hpet.txt b/Documentation/timers/hpet.txt
index e7c09abcfab4..04763a325520 100644
--- a/Documentation/timers/hpet.txt
+++ b/Documentation/timers/hpet.txt
@@ -7,7 +7,7 @@ by Intel and Microsoft which can be found at
7 7
8Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision") 8Each HPET has one fixed-rate counter (at 10+ MHz, hence "High Precision")
9and up to 32 comparators. Normally three or more comparators are provided, 9and up to 32 comparators. Normally three or more comparators are provided,
10each of which can generate oneshot interupts and at least one of which has 10each of which can generate oneshot interrupts and at least one of which has
11additional hardware to support periodic interrupts. The comparators are 11additional hardware to support periodic interrupts. The comparators are
12also called "timers", which can be misleading since usually timers are 12also called "timers", which can be misleading since usually timers are
13independent of each other ... these share a counter, complicating resets. 13independent of each other ... these share a counter, complicating resets.
diff --git a/Documentation/timers/timer_stats.txt b/Documentation/timers/timer_stats.txt
index 20d368c59814..9bd00fc2e823 100644
--- a/Documentation/timers/timer_stats.txt
+++ b/Documentation/timers/timer_stats.txt
@@ -62,7 +62,7 @@ Timerstats sample period: 3.888770 s
62 62
63The first column is the number of events, the second column the pid, the third 63The first column is the number of events, the second column the pid, the third
64column is the name of the process. The fourth column shows the function which 64column is the name of the process. The fourth column shows the function which
65initialized the timer and in parantheses the callback function which was 65initialized the timer and in parentheses the callback function which was
66executed on expiry. 66executed on expiry.
67 67
68 Thomas, Ingo 68 Thomas, Ingo
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
new file mode 100644
index 000000000000..f157d7594ea7
--- /dev/null
+++ b/Documentation/trace/events.txt
@@ -0,0 +1,90 @@
1 Event Tracing
2
3 Documentation written by Theodore Ts'o
4 Updated by Li Zefan
5
61. Introduction
7===============
8
9Tracepoints (see Documentation/trace/tracepoints.txt) can be used
10without creating custom kernel modules to register probe functions
11using the event tracing infrastructure.
12
13Not all tracepoints can be traced using the event tracing system;
14the kernel developer must provide code snippets which define how the
15tracing information is saved into the tracing buffer, and how the
16tracing information should be printed.
17
182. Using Event Tracing
19======================
20
212.1 Via the 'set_event' interface
22---------------------------------
23
24The events which are available for tracing can be found in the file
25/debug/tracing/available_events.
26
27To enable a particular event, such as 'sched_wakeup', simply echo it
28to /debug/tracing/set_event. For example:
29
30 # echo sched_wakeup >> /debug/tracing/set_event
31
32[ Note: '>>' is necessary, otherwise it will firstly disable
33 all the events. ]
34
35To disable an event, echo the event name to the set_event file prefixed
36with an exclamation point:
37
38 # echo '!sched_wakeup' >> /debug/tracing/set_event
39
40To disable all events, echo an empty line to the set_event file:
41
42 # echo > /debug/tracing/set_event
43
44To enable all events, echo '*:*' or '*:' to the set_event file:
45
46 # echo '*:*' > /debug/tracing/set_event
47
48The events are organized into subsystems, such as ext4, irq, sched,
49etc., and a full event name looks like this: <subsystem>:<event>. The
50subsystem name is optional, but it is displayed in the available_events
51file. All of the events in a subsystem can be specified via the syntax
52"<subsystem>:*"; for example, to enable all irq events, you can use the
53command:
54
55 # echo 'irq:*' > /debug/tracing/set_event
56
572.2 Via the 'enable' toggle
58---------------------------
59
60The events available are also listed in /debug/tracing/events/ hierarchy
61of directories.
62
63To enable event 'sched_wakeup':
64
65 # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable
66
67To disable it:
68
69 # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable
70
71To enable all events in sched subsystem:
72
73 # echo 1 > /debug/tracing/events/sched/enable
74
75To enable all events:
76
77 # echo 1 > /debug/tracing/events/enable
78
79When reading one of these enable files, there are four results:
80
81 0 - all events this file affects are disabled
82 1 - all events this file affects are enabled
83 X - there is a mixture of events enabled and disabled
84 ? - this file does not affect any event
85
863. Defining an event-enabled tracepoint
87=======================================
88
89See the example provided in samples/trace_events
90
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e693813..a39b3c749de5 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -7,7 +7,6 @@ Copyright 2008 Red Hat Inc.
7 (dual licensed under the GPL v2) 7 (dual licensed under the GPL v2)
8Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton, 8Reviewers: Elias Oltmanns, Randy Dunlap, Andrew Morton,
9 John Kacur, and David Teigland. 9 John Kacur, and David Teigland.
10
11Written for: 2.6.28-rc2 10Written for: 2.6.28-rc2
12 11
13Introduction 12Introduction
@@ -33,13 +32,26 @@ The File System
33Ftrace uses the debugfs file system to hold the control files as 32Ftrace uses the debugfs file system to hold the control files as
34well as the files to display output. 33well as the files to display output.
35 34
36To mount the debugfs system: 35When debugfs is configured into the kernel (which selecting any ftrace
36option will do) the directory /sys/kernel/debug will be created. To mount
37this directory, you can add to your /etc/fstab file:
38
39 debugfs /sys/kernel/debug debugfs defaults 0 0
40
41Or you can mount it at run time with:
42
43 mount -t debugfs nodev /sys/kernel/debug
37 44
38 # mkdir /debug 45For quicker access to that directory you may want to make a soft link to
39 # mount -t debugfs nodev /debug 46it:
40 47
41( Note: it is more common to mount at /sys/kernel/debug, but for 48 ln -s /sys/kernel/debug /debug
42 simplicity this document will use /debug) 49
50Any selected ftrace option will also create a directory called tracing
51within the debugfs. The rest of the document will assume that you are in
52the ftrace directory (cd /sys/kernel/debug/tracing) and will only concentrate
53on the files within that directory and not distract from the content with
54the extended "/sys/kernel/debug/tracing" path name.
43 55
44That's it! (assuming that you have ftrace configured into your kernel) 56That's it! (assuming that you have ftrace configured into your kernel)
45 57
@@ -179,7 +191,7 @@ Here is the list of current tracers that may be configured.
179 191
180 Function call tracer to trace all kernel functions. 192 Function call tracer to trace all kernel functions.
181 193
182 "function_graph_tracer" 194 "function_graph"
183 195
184 Similar to the function tracer except that the 196 Similar to the function tracer except that the
185 function tracer probes the functions on their entry 197 function tracer probes the functions on their entry
@@ -389,18 +401,18 @@ trace_options
389The trace_options file is used to control what gets printed in 401The trace_options file is used to control what gets printed in
390the trace output. To see what is available, simply cat the file: 402the trace output. To see what is available, simply cat the file:
391 403
392 cat /debug/tracing/trace_options 404 cat trace_options
393 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ 405 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
394 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj 406 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
395 407
396To disable one of the options, echo in the option prepended with 408To disable one of the options, echo in the option prepended with
397"no". 409"no".
398 410
399 echo noprint-parent > /debug/tracing/trace_options 411 echo noprint-parent > trace_options
400 412
401To enable an option, leave off the "no". 413To enable an option, leave off the "no".
402 414
403 echo sym-offset > /debug/tracing/trace_options 415 echo sym-offset > trace_options
404 416
405Here are the available options: 417Here are the available options:
406 418
@@ -476,11 +488,11 @@ sched_switch
476This tracer simply records schedule switches. Here is an example 488This tracer simply records schedule switches. Here is an example
477of how to use it. 489of how to use it.
478 490
479 # echo sched_switch > /debug/tracing/current_tracer 491 # echo sched_switch > current_tracer
480 # echo 1 > /debug/tracing/tracing_enabled 492 # echo 1 > tracing_enabled
481 # sleep 1 493 # sleep 1
482 # echo 0 > /debug/tracing/tracing_enabled 494 # echo 0 > tracing_enabled
483 # cat /debug/tracing/trace 495 # cat trace
484 496
485# tracer: sched_switch 497# tracer: sched_switch
486# 498#
@@ -518,9 +530,18 @@ priority with zero (0) being the highest priority and the nice
518values starting at 100 (nice -20). Below is a quick chart to map 530values starting at 100 (nice -20). Below is a quick chart to map
519the kernel priority to user land priorities. 531the kernel priority to user land priorities.
520 532
521 Kernel priority: 0 to 99 ==> user RT priority 99 to 0 533 Kernel Space User Space
522 Kernel priority: 100 to 139 ==> user nice -20 to 19 534 ===============================================================
523 Kernel priority: 140 ==> idle task priority 535 0(high) to 98(low) user RT priority 99(high) to 1(low)
536 with SCHED_RR or SCHED_FIFO
537 ---------------------------------------------------------------
538 99 sched_priority is not used in scheduling
539 decisions(it must be specified as 0)
540 ---------------------------------------------------------------
541 100(high) to 139(low) user nice -20(high) to 19(low)
542 ---------------------------------------------------------------
543 140 idle task priority
544 ---------------------------------------------------------------
524 545
525The task states are: 546The task states are:
526 547
@@ -574,13 +595,13 @@ new trace is saved.
574To reset the maximum, echo 0 into tracing_max_latency. Here is 595To reset the maximum, echo 0 into tracing_max_latency. Here is
575an example: 596an example:
576 597
577 # echo irqsoff > /debug/tracing/current_tracer 598 # echo irqsoff > current_tracer
578 # echo 0 > /debug/tracing/tracing_max_latency 599 # echo 0 > tracing_max_latency
579 # echo 1 > /debug/tracing/tracing_enabled 600 # echo 1 > tracing_enabled
580 # ls -ltr 601 # ls -ltr
581 [...] 602 [...]
582 # echo 0 > /debug/tracing/tracing_enabled 603 # echo 0 > tracing_enabled
583 # cat /debug/tracing/latency_trace 604 # cat latency_trace
584# tracer: irqsoff 605# tracer: irqsoff
585# 606#
586irqsoff latency trace v1.1.5 on 2.6.26 607irqsoff latency trace v1.1.5 on 2.6.26
@@ -681,13 +702,13 @@ Like the irqsoff tracer, it records the maximum latency for
681which preemption was disabled. The control of preemptoff tracer 702which preemption was disabled. The control of preemptoff tracer
682is much like the irqsoff tracer. 703is much like the irqsoff tracer.
683 704
684 # echo preemptoff > /debug/tracing/current_tracer 705 # echo preemptoff > current_tracer
685 # echo 0 > /debug/tracing/tracing_max_latency 706 # echo 0 > tracing_max_latency
686 # echo 1 > /debug/tracing/tracing_enabled 707 # echo 1 > tracing_enabled
687 # ls -ltr 708 # ls -ltr
688 [...] 709 [...]
689 # echo 0 > /debug/tracing/tracing_enabled 710 # echo 0 > tracing_enabled
690 # cat /debug/tracing/latency_trace 711 # cat latency_trace
691# tracer: preemptoff 712# tracer: preemptoff
692# 713#
693preemptoff latency trace v1.1.5 on 2.6.26-rc8 714preemptoff latency trace v1.1.5 on 2.6.26-rc8
@@ -828,13 +849,13 @@ tracer.
828Again, using this trace is much like the irqsoff and preemptoff 849Again, using this trace is much like the irqsoff and preemptoff
829tracers. 850tracers.
830 851
831 # echo preemptirqsoff > /debug/tracing/current_tracer 852 # echo preemptirqsoff > current_tracer
832 # echo 0 > /debug/tracing/tracing_max_latency 853 # echo 0 > tracing_max_latency
833 # echo 1 > /debug/tracing/tracing_enabled 854 # echo 1 > tracing_enabled
834 # ls -ltr 855 # ls -ltr
835 [...] 856 [...]
836 # echo 0 > /debug/tracing/tracing_enabled 857 # echo 0 > tracing_enabled
837 # cat /debug/tracing/latency_trace 858 # cat latency_trace
838# tracer: preemptirqsoff 859# tracer: preemptirqsoff
839# 860#
840preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 861preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8
@@ -990,12 +1011,12 @@ slightly differently than we did with the previous tracers.
990Instead of performing an 'ls', we will run 'sleep 1' under 1011Instead of performing an 'ls', we will run 'sleep 1' under
991'chrt' which changes the priority of the task. 1012'chrt' which changes the priority of the task.
992 1013
993 # echo wakeup > /debug/tracing/current_tracer 1014 # echo wakeup > current_tracer
994 # echo 0 > /debug/tracing/tracing_max_latency 1015 # echo 0 > tracing_max_latency
995 # echo 1 > /debug/tracing/tracing_enabled 1016 # echo 1 > tracing_enabled
996 # chrt -f 5 sleep 1 1017 # chrt -f 5 sleep 1
997 # echo 0 > /debug/tracing/tracing_enabled 1018 # echo 0 > tracing_enabled
998 # cat /debug/tracing/latency_trace 1019 # cat latency_trace
999# tracer: wakeup 1020# tracer: wakeup
1000# 1021#
1001wakeup latency trace v1.1.5 on 2.6.26-rc8 1022wakeup latency trace v1.1.5 on 2.6.26-rc8
@@ -1105,11 +1126,11 @@ can be done from the debug file system. Make sure the
1105ftrace_enabled is set; otherwise this tracer is a nop. 1126ftrace_enabled is set; otherwise this tracer is a nop.
1106 1127
1107 # sysctl kernel.ftrace_enabled=1 1128 # sysctl kernel.ftrace_enabled=1
1108 # echo function > /debug/tracing/current_tracer 1129 # echo function > current_tracer
1109 # echo 1 > /debug/tracing/tracing_enabled 1130 # echo 1 > tracing_enabled
1110 # usleep 1 1131 # usleep 1
1111 # echo 0 > /debug/tracing/tracing_enabled 1132 # echo 0 > tracing_enabled
1112 # cat /debug/tracing/trace 1133 # cat trace
1113# tracer: function 1134# tracer: function
1114# 1135#
1115# TASK-PID CPU# TIMESTAMP FUNCTION 1136# TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1146,7 +1167,7 @@ int trace_fd;
1146[...] 1167[...]
1147int main(int argc, char *argv[]) { 1168int main(int argc, char *argv[]) {
1148 [...] 1169 [...]
1149 trace_fd = open("/debug/tracing/tracing_enabled", O_WRONLY); 1170 trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY);
1150 [...] 1171 [...]
1151 if (condition_hit()) { 1172 if (condition_hit()) {
1152 write(trace_fd, "0", 1); 1173 write(trace_fd, "0", 1);
@@ -1154,26 +1175,20 @@ int main(int argc, char *argv[]) {
1154 [...] 1175 [...]
1155} 1176}
1156 1177
1157Note: Here we hard coded the path name. The debugfs mount is not
1158guaranteed to be at /debug (and is more commonly at
1159/sys/kernel/debug). For simple one time traces, the above is
1160sufficent. For anything else, a search through /proc/mounts may
1161be needed to find where the debugfs file-system is mounted.
1162
1163 1178
1164Single thread tracing 1179Single thread tracing
1165--------------------- 1180---------------------
1166 1181
1167By writing into /debug/tracing/set_ftrace_pid you can trace a 1182By writing into set_ftrace_pid you can trace a
1168single thread. For example: 1183single thread. For example:
1169 1184
1170# cat /debug/tracing/set_ftrace_pid 1185# cat set_ftrace_pid
1171no pid 1186no pid
1172# echo 3111 > /debug/tracing/set_ftrace_pid 1187# echo 3111 > set_ftrace_pid
1173# cat /debug/tracing/set_ftrace_pid 1188# cat set_ftrace_pid
11743111 11893111
1175# echo function > /debug/tracing/current_tracer 1190# echo function > current_tracer
1176# cat /debug/tracing/trace | head 1191# cat trace | head
1177 # tracer: function 1192 # tracer: function
1178 # 1193 #
1179 # TASK-PID CPU# TIMESTAMP FUNCTION 1194 # TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1184,8 +1199,8 @@ no pid
1184 yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel 1199 yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel
1185 yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll 1200 yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll
1186 yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll 1201 yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll
1187# echo -1 > /debug/tracing/set_ftrace_pid 1202# echo -1 > set_ftrace_pid
1188# cat /debug/tracing/trace |head 1203# cat trace |head
1189 # tracer: function 1204 # tracer: function
1190 # 1205 #
1191 # TASK-PID CPU# TIMESTAMP FUNCTION 1206 # TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1207,6 +1222,51 @@ something like this simple program:
1207#include <fcntl.h> 1222#include <fcntl.h>
1208#include <unistd.h> 1223#include <unistd.h>
1209 1224
1225#define _STR(x) #x
1226#define STR(x) _STR(x)
1227#define MAX_PATH 256
1228
1229const char *find_debugfs(void)
1230{
1231 static char debugfs[MAX_PATH+1];
1232 static int debugfs_found;
1233 char type[100];
1234 FILE *fp;
1235
1236 if (debugfs_found)
1237 return debugfs;
1238
1239 if ((fp = fopen("/proc/mounts","r")) == NULL) {
1240 perror("/proc/mounts");
1241 return NULL;
1242 }
1243
1244 while (fscanf(fp, "%*s %"
1245 STR(MAX_PATH)
1246 "s %99s %*s %*d %*d\n",
1247 debugfs, type) == 2) {
1248 if (strcmp(type, "debugfs") == 0)
1249 break;
1250 }
1251 fclose(fp);
1252
1253 if (strcmp(type, "debugfs") != 0) {
1254 fprintf(stderr, "debugfs not mounted");
1255 return NULL;
1256 }
1257
1258 debugfs_found = 1;
1259
1260 return debugfs;
1261}
1262
1263const char *tracing_file(const char *file_name)
1264{
1265 static char trace_file[MAX_PATH+1];
1266 snprintf(trace_file, MAX_PATH, "%s/%s", find_debugfs(), file_name);
1267 return trace_file;
1268}
1269
1210int main (int argc, char **argv) 1270int main (int argc, char **argv)
1211{ 1271{
1212 if (argc < 1) 1272 if (argc < 1)
@@ -1217,12 +1277,12 @@ int main (int argc, char **argv)
1217 char line[64]; 1277 char line[64];
1218 int s; 1278 int s;
1219 1279
1220 ffd = open("/debug/tracing/current_tracer", O_WRONLY); 1280 ffd = open(tracing_file("current_tracer"), O_WRONLY);
1221 if (ffd < 0) 1281 if (ffd < 0)
1222 exit(-1); 1282 exit(-1);
1223 write(ffd, "nop", 3); 1283 write(ffd, "nop", 3);
1224 1284
1225 fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY); 1285 fd = open(tracing_file("set_ftrace_pid"), O_WRONLY);
1226 s = sprintf(line, "%d\n", getpid()); 1286 s = sprintf(line, "%d\n", getpid());
1227 write(fd, line, s); 1287 write(fd, line, s);
1228 1288
@@ -1374,22 +1434,22 @@ want, depending on your needs.
1374 tracing_cpu_mask file) or you might sometimes see unordered 1434 tracing_cpu_mask file) or you might sometimes see unordered
1375 function calls while cpu tracing switch. 1435 function calls while cpu tracing switch.
1376 1436
1377 hide: echo nofuncgraph-cpu > /debug/tracing/trace_options 1437 hide: echo nofuncgraph-cpu > trace_options
1378 show: echo funcgraph-cpu > /debug/tracing/trace_options 1438 show: echo funcgraph-cpu > trace_options
1379 1439
1380- The duration (function's time of execution) is displayed on 1440- The duration (function's time of execution) is displayed on
1381 the closing bracket line of a function or on the same line 1441 the closing bracket line of a function or on the same line
1382 as the current function in case of a leaf one. It is default 1442 as the current function in case of a leaf one. It is default
1383 enabled. 1443 enabled.
1384 1444
1385 hide: echo nofuncgraph-duration > /debug/tracing/trace_options 1445 hide: echo nofuncgraph-duration > trace_options
1386 show: echo funcgraph-duration > /debug/tracing/trace_options 1446 show: echo funcgraph-duration > trace_options
1387 1447
1388- The overhead field precedes the duration field in case of 1448- The overhead field precedes the duration field in case of
1389 reached duration thresholds. 1449 reached duration thresholds.
1390 1450
1391 hide: echo nofuncgraph-overhead > /debug/tracing/trace_options 1451 hide: echo nofuncgraph-overhead > trace_options
1392 show: echo funcgraph-overhead > /debug/tracing/trace_options 1452 show: echo funcgraph-overhead > trace_options
1393 depends on: funcgraph-duration 1453 depends on: funcgraph-duration
1394 1454
1395 ie: 1455 ie:
@@ -1418,8 +1478,8 @@ want, depending on your needs.
1418- The task/pid field displays the thread cmdline and pid which 1478- The task/pid field displays the thread cmdline and pid which
1419 executed the function. It is default disabled. 1479 executed the function. It is default disabled.
1420 1480
1421 hide: echo nofuncgraph-proc > /debug/tracing/trace_options 1481 hide: echo nofuncgraph-proc > trace_options
1422 show: echo funcgraph-proc > /debug/tracing/trace_options 1482 show: echo funcgraph-proc > trace_options
1423 1483
1424 ie: 1484 ie:
1425 1485
@@ -1442,8 +1502,8 @@ want, depending on your needs.
1442 system clock since it started. A snapshot of this time is 1502 system clock since it started. A snapshot of this time is
1443 given on each entry/exit of functions 1503 given on each entry/exit of functions
1444 1504
1445 hide: echo nofuncgraph-abstime > /debug/tracing/trace_options 1505 hide: echo nofuncgraph-abstime > trace_options
1446 show: echo funcgraph-abstime > /debug/tracing/trace_options 1506 show: echo funcgraph-abstime > trace_options
1447 1507
1448 ie: 1508 ie:
1449 1509
@@ -1540,7 +1600,7 @@ listed in:
1540 1600
1541 available_filter_functions 1601 available_filter_functions
1542 1602
1543 # cat /debug/tracing/available_filter_functions 1603 # cat available_filter_functions
1544put_prev_task_idle 1604put_prev_task_idle
1545kmem_cache_create 1605kmem_cache_create
1546pick_next_task_rt 1606pick_next_task_rt
@@ -1552,12 +1612,12 @@ mutex_lock
1552If I am only interested in sys_nanosleep and hrtimer_interrupt: 1612If I am only interested in sys_nanosleep and hrtimer_interrupt:
1553 1613
1554 # echo sys_nanosleep hrtimer_interrupt \ 1614 # echo sys_nanosleep hrtimer_interrupt \
1555 > /debug/tracing/set_ftrace_filter 1615 > set_ftrace_filter
1556 # echo ftrace > /debug/tracing/current_tracer 1616 # echo ftrace > current_tracer
1557 # echo 1 > /debug/tracing/tracing_enabled 1617 # echo 1 > tracing_enabled
1558 # usleep 1 1618 # usleep 1
1559 # echo 0 > /debug/tracing/tracing_enabled 1619 # echo 0 > tracing_enabled
1560 # cat /debug/tracing/trace 1620 # cat trace
1561# tracer: ftrace 1621# tracer: ftrace
1562# 1622#
1563# TASK-PID CPU# TIMESTAMP FUNCTION 1623# TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1568,7 +1628,7 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt:
1568 1628
1569To see which functions are being traced, you can cat the file: 1629To see which functions are being traced, you can cat the file:
1570 1630
1571 # cat /debug/tracing/set_ftrace_filter 1631 # cat set_ftrace_filter
1572hrtimer_interrupt 1632hrtimer_interrupt
1573sys_nanosleep 1633sys_nanosleep
1574 1634
@@ -1588,7 +1648,7 @@ Note: It is better to use quotes to enclose the wild cards,
1588 otherwise the shell may expand the parameters into names 1648 otherwise the shell may expand the parameters into names
1589 of files in the local directory. 1649 of files in the local directory.
1590 1650
1591 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter 1651 # echo 'hrtimer_*' > set_ftrace_filter
1592 1652
1593Produces: 1653Produces:
1594 1654
@@ -1609,7 +1669,7 @@ Produces:
1609 1669
1610Notice that we lost the sys_nanosleep. 1670Notice that we lost the sys_nanosleep.
1611 1671
1612 # cat /debug/tracing/set_ftrace_filter 1672 # cat set_ftrace_filter
1613hrtimer_run_queues 1673hrtimer_run_queues
1614hrtimer_run_pending 1674hrtimer_run_pending
1615hrtimer_init 1675hrtimer_init
@@ -1635,17 +1695,17 @@ To append to the filters, use '>>'
1635To clear out a filter so that all functions will be recorded 1695To clear out a filter so that all functions will be recorded
1636again: 1696again:
1637 1697
1638 # echo > /debug/tracing/set_ftrace_filter 1698 # echo > set_ftrace_filter
1639 # cat /debug/tracing/set_ftrace_filter 1699 # cat set_ftrace_filter
1640 # 1700 #
1641 1701
1642Again, now we want to append. 1702Again, now we want to append.
1643 1703
1644 # echo sys_nanosleep > /debug/tracing/set_ftrace_filter 1704 # echo sys_nanosleep > set_ftrace_filter
1645 # cat /debug/tracing/set_ftrace_filter 1705 # cat set_ftrace_filter
1646sys_nanosleep 1706sys_nanosleep
1647 # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter 1707 # echo 'hrtimer_*' >> set_ftrace_filter
1648 # cat /debug/tracing/set_ftrace_filter 1708 # cat set_ftrace_filter
1649hrtimer_run_queues 1709hrtimer_run_queues
1650hrtimer_run_pending 1710hrtimer_run_pending
1651hrtimer_init 1711hrtimer_init
@@ -1668,7 +1728,7 @@ hrtimer_init_sleeper
1668The set_ftrace_notrace prevents those functions from being 1728The set_ftrace_notrace prevents those functions from being
1669traced. 1729traced.
1670 1730
1671 # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace 1731 # echo '*preempt*' '*lock*' > set_ftrace_notrace
1672 1732
1673Produces: 1733Produces:
1674 1734
@@ -1758,13 +1818,13 @@ the effect on the tracing is different. Every read from
1758trace_pipe is consumed. This means that subsequent reads will be 1818trace_pipe is consumed. This means that subsequent reads will be
1759different. The trace is live. 1819different. The trace is live.
1760 1820
1761 # echo function > /debug/tracing/current_tracer 1821 # echo function > current_tracer
1762 # cat /debug/tracing/trace_pipe > /tmp/trace.out & 1822 # cat trace_pipe > /tmp/trace.out &
1763[1] 4153 1823[1] 4153
1764 # echo 1 > /debug/tracing/tracing_enabled 1824 # echo 1 > tracing_enabled
1765 # usleep 1 1825 # usleep 1
1766 # echo 0 > /debug/tracing/tracing_enabled 1826 # echo 0 > tracing_enabled
1767 # cat /debug/tracing/trace 1827 # cat trace
1768# tracer: function 1828# tracer: function
1769# 1829#
1770# TASK-PID CPU# TIMESTAMP FUNCTION 1830# TASK-PID CPU# TIMESTAMP FUNCTION
@@ -1800,7 +1860,7 @@ number listed is the number of entries that can be recorded per
1800CPU. To know the full size, multiply the number of possible CPUS 1860CPU. To know the full size, multiply the number of possible CPUS
1801with the number of entries. 1861with the number of entries.
1802 1862
1803 # cat /debug/tracing/buffer_size_kb 1863 # cat buffer_size_kb
18041408 (units kilobytes) 18641408 (units kilobytes)
1805 1865
1806Note, to modify this, you must have tracing completely disabled. 1866Note, to modify this, you must have tracing completely disabled.
@@ -1808,21 +1868,21 @@ To do that, echo "nop" into the current_tracer. If the
1808current_tracer is not set to "nop", an EINVAL error will be 1868current_tracer is not set to "nop", an EINVAL error will be
1809returned. 1869returned.
1810 1870
1811 # echo nop > /debug/tracing/current_tracer 1871 # echo nop > current_tracer
1812 # echo 10000 > /debug/tracing/buffer_size_kb 1872 # echo 10000 > buffer_size_kb
1813 # cat /debug/tracing/buffer_size_kb 1873 # cat buffer_size_kb
181410000 (units kilobytes) 187410000 (units kilobytes)
1815 1875
1816The number of pages which will be allocated is limited to a 1876The number of pages which will be allocated is limited to a
1817percentage of available memory. Allocating too much will produce 1877percentage of available memory. Allocating too much will produce
1818an error. 1878an error.
1819 1879
1820 # echo 1000000000000 > /debug/tracing/buffer_size_kb 1880 # echo 1000000000000 > buffer_size_kb
1821-bash: echo: write error: Cannot allocate memory 1881-bash: echo: write error: Cannot allocate memory
1822 # cat /debug/tracing/buffer_size_kb 1882 # cat buffer_size_kb
182385 188385
1824 1884
1825----------- 1885-----------
1826 1886
1827More details can be found in the source code, in the 1887More details can be found in the source code, in the
1828kernel/tracing/*.c files. 1888kernel/trace/*.c files.
diff --git a/Documentation/trace/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
index a956d9b7f943..6308735e58ca 100644
--- a/Documentation/trace/kmemtrace.txt
+++ b/Documentation/trace/kmemtrace.txt
@@ -64,7 +64,7 @@ III. Quick usage guide
64CONFIG_KMEMTRACE). 64CONFIG_KMEMTRACE).
65 65
662) Get the userspace tool and build it: 662) Get the userspace tool and build it:
67$ git-clone git://repo.or.cz/kmemtrace-user.git # current repository 67$ git clone git://repo.or.cz/kmemtrace-user.git # current repository
68$ cd kmemtrace-user/ 68$ cd kmemtrace-user/
69$ ./autogen.sh 69$ ./autogen.sh
70$ ./configure 70$ ./configure
diff --git a/Documentation/trace/mmiotrace.txt b/Documentation/trace/mmiotrace.txt
index 5731c67abc55..162effbfbdec 100644
--- a/Documentation/trace/mmiotrace.txt
+++ b/Documentation/trace/mmiotrace.txt
@@ -32,41 +32,41 @@ is no way to automatically detect if you are losing events due to CPUs racing.
32Usage Quick Reference 32Usage Quick Reference
33--------------------- 33---------------------
34 34
35$ mount -t debugfs debugfs /debug 35$ mount -t debugfs debugfs /sys/kernel/debug
36$ echo mmiotrace > /debug/tracing/current_tracer 36$ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
37$ cat /debug/tracing/trace_pipe > mydump.txt & 37$ cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt &
38Start X or whatever. 38Start X or whatever.
39$ echo "X is up" > /debug/tracing/trace_marker 39$ echo "X is up" > /sys/kernel/debug/tracing/trace_marker
40$ echo nop > /debug/tracing/current_tracer 40$ echo nop > /sys/kernel/debug/tracing/current_tracer
41Check for lost events. 41Check for lost events.
42 42
43 43
44Usage 44Usage
45----- 45-----
46 46
47Make sure debugfs is mounted to /debug. If not, (requires root privileges) 47Make sure debugfs is mounted to /sys/kernel/debug. If not, (requires root privileges)
48$ mount -t debugfs debugfs /debug 48$ mount -t debugfs debugfs /sys/kernel/debug
49 49
50Check that the driver you are about to trace is not loaded. 50Check that the driver you are about to trace is not loaded.
51 51
52Activate mmiotrace (requires root privileges): 52Activate mmiotrace (requires root privileges):
53$ echo mmiotrace > /debug/tracing/current_tracer 53$ echo mmiotrace > /sys/kernel/debug/tracing/current_tracer
54 54
55Start storing the trace: 55Start storing the trace:
56$ cat /debug/tracing/trace_pipe > mydump.txt & 56$ cat /sys/kernel/debug/tracing/trace_pipe > mydump.txt &
57The 'cat' process should stay running (sleeping) in the background. 57The 'cat' process should stay running (sleeping) in the background.
58 58
59Load the driver you want to trace and use it. Mmiotrace will only catch MMIO 59Load the driver you want to trace and use it. Mmiotrace will only catch MMIO
60accesses to areas that are ioremapped while mmiotrace is active. 60accesses to areas that are ioremapped while mmiotrace is active.
61 61
62During tracing you can place comments (markers) into the trace by 62During tracing you can place comments (markers) into the trace by
63$ echo "X is up" > /debug/tracing/trace_marker 63$ echo "X is up" > /sys/kernel/debug/tracing/trace_marker
64This makes it easier to see which part of the (huge) trace corresponds to 64This makes it easier to see which part of the (huge) trace corresponds to
65which action. It is recommended to place descriptive markers about what you 65which action. It is recommended to place descriptive markers about what you
66do. 66do.
67 67
68Shut down mmiotrace (requires root privileges): 68Shut down mmiotrace (requires root privileges):
69$ echo nop > /debug/tracing/current_tracer 69$ echo nop > /sys/kernel/debug/tracing/current_tracer
70The 'cat' process exits. If it does not, kill it by issuing 'fg' command and 70The 'cat' process exits. If it does not, kill it by issuing 'fg' command and
71pressing ctrl+c. 71pressing ctrl+c.
72 72
@@ -78,10 +78,10 @@ to view your kernel log and look for "mmiotrace has lost events" warning. If
78events were lost, the trace is incomplete. You should enlarge the buffers and 78events were lost, the trace is incomplete. You should enlarge the buffers and
79try again. Buffers are enlarged by first seeing how large the current buffers 79try again. Buffers are enlarged by first seeing how large the current buffers
80are: 80are:
81$ cat /debug/tracing/buffer_size_kb 81$ cat /sys/kernel/debug/tracing/buffer_size_kb
82gives you a number. Approximately double this number and write it back, for 82gives you a number. Approximately double this number and write it back, for
83instance: 83instance:
84$ echo 128000 > /debug/tracing/buffer_size_kb 84$ echo 128000 > /sys/kernel/debug/tracing/buffer_size_kb
85Then start again from the top. 85Then start again from the top.
86 86
87If you are doing a trace for a driver project, e.g. Nouveau, you should also 87If you are doing a trace for a driver project, e.g. Nouveau, you should also
diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt
new file mode 100644
index 000000000000..cd805e16dc27
--- /dev/null
+++ b/Documentation/trace/power.txt
@@ -0,0 +1,17 @@
1The power tracer collects detailed information about C-state and P-state
2transitions, instead of just looking at the high-level "average"
3information.
4
5There is a helper script found in scripts/tracing/power.pl in the kernel
6sources which can be used to parse this information and create a
7Scalable Vector Graphics (SVG) picture from the trace data.
8
9To use this tracer:
10
11 echo 0 > /sys/kernel/debug/tracing/tracing_enabled
12 echo power > /sys/kernel/debug/tracing/current_tracer
13 echo 1 > /sys/kernel/debug/tracing/tracing_enabled
14 sleep 1
15 echo 0 > /sys/kernel/debug/tracing/tracing_enabled
16 cat /sys/kernel/debug/tracing/trace | \
17 perl scripts/tracing/power.pl > out.svg
diff --git a/Documentation/usb/WUSB-Design-overview.txt b/Documentation/usb/WUSB-Design-overview.txt
index 4c3d62c7843a..c480e9c32dbd 100644
--- a/Documentation/usb/WUSB-Design-overview.txt
+++ b/Documentation/usb/WUSB-Design-overview.txt
@@ -84,7 +84,7 @@ The different logical parts of this driver are:
84 84
85 *UWB*: the Ultra-Wide-Band stack -- manages the radio and 85 *UWB*: the Ultra-Wide-Band stack -- manages the radio and
86 associated spectrum to allow for devices sharing it. Allows to 86 associated spectrum to allow for devices sharing it. Allows to
87 control bandwidth assingment, beaconing, scanning, etc 87 control bandwidth assignment, beaconing, scanning, etc
88 88
89 * 89 *
90 90
@@ -184,7 +184,7 @@ and sends the replies and notifications back to the API
184[/uwb_rc_neh_grok()/]. Notifications are handled to the UWB daemon, that 184[/uwb_rc_neh_grok()/]. Notifications are handled to the UWB daemon, that
185is chartered, among other things, to keep the tab of how the UWB radio 185is chartered, among other things, to keep the tab of how the UWB radio
186neighborhood looks, creating and destroying devices as they show up or 186neighborhood looks, creating and destroying devices as they show up or
187dissapear. 187disappear.
188 188
189Command execution is very simple: a command block is sent and a event 189Command execution is very simple: a command block is sent and a event
190block or reply is expected back. For sending/receiving command/events, a 190block or reply is expected back. For sending/receiving command/events, a
@@ -333,7 +333,7 @@ read descriptors and move our data.
333 333
334*Device life cycle and keep alives* 334*Device life cycle and keep alives*
335 335
336Everytime there is a succesful transfer to/from a device, we update a 336Every time there is a successful transfer to/from a device, we update a
337per-device activity timestamp. If not, every now and then we check and 337per-device activity timestamp. If not, every now and then we check and
338if the activity timestamp gets old, we ping the device by sending it a 338if the activity timestamp gets old, we ping the device by sending it a
339Keep Alive IE; it responds with a /DN_Alive/ pong during the DNTS (this 339Keep Alive IE; it responds with a /DN_Alive/ pong during the DNTS (this
@@ -411,7 +411,7 @@ context (wa_xfer) and submit it. When the xfer is done, our callback is
411called and we assign the status bits and release the xfer resources. 411called and we assign the status bits and release the xfer resources.
412 412
413In dequeue() we are basically cancelling/aborting the transfer. We issue 413In dequeue() we are basically cancelling/aborting the transfer. We issue
414a xfer abort request to the HC, cancell all the URBs we had submitted 414a xfer abort request to the HC, cancel all the URBs we had submitted
415and not yet done and when all that is done, the xfer callback will be 415and not yet done and when all that is done, the xfer callback will be
416called--this will call the URB callback. 416called--this will call the URB callback.
417 417
diff --git a/Documentation/usb/anchors.txt b/Documentation/usb/anchors.txt
index 6f24f566955a..fe6a99a32bbd 100644
--- a/Documentation/usb/anchors.txt
+++ b/Documentation/usb/anchors.txt
@@ -27,7 +27,7 @@ Association and disassociation of URBs with anchors
27 27
28An association of URBs to an anchor is made by an explicit 28An association of URBs to an anchor is made by an explicit
29call to usb_anchor_urb(). The association is maintained until 29call to usb_anchor_urb(). The association is maintained until
30an URB is finished by (successfull) completion. Thus disassociation 30an URB is finished by (successful) completion. Thus disassociation
31is automatic. A function is provided to forcibly finish (kill) 31is automatic. A function is provided to forcibly finish (kill)
32all URBs associated with an anchor. 32all URBs associated with an anchor.
33Furthermore, disassociation can be made with usb_unanchor_urb() 33Furthermore, disassociation can be made with usb_unanchor_urb()
@@ -76,4 +76,4 @@ usb_get_from_anchor()
76Returns the oldest anchored URB of an anchor. The URB is unanchored 76Returns the oldest anchored URB of an anchor. The URB is unanchored
77and returned with a reference. As you may mix URBs to several 77and returned with a reference. As you may mix URBs to several
78destinations in one anchor you have no guarantee the chronologically 78destinations in one anchor you have no guarantee the chronologically
79first submitted URB is returned. \ No newline at end of file 79first submitted URB is returned.
diff --git a/Documentation/usb/callbacks.txt b/Documentation/usb/callbacks.txt
index 7c812411945b..bfb36b34b79e 100644
--- a/Documentation/usb/callbacks.txt
+++ b/Documentation/usb/callbacks.txt
@@ -65,7 +65,7 @@ Accept or decline an interface. If you accept the device return 0,
65otherwise -ENODEV or -ENXIO. Other error codes should be used only if a 65otherwise -ENODEV or -ENXIO. Other error codes should be used only if a
66genuine error occurred during initialisation which prevented a driver 66genuine error occurred during initialisation which prevented a driver
67from accepting a device that would else have been accepted. 67from accepting a device that would else have been accepted.
68You are strongly encouraged to use usbcore'sfacility, 68You are strongly encouraged to use usbcore's facility,
69usb_set_intfdata(), to associate a data structure with an interface, so 69usb_set_intfdata(), to associate a data structure with an interface, so
70that you know which internal state and identity you associate with a 70that you know which internal state and identity you associate with a
71particular interface. The device will not be suspended and you may do IO 71particular interface. The device will not be suspended and you may do IO
diff --git a/Documentation/video4linux/CARDLIST.cx23885 b/Documentation/video4linux/CARDLIST.cx23885
index 91aa3c0f0dd2..450b8f8c389b 100644
--- a/Documentation/video4linux/CARDLIST.cx23885
+++ b/Documentation/video4linux/CARDLIST.cx23885
@@ -16,3 +16,8 @@
16 15 -> TeVii S470 [d470:9022] 16 15 -> TeVii S470 [d470:9022]
17 16 -> DVBWorld DVB-S2 2005 [0001:2005] 17 16 -> DVBWorld DVB-S2 2005 [0001:2005]
18 17 -> NetUP Dual DVB-S2 CI [1b55:2a2c] 18 17 -> NetUP Dual DVB-S2 CI [1b55:2a2c]
19 18 -> Hauppauge WinTV-HVR1270 [0070:2211]
20 19 -> Hauppauge WinTV-HVR1275 [0070:2215]
21 20 -> Hauppauge WinTV-HVR1255 [0070:2251]
22 21 -> Hauppauge WinTV-HVR1210 [0070:2291,0070:2295]
23 22 -> Mygica X8506 DMB-TH [14f1:8651]
diff --git a/Documentation/video4linux/CARDLIST.cx88 b/Documentation/video4linux/CARDLIST.cx88
index 71e9db0b26f7..0736518b2f88 100644
--- a/Documentation/video4linux/CARDLIST.cx88
+++ b/Documentation/video4linux/CARDLIST.cx88
@@ -6,8 +6,8 @@
6 5 -> Leadtek Winfast 2000XP Expert [107d:6611,107d:6613] 6 5 -> Leadtek Winfast 2000XP Expert [107d:6611,107d:6613]
7 6 -> AverTV Studio 303 (M126) [1461:000b] 7 6 -> AverTV Studio 303 (M126) [1461:000b]
8 7 -> MSI TV-@nywhere Master [1462:8606] 8 7 -> MSI TV-@nywhere Master [1462:8606]
9 8 -> Leadtek Winfast DV2000 [107d:6620] 9 8 -> Leadtek Winfast DV2000 [107d:6620,107d:6621]
10 9 -> Leadtek PVR 2000 [107d:663b,107d:663c,107d:6632] 10 9 -> Leadtek PVR 2000 [107d:663b,107d:663c,107d:6632,107d:6630,107d:6638,107d:6631,107d:6637,107d:663d]
11 10 -> IODATA GV-VCP3/PCI [10fc:d003] 11 10 -> IODATA GV-VCP3/PCI [10fc:d003]
12 11 -> Prolink PlayTV PVR 12 11 -> Prolink PlayTV PVR
13 12 -> ASUS PVR-416 [1043:4823,1461:c111] 13 12 -> ASUS PVR-416 [1043:4823,1461:c111]
@@ -59,7 +59,7 @@
59 58 -> Pinnacle PCTV HD 800i [11bd:0051] 59 58 -> Pinnacle PCTV HD 800i [11bd:0051]
60 59 -> DViCO FusionHDTV 5 PCI nano [18ac:d530] 60 59 -> DViCO FusionHDTV 5 PCI nano [18ac:d530]
61 60 -> Pinnacle Hybrid PCTV [12ab:1788] 61 60 -> Pinnacle Hybrid PCTV [12ab:1788]
62 61 -> Winfast TV2000 XP Global [107d:6f18] 62 61 -> Leadtek TV2000 XP Global [107d:6f18,107d:6618]
63 62 -> PowerColor RA330 [14f1:ea3d] 63 62 -> PowerColor RA330 [14f1:ea3d]
64 63 -> Geniatech X8000-MT DVBT [14f1:8852] 64 63 -> Geniatech X8000-MT DVBT [14f1:8852]
65 64 -> DViCO FusionHDTV DVB-T PRO [18ac:db30] 65 64 -> DViCO FusionHDTV DVB-T PRO [18ac:db30]
@@ -78,3 +78,5 @@
78 77 -> TBS 8910 DVB-S [8910:8888] 78 77 -> TBS 8910 DVB-S [8910:8888]
79 78 -> Prof 6200 DVB-S [b022:3022] 79 78 -> Prof 6200 DVB-S [b022:3022]
80 79 -> Terratec Cinergy HT PCI MKII [153b:1177] 80 79 -> Terratec Cinergy HT PCI MKII [153b:1177]
81 80 -> Hauppauge WinTV-IR Only [0070:9290]
82 81 -> Leadtek WinFast DTV1800 Hybrid [107d:6654]
diff --git a/Documentation/video4linux/CARDLIST.em28xx b/Documentation/video4linux/CARDLIST.em28xx
index 78d0a6eed571..873630e7e53e 100644
--- a/Documentation/video4linux/CARDLIST.em28xx
+++ b/Documentation/video4linux/CARDLIST.em28xx
@@ -17,7 +17,7 @@
17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b] 17 16 -> Hauppauge WinTV HVR 950 (em2883) [2040:6513,2040:6517,2040:651b]
18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227] 18 17 -> Pinnacle PCTV HD Pro Stick (em2880) [2304:0227]
19 18 -> Hauppauge WinTV HVR 900 (R2) (em2880) [2040:6502] 19 18 -> Hauppauge WinTV HVR 900 (R2) (em2880) [2040:6502]
20 19 -> PointNix Intra-Oral Camera (em2860) 20 19 -> EM2860/SAA711X Reference Design (em2860)
21 20 -> AMD ATI TV Wonder HD 600 (em2880) [0438:b002] 21 20 -> AMD ATI TV Wonder HD 600 (em2880) [0438:b002]
22 21 -> eMPIA Technology, Inc. GrabBeeX+ Video Encoder (em2800) [eb1a:2801] 22 21 -> eMPIA Technology, Inc. GrabBeeX+ Video Encoder (em2800) [eb1a:2801]
23 22 -> Unknown EM2750/EM2751 webcam grabber (em2750) [eb1a:2750,eb1a:2751] 23 22 -> Unknown EM2750/EM2751 webcam grabber (em2750) [eb1a:2750,eb1a:2751]
@@ -61,3 +61,8 @@
61 63 -> Kaiomy TVnPC U2 (em2860) [eb1a:e303] 61 63 -> Kaiomy TVnPC U2 (em2860) [eb1a:e303]
62 64 -> Easy Cap Capture DC-60 (em2860) 62 64 -> Easy Cap Capture DC-60 (em2860)
63 65 -> IO-DATA GV-MVP/SZ (em2820/em2840) [04bb:0515] 63 65 -> IO-DATA GV-MVP/SZ (em2820/em2840) [04bb:0515]
64 66 -> Empire dual TV (em2880)
65 67 -> Terratec Grabby (em2860) [0ccd:0096]
66 68 -> Terratec AV350 (em2860) [0ccd:0084]
67 69 -> KWorld ATSC 315U HDTV TV Box (em2882) [eb1a:a313]
68 70 -> Evga inDtube (em2882)
diff --git a/Documentation/video4linux/CARDLIST.saa7134 b/Documentation/video4linux/CARDLIST.saa7134
index 6dacf2825259..15562427e8a9 100644
--- a/Documentation/video4linux/CARDLIST.saa7134
+++ b/Documentation/video4linux/CARDLIST.saa7134
@@ -124,10 +124,10 @@
124123 -> Beholder BeholdTV 407 [0000:4070] 124123 -> Beholder BeholdTV 407 [0000:4070]
125124 -> Beholder BeholdTV 407 FM [0000:4071] 125124 -> Beholder BeholdTV 407 FM [0000:4071]
126125 -> Beholder BeholdTV 409 [0000:4090] 126125 -> Beholder BeholdTV 409 [0000:4090]
127126 -> Beholder BeholdTV 505 FM/RDS [0000:5051,0000:505B,5ace:5050] 127126 -> Beholder BeholdTV 505 FM [5ace:5050]
128127 -> Beholder BeholdTV 507 FM/RDS / BeholdTV 509 FM [0000:5071,0000:507B,5ace:5070,5ace:5090] 128127 -> Beholder BeholdTV 507 FM / BeholdTV 509 FM [5ace:5070,5ace:5090]
129128 -> Beholder BeholdTV Columbus TVFM [0000:5201] 129128 -> Beholder BeholdTV Columbus TVFM [0000:5201]
130129 -> Beholder BeholdTV 607 / BeholdTV 609 [5ace:6070,5ace:6071,5ace:6072,5ace:6073,5ace:6090,5ace:6091,5ace:6092,5ace:6093] 130129 -> Beholder BeholdTV 607 FM [5ace:6070]
131130 -> Beholder BeholdTV M6 [5ace:6190] 131130 -> Beholder BeholdTV M6 [5ace:6190]
132131 -> Twinhan Hybrid DTV-DVB 3056 PCI [1822:0022] 132131 -> Twinhan Hybrid DTV-DVB 3056 PCI [1822:0022]
133132 -> Genius TVGO AM11MCE 133132 -> Genius TVGO AM11MCE
@@ -143,7 +143,7 @@
143142 -> Beholder BeholdTV H6 [5ace:6290] 143142 -> Beholder BeholdTV H6 [5ace:6290]
144143 -> Beholder BeholdTV M63 [5ace:6191] 144143 -> Beholder BeholdTV M63 [5ace:6191]
145144 -> Beholder BeholdTV M6 Extra [5ace:6193] 145144 -> Beholder BeholdTV M6 Extra [5ace:6193]
146145 -> AVerMedia MiniPCI DVB-T Hybrid M103 [1461:f636] 146145 -> AVerMedia MiniPCI DVB-T Hybrid M103 [1461:f636,1461:f736]
147146 -> ASUSTeK P7131 Analog 147146 -> ASUSTeK P7131 Analog
148147 -> Asus Tiger 3in1 [1043:4878] 148147 -> Asus Tiger 3in1 [1043:4878]
149148 -> Encore ENLTV-FM v5.3 [1a7f:2008] 149148 -> Encore ENLTV-FM v5.3 [1a7f:2008]
@@ -154,4 +154,16 @@
154153 -> Kworld Plus TV Analog Lite PCI [17de:7128] 154153 -> Kworld Plus TV Analog Lite PCI [17de:7128]
155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d] 155154 -> Avermedia AVerTV GO 007 FM Plus [1461:f31d]
156155 -> Hauppauge WinTV-HVR1120 ATSC/QAM-Hybrid [0070:6706,0070:6708] 156155 -> Hauppauge WinTV-HVR1120 ATSC/QAM-Hybrid [0070:6706,0070:6708]
157156 -> Hauppauge WinTV-HVR1110r3 [0070:6707,0070:6709,0070:670a] 157156 -> Hauppauge WinTV-HVR1110r3 DVB-T/Hybrid [0070:6707,0070:6709,0070:670a]
158157 -> Avermedia AVerTV Studio 507UA [1461:a11b]
159158 -> AVerMedia Cardbus TV/Radio (E501R) [1461:b7e9]
160159 -> Beholder BeholdTV 505 RDS [0000:505B]
161160 -> Beholder BeholdTV 507 RDS [0000:5071]
162161 -> Beholder BeholdTV 507 RDS [0000:507B]
163162 -> Beholder BeholdTV 607 FM [5ace:6071]
164163 -> Beholder BeholdTV 609 FM [5ace:6090]
165164 -> Beholder BeholdTV 609 FM [5ace:6091]
166165 -> Beholder BeholdTV 607 RDS [5ace:6072]
167166 -> Beholder BeholdTV 607 RDS [5ace:6073]
168167 -> Beholder BeholdTV 609 RDS [5ace:6092]
169168 -> Beholder BeholdTV 609 RDS [5ace:6093]
diff --git a/Documentation/video4linux/CARDLIST.tuner b/Documentation/video4linux/CARDLIST.tuner
index 691d2f37dc57..be67844074dd 100644
--- a/Documentation/video4linux/CARDLIST.tuner
+++ b/Documentation/video4linux/CARDLIST.tuner
@@ -76,3 +76,5 @@ tuner=75 - Philips TEA5761 FM Radio
76tuner=76 - Xceive 5000 tuner 76tuner=76 - Xceive 5000 tuner
77tuner=77 - TCL tuner MF02GIP-5N-E 77tuner=77 - TCL tuner MF02GIP-5N-E
78tuner=78 - Philips FMD1216MEX MK3 Hybrid Tuner 78tuner=78 - Philips FMD1216MEX MK3 Hybrid Tuner
79tuner=79 - Philips PAL/SECAM multi (FM1216 MK5)
80tuner=80 - Philips FQ1216LME MK3 PAL/SECAM w/active loopthrough
diff --git a/Documentation/video4linux/cx18.txt b/Documentation/video4linux/cx18.txt
index 914cb7e734a2..4652c0f5da32 100644
--- a/Documentation/video4linux/cx18.txt
+++ b/Documentation/video4linux/cx18.txt
@@ -11,7 +11,7 @@ encoder chip:
112) Some people have problems getting the i2c bus to work. 112) Some people have problems getting the i2c bus to work.
12 The symptom is that the eeprom cannot be read and the card is 12 The symptom is that the eeprom cannot be read and the card is
13 unusable. This is probably fixed, but if you have problems 13 unusable. This is probably fixed, but if you have problems
14 then post to the video4linux or ivtv-users mailinglist. 14 then post to the video4linux or ivtv-users mailing list.
15 15
163) VBI (raw or sliced) has not yet been implemented. 163) VBI (raw or sliced) has not yet been implemented.
17 17
diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt
index 98529e03a46e..2bcf78896e22 100644
--- a/Documentation/video4linux/gspca.txt
+++ b/Documentation/video4linux/gspca.txt
@@ -163,10 +163,11 @@ sunplus 055f:c650 Mustek MDC5500Z
163zc3xx 055f:d003 Mustek WCam300A 163zc3xx 055f:d003 Mustek WCam300A
164zc3xx 055f:d004 Mustek WCam300 AN 164zc3xx 055f:d004 Mustek WCam300 AN
165conex 0572:0041 Creative Notebook cx11646 165conex 0572:0041 Creative Notebook cx11646
166ov519 05a9:0519 OmniVision 166ov519 05a9:0519 OV519 Microphone
167ov519 05a9:0530 OmniVision 167ov519 05a9:0530 OmniVision
168ov519 05a9:4519 OmniVision 168ov519 05a9:4519 Webcam Classic
169ov519 05a9:8519 OmniVision 169ov519 05a9:8519 OmniVision
170ov519 05a9:a518 D-Link DSB-C310 Webcam
170sunplus 05da:1018 Digital Dream Enigma 1.3 171sunplus 05da:1018 Digital Dream Enigma 1.3
171stk014 05e1:0893 Syntek DV4000 172stk014 05e1:0893 Syntek DV4000
172spca561 060b:a001 Maxell Compact Pc PM3 173spca561 060b:a001 Maxell Compact Pc PM3
@@ -178,6 +179,7 @@ spca506 06e1:a190 ADS Instant VCD
178ov534 06f8:3002 Hercules Blog Webcam 179ov534 06f8:3002 Hercules Blog Webcam
179ov534 06f8:3003 Hercules Dualpix HD Weblog 180ov534 06f8:3003 Hercules Dualpix HD Weblog
180sonixj 06f8:3004 Hercules Classic Silver 181sonixj 06f8:3004 Hercules Classic Silver
182sonixj 06f8:3008 Hercules Deluxe Optical Glass
181spca508 0733:0110 ViewQuest VQ110 183spca508 0733:0110 ViewQuest VQ110
182spca508 0130:0130 Clone Digital Webcam 11043 184spca508 0130:0130 Clone Digital Webcam 11043
183spca501 0733:0401 Intel Create and Share 185spca501 0733:0401 Intel Create and Share
@@ -209,6 +211,7 @@ sunplus 08ca:2050 Medion MD 41437
209sunplus 08ca:2060 Aiptek PocketDV5300 211sunplus 08ca:2060 Aiptek PocketDV5300
210tv8532 0923:010f ICM532 cams 212tv8532 0923:010f ICM532 cams
211mars 093a:050f Mars-Semi Pc-Camera 213mars 093a:050f Mars-Semi Pc-Camera
214mr97310a 093a:010f Sakar Digital no. 77379
212pac207 093a:2460 Qtec Webcam 100 215pac207 093a:2460 Qtec Webcam 100
213pac207 093a:2461 HP Webcam 216pac207 093a:2461 HP Webcam
214pac207 093a:2463 Philips SPC 220 NC 217pac207 093a:2463 Philips SPC 220 NC
@@ -265,6 +268,11 @@ sonixj 0c45:60ec SN9C105+MO4000
265sonixj 0c45:60fb Surfer NoName 268sonixj 0c45:60fb Surfer NoName
266sonixj 0c45:60fc LG-LIC300 269sonixj 0c45:60fc LG-LIC300
267sonixj 0c45:60fe Microdia Audio 270sonixj 0c45:60fe Microdia Audio
271sonixj 0c45:6100 PC Camera (SN9C128)
272sonixj 0c45:610a PC Camera (SN9C128)
273sonixj 0c45:610b PC Camera (SN9C128)
274sonixj 0c45:610c PC Camera (SN9C128)
275sonixj 0c45:610e PC Camera (SN9C128)
268sonixj 0c45:6128 Microdia/Sonix SNP325 276sonixj 0c45:6128 Microdia/Sonix SNP325
269sonixj 0c45:612a Avant Camera 277sonixj 0c45:612a Avant Camera
270sonixj 0c45:612c Typhoon Rasy Cam 1.3MPix 278sonixj 0c45:612c Typhoon Rasy Cam 1.3MPix
diff --git a/Documentation/video4linux/pxa_camera.txt b/Documentation/video4linux/pxa_camera.txt
index b1137f9a53eb..4f6d0ca01956 100644
--- a/Documentation/video4linux/pxa_camera.txt
+++ b/Documentation/video4linux/pxa_camera.txt
@@ -26,6 +26,55 @@ Global video workflow
26 26
27 Once the last buffer is filled in, the QCI interface stops. 27 Once the last buffer is filled in, the QCI interface stops.
28 28
29 c) Capture global finite state machine schema
30
31 +----+ +---+ +----+
32 | DQ | | Q | | DQ |
33 | v | v | v
34 +-----------+ +------------------------+
35 | STOP | | Wait for capture start |
36 +-----------+ Q +------------------------+
37+-> | QCI: stop | ------------------> | QCI: run | <------------+
38| | DMA: stop | | DMA: stop | |
39| +-----------+ +-----> +------------------------+ |
40| / | |
41| / +---+ +----+ | |
42|capture list empty / | Q | | DQ | | QCI Irq EOF |
43| / | v | v v |
44| +--------------------+ +----------------------+ |
45| | DMA hotlink missed | | Capture running | |
46| +--------------------+ +----------------------+ |
47| | QCI: run | +-----> | QCI: run | <-+ |
48| | DMA: stop | / | DMA: run | | |
49| +--------------------+ / +----------------------+ | Other |
50| ^ /DMA still | | channels |
51| | capture list / running | DMA Irq End | not |
52| | not empty / | | finished |
53| | / v | yet |
54| +----------------------+ +----------------------+ | |
55| | Videobuf released | | Channel completed | | |
56| +----------------------+ +----------------------+ | |
57+-- | QCI: run | | QCI: run | --+ |
58 | DMA: run | | DMA: run | |
59 +----------------------+ +----------------------+ |
60 ^ / | |
61 | no overrun / | overrun |
62 | / v |
63 +--------------------+ / +----------------------+ |
64 | Frame completed | / | Frame overran | |
65 +--------------------+ <-----+ +----------------------+ restart frame |
66 | QCI: run | | QCI: stop | --------------+
67 | DMA: run | | DMA: stop |
68 +--------------------+ +----------------------+
69
70 Legend: - each box is a FSM state
71 - each arrow is the condition to transition to another state
72 - an arrow with a comment is a mandatory transition (no condition)
73 - arrow "Q" means : a buffer was enqueued
74 - arrow "DQ" means : a buffer was dequeued
75 - "QCI: stop" means the QCI interface is not enabled
76 - "DMA: stop" means all 3 DMA channels are stopped
77 - "DMA: run" means at least 1 DMA channel is still running
29 78
30DMA usage 79DMA usage
31--------- 80---------
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt
index 854808b67fae..ba4706afc5fb 100644
--- a/Documentation/video4linux/v4l2-framework.txt
+++ b/Documentation/video4linux/v4l2-framework.txt
@@ -89,6 +89,11 @@ from dev (driver name followed by the bus_id, to be precise). If you set it
89up before calling v4l2_device_register then it will be untouched. If dev is 89up before calling v4l2_device_register then it will be untouched. If dev is
90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register. 90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register.
91 91
92You can use v4l2_device_set_name() to set the name based on a driver name and
93a driver-global atomic_t instance. This will generate names like ivtv0, ivtv1,
94etc. If the name ends with a digit, then it will insert a dash: cx18-0,
95cx18-1, etc. This function returns the instance number.
96
92The first 'dev' argument is normally the struct device pointer of a pci_dev, 97The first 'dev' argument is normally the struct device pointer of a pci_dev,
93usb_interface or platform_device. It is rare for dev to be NULL, but it happens 98usb_interface or platform_device. It is rare for dev to be NULL, but it happens
94with ISA devices or when one device creates multiple PCI devices, thus making 99with ISA devices or when one device creates multiple PCI devices, thus making
@@ -385,6 +390,30 @@ later date. It differs between i2c drivers and as such can be confusing.
385To see which chip variants are supported you can look in the i2c driver code 390To see which chip variants are supported you can look in the i2c driver code
386for the i2c_device_id table. This lists all the possibilities. 391for the i2c_device_id table. This lists all the possibilities.
387 392
393There are two more helper functions:
394
395v4l2_i2c_new_subdev_cfg: this function adds new irq and platform_data
396arguments and has both 'addr' and 'probed_addrs' arguments: if addr is not
3970 then that will be used (non-probing variant), otherwise the probed_addrs
398are probed.
399
400For example: this will probe for address 0x10:
401
402struct v4l2_subdev *sd = v4l2_i2c_new_subdev_cfg(v4l2_dev, adapter,
403 "module_foo", "chipid", 0, NULL, 0, I2C_ADDRS(0x10));
404
405v4l2_i2c_new_subdev_board uses an i2c_board_info struct which is passed
406to the i2c driver and replaces the irq, platform_data and addr arguments.
407
408If the subdev supports the s_config core ops, then that op is called with
409the irq and platform_data arguments after the subdev was setup. The older
410v4l2_i2c_new_(probed_)subdev functions will call s_config as well, but with
411irq set to 0 and platform_data set to NULL.
412
413Note that in the next kernel release the functions v4l2_i2c_new_subdev,
414v4l2_i2c_new_probed_subdev and v4l2_i2c_new_probed_subdev_addr will all be
415replaced by a single v4l2_i2c_new_subdev that is identical to
416v4l2_i2c_new_subdev_cfg but without the irq and platform_data arguments.
388 417
389struct video_device 418struct video_device
390------------------- 419-------------------
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile
index 6f562f778b28..5bd269b3731a 100644
--- a/Documentation/vm/Makefile
+++ b/Documentation/vm/Makefile
@@ -2,7 +2,7 @@
2obj- := dummy.o 2obj- := dummy.o
3 3
4# List of programs to build 4# List of programs to build
5hostprogs-y := slabinfo 5hostprogs-y := slabinfo page-types
6 6
7# Tell kbuild to always build the programs 7# Tell kbuild to always build the programs
8always := $(hostprogs-y) 8always := $(hostprogs-y)
diff --git a/Documentation/vm/balance b/Documentation/vm/balance
index bd3d31bc4915..c46e68cf9344 100644
--- a/Documentation/vm/balance
+++ b/Documentation/vm/balance
@@ -75,15 +75,15 @@ Page stealing from process memory and shm is done if stealing the page would
75alleviate memory pressure on any zone in the page's node that has fallen below 75alleviate memory pressure on any zone in the page's node that has fallen below
76its watermark. 76its watermark.
77 77
78pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are 78watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
79per-zone fields, used to determine when a zone needs to be balanced. When 79are per-zone fields, used to determine when a zone needs to be balanced. When
80the number of pages falls below pages_min, the hysteric field low_on_memory 80the number of pages falls below watermark[WMARK_MIN], the hysteric field
81gets set. This stays set till the number of free pages becomes pages_high. 81low_on_memory gets set. This stays set till the number of free pages becomes
82When low_on_memory is set, page allocation requests will try to free some 82watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will
83pages in the zone (providing GFP_WAIT is set in the request). Orthogonal 83try to free some pages in the zone (providing GFP_WAIT is set in the request).
84to this, is the decision to poke kswapd to free some zone pages. That 84Orthogonal to this, is the decision to poke kswapd to free some zone pages.
85decision is not hysteresis based, and is done when the number of free 85That decision is not hysteresis based, and is done when the number of free
86pages is below pages_low; in which case zone_wake_kswapd is also set. 86pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set.
87 87
88 88
89(Good) Ideas that I have heard: 89(Good) Ideas that I have heard:
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
new file mode 100644
index 000000000000..0833f44ba16b
--- /dev/null
+++ b/Documentation/vm/page-types.c
@@ -0,0 +1,698 @@
1/*
2 * page-types: Tool for querying page flags
3 *
4 * Copyright (C) 2009 Intel corporation
5 * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com>
6 */
7
8#include <stdio.h>
9#include <stdlib.h>
10#include <unistd.h>
11#include <stdint.h>
12#include <stdarg.h>
13#include <string.h>
14#include <getopt.h>
15#include <limits.h>
16#include <sys/types.h>
17#include <sys/errno.h>
18#include <sys/fcntl.h>
19
20
21/*
22 * kernel page flags
23 */
24
25#define KPF_BYTES 8
26#define PROC_KPAGEFLAGS "/proc/kpageflags"
27
/*
 * Bit numbers of the individual page flags; BIT(name) turns one of
 * these into a mask.
 */
enum {
	/* copied from kpageflags_read() */
	KPF_LOCKED		= 0,
	KPF_ERROR		= 1,
	KPF_REFERENCED		= 2,
	KPF_UPTODATE		= 3,
	KPF_DIRTY		= 4,
	KPF_LRU			= 5,
	KPF_ACTIVE		= 6,
	KPF_SLAB		= 7,
	KPF_WRITEBACK		= 8,
	KPF_RECLAIM		= 9,
	KPF_BUDDY		= 10,

	/* [11-20] new additions in 2.6.31 */
	KPF_MMAP		= 11,
	KPF_ANON		= 12,
	KPF_SWAPCACHE		= 13,
	KPF_SWAPBACKED		= 14,
	KPF_COMPOUND_HEAD	= 15,
	KPF_COMPOUND_TAIL	= 16,
	KPF_HUGE		= 17,
	KPF_UNEVICTABLE		= 18,
	KPF_NOPAGE		= 20,

	/* [32-] kernel hacking assistances */
	KPF_RESERVED		= 32,
	KPF_MLOCKED		= 33,
	KPF_MAPPEDTODISK	= 34,
	KPF_PRIVATE		= 35,
	KPF_PRIVATE_2		= 36,
	KPF_OWNER_PRIVATE	= 37,
	KPF_ARCH		= 38,
	KPF_UNCACHED		= 39,

	/* [48-] take some arbitrary free slots for expanding overloaded flags
	 * not part of kernel API
	 */
	KPF_READAHEAD		= 48,
	KPF_SLOB_FREE		= 49,
	KPF_SLUB_FROZEN		= 50,
	KPF_SLUB_DEBUG		= 51
};
69
70#define KPF_ALL_BITS ((uint64_t)~0ULL)
71#define KPF_HACKERS_BITS (0xffffULL << 32)
72#define KPF_OVERLOADED_BITS (0xffffULL << 48)
73#define BIT(name) (1ULL << KPF_##name)
74#define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
75
76static char *page_flag_names[] = {
77 [KPF_LOCKED] = "L:locked",
78 [KPF_ERROR] = "E:error",
79 [KPF_REFERENCED] = "R:referenced",
80 [KPF_UPTODATE] = "U:uptodate",
81 [KPF_DIRTY] = "D:dirty",
82 [KPF_LRU] = "l:lru",
83 [KPF_ACTIVE] = "A:active",
84 [KPF_SLAB] = "S:slab",
85 [KPF_WRITEBACK] = "W:writeback",
86 [KPF_RECLAIM] = "I:reclaim",
87 [KPF_BUDDY] = "B:buddy",
88
89 [KPF_MMAP] = "M:mmap",
90 [KPF_ANON] = "a:anonymous",
91 [KPF_SWAPCACHE] = "s:swapcache",
92 [KPF_SWAPBACKED] = "b:swapbacked",
93 [KPF_COMPOUND_HEAD] = "H:compound_head",
94 [KPF_COMPOUND_TAIL] = "T:compound_tail",
95 [KPF_HUGE] = "G:huge",
96 [KPF_UNEVICTABLE] = "u:unevictable",
97 [KPF_NOPAGE] = "n:nopage",
98
99 [KPF_RESERVED] = "r:reserved",
100 [KPF_MLOCKED] = "m:mlocked",
101 [KPF_MAPPEDTODISK] = "d:mappedtodisk",
102 [KPF_PRIVATE] = "P:private",
103 [KPF_PRIVATE_2] = "p:private_2",
104 [KPF_OWNER_PRIVATE] = "O:owner_private",
105 [KPF_ARCH] = "h:arch",
106 [KPF_UNCACHED] = "c:uncached",
107
108 [KPF_READAHEAD] = "I:readahead",
109 [KPF_SLOB_FREE] = "P:slob_free",
110 [KPF_SLUB_FROZEN] = "A:slub_frozen",
111 [KPF_SLUB_DEBUG] = "E:slub_debug",
112};
113
114
115/*
116 * data structures
117 */
118
/* command line switches, set in main() */
static int		opt_raw;	/* for kernel developers */
static int		opt_list;	/* list pages (in ranges) */
static int		opt_no_summary;	/* don't show summary */
static pid_t		opt_pid;	/* process to walk */

/* page ranges to walk: range i covers opt_size[i] pages from opt_offset[i] */
#define MAX_ADDR_RANGES	1024
static int		nr_addr_ranges;
static unsigned long	opt_offset[MAX_ADDR_RANGES];
static unsigned long	opt_size[MAX_ADDR_RANGES];

/* flag filters: a page must pass every (opt_mask[i], opt_bits[i]) pair */
#define MAX_BIT_FILTERS	64
static int		nr_bit_filters;
static uint64_t		opt_mask[MAX_BIT_FILTERS];
static uint64_t		opt_bits[MAX_BIT_FILTERS];

static int		page_size;

#define PAGES_BATCH	(64 << 10)	/* 64k pages */
static int		kpageflags_fd;
/*
 * Read buffer for one batch.  Each page contributes a single uint64_t
 * (KPF_BYTES bytes), so PAGES_BATCH elements hold a full batch; the
 * element count must not be multiplied by the entry size again.
 */
static uint64_t		kpageflags_buf[PAGES_BATCH];

/* summary table: one slot per distinct flags value, see hash_slot() */
#define HASH_SHIFT	13
#define HASH_SIZE	(1 << HASH_SHIFT)
#define HASH_MASK	(HASH_SIZE - 1)
#define HASH_KEY(flags)	((flags) & HASH_MASK)

static unsigned long	total_pages;
static unsigned long	nr_pages[HASH_SIZE];
static uint64_t		page_flags[HASH_SIZE];
149
150/*
151 * helper functions
152 */
153
/* number of elements of a real array (not of a pointer) */
#define ARRAY_SIZE(x)	(sizeof(x) / sizeof((x)[0]))

/*
 * Smaller of x and y after converting both to 'type'.  Each argument is
 * evaluated exactly once.
 */
#define min_t(type, x, y) ({			\
	type min_lhs_ = (x);			\
	type min_rhs_ = (y);			\
	min_lhs_ < min_rhs_ ? min_lhs_ : min_rhs_; })
160
161unsigned long pages2mb(unsigned long pages)
162{
163 return (pages * page_size) >> 20;
164}
165
/*
 * Print a printf-style message to stderr and terminate the program
 * with EXIT_FAILURE.  Does not return.
 */
void fatal(const char *x, ...)
{
	va_list args;

	va_start(args, x);
	vfprintf(stderr, x, args);
	va_end(args);

	exit(EXIT_FAILURE);
}
175
176
177/*
178 * page flag names
179 */
180
181char *page_flag_name(uint64_t flags)
182{
183 static char buf[65];
184 int present;
185 int i, j;
186
187 for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
188 present = (flags >> i) & 1;
189 if (!page_flag_names[i]) {
190 if (present)
191 fatal("unkown flag bit %d\n", i);
192 continue;
193 }
194 buf[j++] = present ? page_flag_names[i][0] : '_';
195 }
196
197 return buf;
198}
199
200char *page_flag_longname(uint64_t flags)
201{
202 static char buf[1024];
203 int i, n;
204
205 for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
206 if (!page_flag_names[i])
207 continue;
208 if ((flags >> i) & 1)
209 n += snprintf(buf + n, sizeof(buf) - n, "%s,",
210 page_flag_names[i] + 2);
211 }
212 if (n)
213 n--;
214 buf[n] = '\0';
215
216 return buf;
217}
218
219
220/*
221 * page list and summary
222 */
223
/*
 * Range-compressed page listing.  Pages with identical flags at
 * consecutive offsets are merged into one run; a run is printed as
 * "start<TAB>count<TAB>flags" once a page arrives that does not extend it.
 * The run in progress is kept in static state, so the caller has to pass
 * one more, non-matching page at the end to get the last run printed.
 */
void show_page_range(unsigned long offset, uint64_t flags)
{
	static uint64_t run_flags;
	static unsigned long run_start;
	static unsigned long run_len;

	if (flags != run_flags || offset != run_start + run_len) {
		/* the page starts a new run: emit the finished one first */
		if (run_len)
			printf("%lu\t%lu\t%s\n",
				run_start, run_len, page_flag_name(run_flags));

		run_flags = flags;
		run_start = offset;
		run_len = 1;
		return;
	}

	run_len++;
}
243
/* Print a single page as "offset<TAB>flags", using the symbolic flag form. */
void show_page(unsigned long offset, uint64_t flags)
{
	const char *symbols = page_flag_name(flags);

	printf("%lu\t%s\n", offset, symbols);
}
248
249void show_summary(void)
250{
251 int i;
252
253 printf(" flags\tpage-count MB"
254 " symbolic-flags\t\t\tlong-symbolic-flags\n");
255
256 for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
257 if (nr_pages[i])
258 printf("0x%016llx\t%10lu %8lu %s\t%s\n",
259 (unsigned long long)page_flags[i],
260 nr_pages[i],
261 pages2mb(nr_pages[i]),
262 page_flag_name(page_flags[i]),
263 page_flag_longname(page_flags[i]));
264 }
265
266 printf(" total\t%10lu %8lu\n",
267 total_pages, pages2mb(total_pages));
268}
269
270
271/*
272 * page flag filters
273 */
274
275int bit_mask_ok(uint64_t flags)
276{
277 int i;
278
279 for (i = 0; i < nr_bit_filters; i++) {
280 if (opt_bits[i] == KPF_ALL_BITS) {
281 if ((flags & opt_mask[i]) == 0)
282 return 0;
283 } else {
284 if ((flags & opt_mask[i]) != opt_bits[i])
285 return 0;
286 }
287 }
288
289 return 1;
290}
291
292uint64_t expand_overloaded_flags(uint64_t flags)
293{
294 /* SLOB/SLUB overload several page flags */
295 if (flags & BIT(SLAB)) {
296 if (flags & BIT(PRIVATE))
297 flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
298 if (flags & BIT(ACTIVE))
299 flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
300 if (flags & BIT(ERROR))
301 flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
302 }
303
304 /* PG_reclaim is overloaded as PG_readahead in the read path */
305 if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
306 flags ^= BIT(RECLAIM) | BIT(READAHEAD);
307
308 return flags;
309}
310
311uint64_t well_known_flags(uint64_t flags)
312{
313 /* hide flags intended only for kernel hacker */
314 flags &= ~KPF_HACKERS_BITS;
315
316 /* hide non-hugeTLB compound pages */
317 if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
318 flags &= ~BITS_COMPOUND;
319
320 return flags;
321}
322
323
324/*
325 * page frame walker
326 */
327
328int hash_slot(uint64_t flags)
329{
330 int k = HASH_KEY(flags);
331 int i;
332
333 /* Explicitly reserve slot 0 for flags 0: the following logic
334 * cannot distinguish an unoccupied slot from slot (flags==0).
335 */
336 if (flags == 0)
337 return 0;
338
339 /* search through the remaining (HASH_SIZE-1) slots */
340 for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
341 if (!k || k >= ARRAY_SIZE(page_flags))
342 k = 1;
343 if (page_flags[k] == 0) {
344 page_flags[k] = flags;
345 return k;
346 }
347 if (page_flags[k] == flags)
348 return k;
349 }
350
351 fatal("hash table full: bump up HASH_SHIFT?\n");
352 exit(EXIT_FAILURE);
353}
354
355void add_page(unsigned long offset, uint64_t flags)
356{
357 flags = expand_overloaded_flags(flags);
358
359 if (!opt_raw)
360 flags = well_known_flags(flags);
361
362 if (!bit_mask_ok(flags))
363 return;
364
365 if (opt_list == 1)
366 show_page_range(offset, flags);
367 else if (opt_list == 2)
368 show_page(offset, flags);
369
370 nr_pages[hash_slot(flags)]++;
371 total_pages++;
372}
373
374void walk_pfn(unsigned long index, unsigned long count)
375{
376 unsigned long batch;
377 unsigned long n;
378 unsigned long i;
379
380 if (index > ULONG_MAX / KPF_BYTES)
381 fatal("index overflow: %lu\n", index);
382
383 lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET);
384
385 while (count) {
386 batch = min_t(unsigned long, count, PAGES_BATCH);
387 n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES);
388 if (n == 0)
389 break;
390 if (n < 0) {
391 perror(PROC_KPAGEFLAGS);
392 exit(EXIT_FAILURE);
393 }
394
395 if (n % KPF_BYTES != 0)
396 fatal("partial read: %lu bytes\n", n);
397 n = n / KPF_BYTES;
398
399 for (i = 0; i < n; i++)
400 add_page(index + i, kpageflags_buf[i]);
401
402 index += batch;
403 count -= batch;
404 }
405}
406
407void walk_addr_ranges(void)
408{
409 int i;
410
411 kpageflags_fd = open(PROC_KPAGEFLAGS, O_RDONLY);
412 if (kpageflags_fd < 0) {
413 perror(PROC_KPAGEFLAGS);
414 exit(EXIT_FAILURE);
415 }
416
417 if (!nr_addr_ranges)
418 walk_pfn(0, ULONG_MAX);
419
420 for (i = 0; i < nr_addr_ranges; i++)
421 walk_pfn(opt_offset[i], opt_size[i]);
422
423 close(kpageflags_fd);
424}
425
426
427/*
428 * user interface
429 */
430
431const char *page_flag_type(uint64_t flag)
432{
433 if (flag & KPF_HACKERS_BITS)
434 return "(r)";
435 if (flag & KPF_OVERLOADED_BITS)
436 return "(o)";
437 return " ";
438}
439
440void usage(void)
441{
442 int i, j;
443
444 printf(
445"page-types [options]\n"
446" -r|--raw Raw mode, for kernel developers\n"
447" -a|--addr addr-spec Walk a range of pages\n"
448" -b|--bits bits-spec Walk pages with specified bits\n"
449#if 0 /* planned features */
450" -p|--pid pid Walk process address space\n"
451" -f|--file filename Walk file address space\n"
452#endif
453" -l|--list Show page details in ranges\n"
454" -L|--list-each Show page details one by one\n"
455" -N|--no-summary Don't show summay info\n"
456" -h|--help Show this usage message\n"
457"addr-spec:\n"
458" N one page at offset N (unit: pages)\n"
459" N+M pages range from N to N+M-1\n"
460" N,M pages range from N to M-1\n"
461" N, pages range from N to end\n"
462" ,M pages range from 0 to M\n"
463"bits-spec:\n"
464" bit1,bit2 (flags & (bit1|bit2)) != 0\n"
465" bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n"
466" bit1,~bit2 (flags & (bit1|bit2)) == bit1\n"
467" =bit1,bit2 flags == (bit1|bit2)\n"
468"bit-names:\n"
469 );
470
471 for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
472 if (!page_flag_names[i])
473 continue;
474 printf("%16s%s", page_flag_names[i] + 2,
475 page_flag_type(1ULL << i));
476 if (++j > 3) {
477 j = 0;
478 putchar('\n');
479 }
480 }
481 printf("\n "
482 "(r) raw mode bits (o) overloaded bits\n");
483}
484
/*
 * Parse an unsigned number in C notation (decimal, 0x hex, 0 octal).
 *
 * Parsing stops at the first character that cannot belong to the number;
 * that is not an error, because callers pass strings in which a
 * separator such as ',' or '=' follows the number.  A string without any
 * digits, or a value that does not fit into unsigned long long, is fatal.
 *
 * strtoull() is used so that the whole 64-bit range is accepted;
 * strtoll() clamps everything from 2^63 upwards to LLONG_MAX.
 */
unsigned long long parse_number(const char *str)
{
	unsigned long long n;
	char *end;

	errno = 0;
	n = strtoull(str, &end, 0);

	/* end == str: not a single character was converted */
	if (end == str)
		fatal("invalid name or number: %s\n", str);
	if (errno == ERANGE)
		fatal("number out of range: %s\n", str);

	return n;
}
496
497void parse_pid(const char *str)
498{
499 opt_pid = parse_number(str);
500}
501
/*
 * Handler for -f/--file.  Empty stub: the usage text lists the option
 * under "planned features", so the argument is accepted and ignored.
 */
void parse_file(const char *name)
{
	(void)name;
}
505
506void add_addr_range(unsigned long offset, unsigned long size)
507{
508 if (nr_addr_ranges >= MAX_ADDR_RANGES)
509 fatal("too much addr ranges\n");
510
511 opt_offset[nr_addr_ranges] = offset;
512 opt_size[nr_addr_ranges] = size;
513 nr_addr_ranges++;
514}
515
/*
 * Parse one addr-spec and register the resulting range:
 *   N      one page at offset N
 *   N+M    M pages starting at N
 *   N,M    pages N up to, not including, M (M < N is fatal)
 *   N,     from N without an upper limit (size ULONG_MAX)
 *   ,M     M pages starting at 0
 * A ',' anywhere in the string takes precedence over a '+'.
 */
void parse_addr_range(const char *optarg)
{
	unsigned long offset;
	unsigned long size;
	char *sep;

	sep = strchr(optarg, ',');
	if (!sep)
		sep = strchr(optarg, '+');

	if (!sep) {
		/* plain number: a single page */
		offset = parse_number(optarg);
		size = 1;
	} else if (sep == optarg) {
		/* nothing in front of the separator: start at page 0 */
		offset = 0;
		size = parse_number(sep + 1);
	} else {
		offset = parse_number(optarg);
		if (sep[1] == '\0') {
			size = ULONG_MAX;
		} else {
			size = parse_number(sep + 1);
			if (*sep == ',') {
				/* "N,M" gives an end offset, not a size */
				if (size < offset)
					fatal("invalid range: %lu,%lu\n",
						offset, size);
				size -= offset;
			}
		}
	}

	add_addr_range(offset, size);
}
549
550void add_bits_filter(uint64_t mask, uint64_t bits)
551{
552 if (nr_bit_filters >= MAX_BIT_FILTERS)
553 fatal("too much bit filters\n");
554
555 opt_mask[nr_bit_filters] = mask;
556 opt_bits[nr_bit_filters] = bits;
557 nr_bit_filters++;
558}
559
560uint64_t parse_flag_name(const char *str, int len)
561{
562 int i;
563
564 if (!*str || !len)
565 return 0;
566
567 if (len <= 8 && !strncmp(str, "compound", len))
568 return BITS_COMPOUND;
569
570 for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
571 if (!page_flag_names[i])
572 continue;
573 if (!strncmp(str, page_flag_names[i] + 2, len))
574 return 1ULL << i;
575 }
576
577 return parse_number(str);
578}
579
/*
 * Parse a ',' separated list of flag names, ending at '=' or at the end
 * of the string, and return the OR of their masks.
 *
 * Names prefixed with '~' are left out unless 'all' is non-zero, in
 * which case the '~' is stripped and the name is included.
 */
uint64_t parse_flag_names(const char *str, int all)
{
	const char *p;
	uint64_t flags = 0;

	/* str marks the start of the current name, p scans for its end */
	for (p = str; ; p++) {
		if (*p != ',' && *p != '=' && *p != '\0')
			continue;

		if (*str != '~')
			flags |= parse_flag_name(str, p - str);
		else if (all && *++str)
			flags |= parse_flag_name(str, p - str);

		if (*p != ',')
			break;
		str = p + 1;
	}

	return flags;
}
598
599void parse_bits_mask(const char *optarg)
600{
601 uint64_t mask;
602 uint64_t bits;
603 const char *p;
604
605 p = strchr(optarg, '=');
606 if (p == optarg) {
607 mask = KPF_ALL_BITS;
608 bits = parse_flag_names(p + 1, 0);
609 } else if (p) {
610 mask = parse_flag_names(optarg, 0);
611 bits = parse_flag_names(p + 1, 0);
612 } else if (strchr(optarg, '~')) {
613 mask = parse_flag_names(optarg, 1);
614 bits = parse_flag_names(optarg, 0);
615 } else {
616 mask = parse_flag_names(optarg, 0);
617 bits = KPF_ALL_BITS;
618 }
619
620 add_bits_filter(mask, bits);
621}
622
623
/* long options for getopt_long(); each maps to its short option letter */
struct option opts[] = {
	{ "raw",	no_argument,		NULL, 'r' },
	{ "pid",	required_argument,	NULL, 'p' },
	{ "file",	required_argument,	NULL, 'f' },
	{ "addr",	required_argument,	NULL, 'a' },
	{ "bits",	required_argument,	NULL, 'b' },
	{ "list",	no_argument,		NULL, 'l' },
	{ "list-each",	no_argument,		NULL, 'L' },
	{ "no-summary",	no_argument,		NULL, 'N' },
	{ "help",	no_argument,		NULL, 'h' },
	{ NULL,		0,			NULL, 0 }	/* terminator */
};
636
637int main(int argc, char *argv[])
638{
639 int c;
640
641 page_size = getpagesize();
642
643 while ((c = getopt_long(argc, argv,
644 "rp:f:a:b:lLNh", opts, NULL)) != -1) {
645 switch (c) {
646 case 'r':
647 opt_raw = 1;
648 break;
649 case 'p':
650 parse_pid(optarg);
651 break;
652 case 'f':
653 parse_file(optarg);
654 break;
655 case 'a':
656 parse_addr_range(optarg);
657 break;
658 case 'b':
659 parse_bits_mask(optarg);
660 break;
661 case 'l':
662 opt_list = 1;
663 break;
664 case 'L':
665 opt_list = 2;
666 break;
667 case 'N':
668 opt_no_summary = 1;
669 break;
670 case 'h':
671 usage();
672 exit(0);
673 default:
674 usage();
675 exit(1);
676 }
677 }
678
679 if (opt_list == 1)
680 printf("offset\tcount\tflags\n");
681 if (opt_list == 2)
682 printf("offset\tflags\n");
683
684 walk_addr_ranges();
685
686 if (opt_list == 1)
687 show_page_range(0, 0); /* drain the buffer */
688
689 if (opt_no_summary)
690 return 0;
691
692 if (opt_list)
693 printf("\n\n");
694
695 show_summary();
696
697 return 0;
698}
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index ce72c0fe6177..600a304a828c 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -12,9 +12,9 @@ There are three components to pagemap:
12 value for each virtual page, containing the following data (from 12 value for each virtual page, containing the following data (from
13 fs/proc/task_mmu.c, above pagemap_read): 13 fs/proc/task_mmu.c, above pagemap_read):
14 14
15 * Bits 0-55 page frame number (PFN) if present 15 * Bits 0-54 page frame number (PFN) if present
16 * Bits 0-4 swap type if swapped 16 * Bits 0-4 swap type if swapped
17 * Bits 5-55 swap offset if swapped 17 * Bits 5-54 swap offset if swapped
18 * Bits 55-60 page shift (page size = 1<<page shift) 18 * Bits 55-60 page shift (page size = 1<<page shift)
19 * Bit 61 reserved for future use 19 * Bit 61 reserved for future use
20 * Bit 62 page swapped 20 * Bit 62 page swapped
@@ -36,7 +36,7 @@ There are three components to pagemap:
36 * /proc/kpageflags. This file contains a 64-bit set of flags for each 36 * /proc/kpageflags. This file contains a 64-bit set of flags for each
37 page, indexed by PFN. 37 page, indexed by PFN.
38 38
39 The flags are (from fs/proc/proc_misc, above kpageflags_read): 39 The flags are (from fs/proc/page.c, above kpageflags_read):
40 40
41 0. LOCKED 41 0. LOCKED
42 1. ERROR 42 1. ERROR
@@ -49,6 +49,68 @@ There are three components to pagemap:
49 8. WRITEBACK 49 8. WRITEBACK
50 9. RECLAIM 50 9. RECLAIM
51 10. BUDDY 51 10. BUDDY
52 11. MMAP
53 12. ANON
54 13. SWAPCACHE
55 14. SWAPBACKED
56 15. COMPOUND_HEAD
57 16. COMPOUND_TAIL
58 17. HUGE
59 18. UNEVICTABLE
60 20. NOPAGE
61
62Short descriptions of the page flags:
63
64 0. LOCKED
65 page is being locked for exclusive access, eg. by undergoing read/write IO
66
67 7. SLAB
68 page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
69 When compound page is used, SLUB/SLQB will only set this flag on the head
70 page; SLOB will not flag it at all.
71
7210. BUDDY
73 a free memory block managed by the buddy system allocator
74 The buddy system organizes free memory in blocks of various orders.
75 An order N block has 2^N physically contiguous pages, with the BUDDY flag
76 set for and _only_ for the first page.
77
7815. COMPOUND_HEAD
7916. COMPOUND_TAIL
80 A compound page with order N consists of 2^N physically contiguous pages.
81 A compound page with order 2 takes the form of "HTTT", where H denotes its
82 head page and T denotes its tail page(s). The major consumers of compound
83 pages are hugeTLB pages (Documentation/vm/hugetlbpage.txt), the SLUB etc.
84 memory allocators and various device drivers. However in this interface,
85 only huge/giga pages are made visible to end users.
8617. HUGE
87 this is an integral part of a HugeTLB page
88
8920. NOPAGE
90 no page frame exists at the requested address
91
92 [IO related page flags]
93 1. ERROR IO error occurred
94 3. UPTODATE page has up-to-date data
95 ie. for file backed page: (in-memory data revision >= on-disk one)
96 4. DIRTY page has been written to, hence contains new data
97 ie. for file backed page: (in-memory data revision > on-disk one)
98 8. WRITEBACK page is being synced to disk
99
100 [LRU related page flags]
101 5. LRU page is in one of the LRU lists
102 6. ACTIVE page is in the active LRU list
10318. UNEVICTABLE page is in the unevictable (non-)LRU list
104 It is somehow pinned and not a candidate for LRU page reclaims,
105 eg. ramfs pages, shmctl(SHM_LOCK) and mlock() memory segments
106 2. REFERENCED page has been referenced since last LRU list enqueue/requeue
107 9. RECLAIM page will be reclaimed soon after its pageout IO completed
10811. MMAP a memory mapped page
10912. ANON a memory mapped page that is not part of a file
11013. SWAPCACHE page is mapped to swap space, ie. has an associated swap entry
11114. SWAPBACKED page is backed by swap/RAM
112
113The page-types tool in this directory can be used to query the above flags.
52 114
53Using pagemap to do something useful: 115Using pagemap to do something useful:
54 116
diff --git a/Documentation/watchdog/hpwdt.txt b/Documentation/watchdog/hpwdt.txt
new file mode 100644
index 000000000000..9c24d5ffbb06
--- /dev/null
+++ b/Documentation/watchdog/hpwdt.txt
@@ -0,0 +1,95 @@
1Last reviewed: 06/02/2009
2
3 HP iLO2 NMI Watchdog Driver
4 NMI sourcing for iLO2 based ProLiant Servers
5 Documentation and Driver by
6 Thomas Mingarelli <thomas.mingarelli@hp.com>
7
8 The HP iLO2 NMI Watchdog driver is a kernel module that provides basic
9 watchdog functionality and the added benefit of NMI sourcing. Both the
10 watchdog functionality and the NMI sourcing capability need to be enabled
11 by the user. Remember that the two modes are not dependent on one another.
12 A user can have the NMI sourcing without the watchdog timer and vice-versa.
13
14 Watchdog functionality is enabled like any other common watchdog driver. That
15 is, an application needs to be started that kicks off the watchdog timer. A
16 basic application exists in the Documentation/watchdog/src directory called
17 watchdog-test.c. Simply compile the C file and kick it off. If the system
18 gets into a bad state and hangs, the HP ProLiant iLO 2 timer register will
19 not be updated in a timely fashion and a hardware system reset (also known as
20 an Automatic Server Recovery (ASR)) event will occur.
21
22 The hpwdt driver also has four (4) module parameters. They are the following:
23
24 soft_margin - allows the user to set the watchdog timer value
25 allow_kdump - allows the user to save off a kernel dump image after an NMI
26 nowayout - basic watchdog parameter that does not allow the timer to
27 be restarted or an impending ASR to be escaped.
28 priority - determines whether or not the hpwdt driver is first on the
29 die_notify list to handle NMIs or last. The default value
30 for this module parameter is 0 or LAST. If the user wants to
31 enable NMI sourcing then reload the hpwdt driver with
32 priority=1 (and boot with nmi_watchdog=0).
33
34 NOTE: More information about watchdog drivers in general, including the ioctl
35 interface to /dev/watchdog can be found in
36 Documentation/watchdog/watchdog-api.txt and Documentation/IPMI.txt.
37
38 The priority parameter was introduced due to other kernel software that relied
39 on handling NMIs (like oprofile). Keeping hpwdt's priority at 0 (or LAST)
40 enables the users of NMIs for non critical events to work as expected.
41
42 The NMI sourcing capability is disabled by default due to the inability to
43 distinguish between "NMI Watchdog Ticks" and "HW generated NMI events" in the
44 Linux kernel. What this means is that the hpwdt nmi handler code is called
45 each time the NMI signal fires off. This could amount to several thousands of
46 NMIs in a matter of seconds. If a user sees the Linux kernel's "dazed and
47 confused" message in the logs or if the system gets into a hung state, then
48 the hpwdt driver can be reloaded with the "priority" module parameter set
49 (priority=1).
50
51 1. If the kernel has not been booted with nmi_watchdog turned off then
52 edit /boot/grub/menu.lst and place the nmi_watchdog=0 at the end of the
53 currently booting kernel line.
54 2. reboot the server
55 3. Once the system comes up perform a rmmod hpwdt
56 4. insmod /lib/modules/`uname -r`/kernel/drivers/char/watchdog/hpwdt.ko priority=1
57
58 Now, the hpwdt can successfully receive and source the NMI and provide a log
59 message that details the reason for the NMI (as determined by the HP BIOS).
60
61 Below is a list of NMIs the HP BIOS understands along with the associated
62 code (reason):
63
64 No source found 00h
65
66 Uncorrectable Memory Error 01h
67
68 ASR NMI 1Bh
69
70 PCI Parity Error 20h
71
72 NMI Button Press 27h
73
74 SB_BUS_NMI 28h
75
76 ILO Doorbell NMI 29h
77
78 ILO IOP NMI 2Ah
79
80 ILO Watchdog NMI 2Bh
81
82 Proc Throt NMI 2Ch
83
84 Front Side Bus NMI 2Dh
85
86 PCI Express Error 2Fh
87
88 DMA controller NMI 30h
89
90 Hypertransport/CSI Error 31h
91
92
93
94 -- Tom Mingarelli
95 (thomas.mingarelli@hp.com)
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index e0203662f9e9..8da3a795083f 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -50,6 +50,10 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format
50Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical 50Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical
51 pointer to single linked list of struct setup_data. 51 pointer to single linked list of struct setup_data.
52 52
53Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment
54 beyond the kernel_alignment, added new init_size and
55 pref_address fields. Added extended boot loader IDs.
56
53**** MEMORY LAYOUT 57**** MEMORY LAYOUT
54 58
55The traditional memory map for the kernel loader, used for Image or 59The traditional memory map for the kernel loader, used for Image or
@@ -168,12 +172,13 @@ Offset Proto Name Meaning
168021C/4 2.00+ ramdisk_size initrd size (set by boot loader) 172021C/4 2.00+ ramdisk_size initrd size (set by boot loader)
1690220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only 1730220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only
1700224/2 2.01+ heap_end_ptr Free memory after setup end 1740224/2 2.01+ heap_end_ptr Free memory after setup end
1710226/2 N/A pad1 Unused 1750226/1 2.02+(3 ext_loader_ver Extended boot loader version
1760227/1 2.02+(3 ext_loader_type Extended boot loader ID
1720228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 1770228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line
173022C/4 2.03+ ramdisk_max Highest legal initrd address 178022C/4 2.03+ ramdisk_max Highest legal initrd address
1740230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 1790230/4 2.05+ kernel_alignment Physical addr alignment required for kernel
1750234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not 1800234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not
1760235/1 N/A pad2 Unused 1810235/1 2.10+ min_alignment Minimum alignment, as a power of two
1770236/2 N/A pad3 Unused 1820236/2 N/A pad3 Unused
1780238/4 2.06+ cmdline_size Maximum size of the kernel command line 1830238/4 2.06+ cmdline_size Maximum size of the kernel command line
179023C/4 2.07+ hardware_subarch Hardware subarchitecture 184023C/4 2.07+ hardware_subarch Hardware subarchitecture
@@ -182,6 +187,8 @@ Offset Proto Name Meaning
182024C/4 2.08+ payload_length Length of kernel payload 187024C/4 2.08+ payload_length Length of kernel payload
1830250/8 2.09+ setup_data 64-bit physical pointer to linked list 1880250/8 2.09+ setup_data 64-bit physical pointer to linked list
184 of struct setup_data 189 of struct setup_data
1900258/8 2.10+ pref_address Preferred loading address
1910260/4 2.10+ init_size Linear memory required during initialization
185 192
186(1) For backwards compatibility, if the setup_sects field contains 0, the 193(1) For backwards compatibility, if the setup_sects field contains 0, the
187 real value is 4. 194 real value is 4.
@@ -190,6 +197,8 @@ Offset Proto Name Meaning
190 field are unusable, which means the size of a bzImage kernel 197 field are unusable, which means the size of a bzImage kernel
191 cannot be determined. 198 cannot be determined.
192 199
200(3) Ignored, but safe to set, for boot protocols 2.02-2.09.
201
193If the "HdrS" (0x53726448) magic number is not found at offset 0x202, 202If the "HdrS" (0x53726448) magic number is not found at offset 0x202,
194the boot protocol version is "old". Loading an old kernel, the 203the boot protocol version is "old". Loading an old kernel, the
195following parameters should be assumed: 204following parameters should be assumed:
@@ -343,18 +352,32 @@ Protocol: 2.00+
343 0xTV here, where T is an identifier for the boot loader and V is 352 0xTV here, where T is an identifier for the boot loader and V is
344 a version number. Otherwise, enter 0xFF here. 353 a version number. Otherwise, enter 0xFF here.
345 354
355 For boot loader IDs above T = 0xD, write T = 0xE to this field and
356 write the extended ID minus 0x10 to the ext_loader_type field.
357 Similarly, the ext_loader_ver field can be used to provide more than
358 four bits for the bootloader version.
359
360 For example, for T = 0x15, V = 0x234, write:
361
362 type_of_loader <- 0xE4
363 ext_loader_type <- 0x05
364 ext_loader_ver <- 0x23
365
346 Assigned boot loader ids: 366 Assigned boot loader ids:
347 0 LILO (0x00 reserved for pre-2.00 bootloader) 367 0 LILO (0x00 reserved for pre-2.00 bootloader)
348 1 Loadlin 368 1 Loadlin
349 2 bootsect-loader (0x20, all other values reserved) 369 2 bootsect-loader (0x20, all other values reserved)
350 3 SYSLINUX 370 3 Syslinux
351 4 EtherBoot 371 4 Etherboot/gPXE
352 5 ELILO 372 5 ELILO
353 7 GRUB 373 7 GRUB
354 8 U-BOOT 374 8 U-Boot
355 9 Xen 375 9 Xen
356 A Gujin 376 A Gujin
357 B Qemu 377 B Qemu
378 C Arcturus Networks uCbootloader
379 E Extended (see ext_loader_type)
380 F Special (0xFF = undefined)
358 381
359 Please contact <hpa@zytor.com> if you need a bootloader ID 382 Please contact <hpa@zytor.com> if you need a bootloader ID
360 value assigned. 383 value assigned.
@@ -453,6 +476,35 @@ Protocol: 2.01+
453 Set this field to the offset (from the beginning of the real-mode 476 Set this field to the offset (from the beginning of the real-mode
454 code) of the end of the setup stack/heap, minus 0x0200. 477 code) of the end of the setup stack/heap, minus 0x0200.
455 478
479Field name: ext_loader_ver
480Type: write (optional)
481Offset/size: 0x226/1
482Protocol: 2.02+
483
484 This field is used as an extension of the version number in the
485 type_of_loader field. The total version number is considered to be
486 (type_of_loader & 0x0f) + (ext_loader_ver << 4).
487
488 The use of this field is boot loader specific. If not written, it
489 is zero.
490
491 Kernels prior to 2.6.31 did not recognize this field, but it is safe
492 to write for protocol version 2.02 or higher.
493
494Field name: ext_loader_type
495Type: write (obligatory if (type_of_loader & 0xf0) == 0xe0)
496Offset/size: 0x227/1
497Protocol: 2.02+
498
499 This field is used as an extension of the type number in the
500 type_of_loader field. If the type in type_of_loader is 0xE, then
501 the actual type is (ext_loader_type + 0x10).
502
503 This field is ignored if the type in type_of_loader is not 0xE.
504
505 Kernels prior to 2.6.31 did not recognize this field, but it is safe
506 to write for protocol version 2.02 or higher.
507
456Field name: cmd_line_ptr 508Field name: cmd_line_ptr
457Type: write (obligatory) 509Type: write (obligatory)
458Offset/size: 0x228/4 510Offset/size: 0x228/4
@@ -482,11 +534,19 @@ Protocol: 2.03+
482 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) 534 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.)
483 535
484Field name: kernel_alignment 536Field name: kernel_alignment
485Type: read (reloc) 537Type: read/modify (reloc)
486Offset/size: 0x230/4 538Offset/size: 0x230/4
487Protocol: 2.05+ 539Protocol: 2.05+ (read), 2.10+ (modify)
540
541 Alignment unit required by the kernel (if relocatable_kernel is
542 true.) A relocatable kernel that is loaded at an alignment
543 incompatible with the value in this field will be realigned during
544 kernel initialization.
488 545
489 Alignment unit required by the kernel (if relocatable_kernel is true.) 546 Starting with protocol version 2.10, this reflects the kernel
547 alignment preferred for optimal performance; it is possible for the
548 loader to modify this field to permit a lesser alignment. See the
549 min_alignment and pref_address fields below.
490 550
491Field name: relocatable_kernel 551Field name: relocatable_kernel
492Type: read (reloc) 552Type: read (reloc)
@@ -498,6 +558,22 @@ Protocol: 2.05+
498 After loading, the boot loader must set the code32_start field to 558 After loading, the boot loader must set the code32_start field to
499 point to the loaded code, or to a boot loader hook. 559 point to the loaded code, or to a boot loader hook.
500 560
561Field name: min_alignment
562Type: read (reloc)
563Offset/size: 0x235/1
564Protocol: 2.10+
565
566 This field, if nonzero, indicates as a power of two the minimum
567 alignment required, as opposed to preferred, by the kernel to boot.
568 If a boot loader makes use of this field, it should update the
569 kernel_alignment field with the alignment unit desired; typically:
570
571 kernel_alignment = 1 << min_alignment
572
573 There may be a considerable performance cost with an excessively
574 misaligned kernel. Therefore, a loader should typically try each
575 power-of-two alignment from kernel_alignment down to this alignment.
576
501Field name: cmdline_size 577Field name: cmdline_size
502Type: read 578Type: read
503Offset/size: 0x238/4 579Offset/size: 0x238/4
@@ -582,6 +658,36 @@ Protocol: 2.09+
582 sure to consider the case where the linked list already contains 658 sure to consider the case where the linked list already contains
583 entries. 659 entries.
584 660
661Field name: pref_address
662Type: read (reloc)
663Offset/size: 0x258/8
664Protocol: 2.10+
665
666 This field, if nonzero, represents a preferred load address for the
667 kernel. A relocating bootloader should attempt to load at this
668 address if possible.
669
670 A non-relocatable kernel will unconditionally move itself to and run
671 at this address.
672
673Field name: init_size
674Type: read
675 Offset/size: 0x260/4
676
677 This field indicates the amount of linear contiguous memory starting
678 at the kernel runtime start address that the kernel needs before it
679 is capable of examining its memory map. This is not the same thing
680 as the total amount of memory the kernel needs to boot, but it can
681 be used by a relocating boot loader to help select a safe load
682 address for the kernel.
683
684 The kernel runtime start address is determined by the following algorithm:
685
686 if (relocatable_kernel)
687 runtime_start = align_up(load_address, kernel_alignment)
688 else
689 runtime_start = pref_address
690
585 691
586**** THE IMAGE CHECKSUM 692**** THE IMAGE CHECKSUM
587 693
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 34c13040a718..29a6ff8bc7d3 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -5,21 +5,51 @@ only the AMD64 specific ones are listed here.
5 5
6Machine check 6Machine check
7 7
8 mce=off disable machine check 8 Please see Documentation/x86/x86_64/machinecheck for sysfs runtime tunables.
9 mce=bootlog Enable logging of machine checks left over from booting. 9
10 Disabled by default on AMD because some BIOS leave bogus ones. 10 mce=off
11 If your BIOS doesn't do that it's a good idea to enable though 11 Disable machine check
12 to make sure you log even machine check events that result 12 mce=no_cmci
13 in a reboot. On Intel systems it is enabled by default. 13 Disable CMCI (Corrected Machine Check Interrupt), which
14 Intel processors support. Usually this disablement is
15 not recommended, but it might be handy if your hardware
16 is misbehaving.
17 Note that you'll get more problems without CMCI than with
18 due to the shared banks, i.e. you might get duplicated
19 error logs.
20 mce=dont_log_ce
21 Don't make logs for corrected errors. All events reported
22 as corrected are silently cleared by OS.
23 This option will be useful if you have no interest in any
24 of the corrected errors.
25 mce=ignore_ce
26 Disable features for corrected errors, e.g. polling timer
27 and CMCI. All events reported as corrected are not cleared
28 by the OS and remain in the error banks.
29 Usually this disablement is not recommended, however if
30 there is an agent checking/clearing corrected errors
31 (e.g. BIOS or hardware monitoring applications), conflicting
32 with OS's error handling, and you cannot deactivate the agent,
33 then this option will be a help.
34 mce=bootlog
35 Enable logging of machine checks left over from booting.
36 Disabled by default on AMD because some BIOS leave bogus ones.
37 If your BIOS doesn't do that it's a good idea to enable though
38 to make sure you log even machine check events that result
39 in a reboot. On Intel systems it is enabled by default.
14 mce=nobootlog 40 mce=nobootlog
15 Disable boot machine check logging. 41 Disable boot machine check logging.
16 mce=tolerancelevel (number) 42 mce=tolerancelevel[,monarchtimeout] (number,number)
43 tolerance levels:
17 0: always panic on uncorrected errors, log corrected errors 44 0: always panic on uncorrected errors, log corrected errors
18 1: panic or SIGBUS on uncorrected errors, log corrected errors 45 1: panic or SIGBUS on uncorrected errors, log corrected errors
19 2: SIGBUS or log uncorrected errors, log corrected errors 46 2: SIGBUS or log uncorrected errors, log corrected errors
20 3: never panic or SIGBUS, log all errors (for testing only) 47 3: never panic or SIGBUS, log all errors (for testing only)
21 Default is 1 48 Default is 1
22 Can be also set using sysfs which is preferable. 49 Can be also set using sysfs which is preferable.
50 monarchtimeout:
51 Sets the time in microseconds to wait for other CPUs on machine checks. 0
52 to disable.
23 53
24 nomce (for compatibility with i386): same as mce=off 54 nomce (for compatibility with i386): same as mce=off
25 55
@@ -150,11 +180,6 @@ NUMA
150 Otherwise, the remaining system RAM is allocated to an 180 Otherwise, the remaining system RAM is allocated to an
151 additional node. 181 additional node.
152 182
153 numa=hotadd=percent
154 Only allow hotadd memory to preallocate page structures upto
155 percent of already available memory.
156 numa=hotadd=0 will disable hotadd memory.
157
158ACPI 183ACPI
159 184
160 acpi=off Don't enable ACPI 185 acpi=off Don't enable ACPI
diff --git a/Documentation/x86/x86_64/machinecheck b/Documentation/x86/x86_64/machinecheck
index a05e58e7b159..b1fb30273286 100644
--- a/Documentation/x86/x86_64/machinecheck
+++ b/Documentation/x86/x86_64/machinecheck
@@ -41,7 +41,9 @@ check_interval
41 the polling interval. When the poller stops finding MCEs, it 41 the polling interval. When the poller stops finding MCEs, it
42 triggers an exponential backoff (poll less often) on the polling 42 triggers an exponential backoff (poll less often) on the polling
43 interval. The check_interval variable is both the initial and 43 interval. The check_interval variable is both the initial and
44 maximum polling interval. 44 maximum polling interval. 0 means no polling for corrected machine
45 check errors (but some corrected errors might still be reported
46 in other ways).
45 47
46tolerant 48tolerant
47 Tolerance level. When a machine check exception occurs for a non 49 Tolerance level. When a machine check exception occurs for a non
@@ -67,6 +69,10 @@ trigger
67 Program to run when a machine check event is detected. 69 Program to run when a machine check event is detected.
68 This is an alternative to running mcelog regularly from cron 70 This is an alternative to running mcelog regularly from cron
69 and allows to detect events faster. 71 and allows to detect events faster.
72monarch_timeout
73 How long to wait for the other CPUs to machine check too on an
74 exception. 0 to disable waiting for other CPUs.
75 Unit: us
70 76
71TBD document entries for AMD threshold interrupt configuration 77TBD document entries for AMD threshold interrupt configuration
72 78
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 29b52b14d0b4..d6498e3cd713 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables:
60000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm 60000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
7hole caused by [48:63] sign extension 7hole caused by [48:63] sign extension
8ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole 8ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
9ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory 9ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
10ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole 10ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
11ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space 11ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
12ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) 12ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
13ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
13... unused hole ... 14... unused hole ...
14ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 15ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
15ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space 16ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space