aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/stable/firewire-cdev103
-rw-r--r--Documentation/ABI/stable/sysfs-bus-firewire122
-rw-r--r--Documentation/ABI/stable/vdso27
-rw-r--r--Documentation/ABI/testing/sysfs-class-backlight-driver-adp887056
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-roccat-koneplus8
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-wiimote10
-rw-r--r--Documentation/Changes43
-rw-r--r--Documentation/CodingStyle4
-rw-r--r--Documentation/DocBook/80211.tmpl5
-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl2
-rw-r--r--Documentation/SubmitChecklist2
-rw-r--r--Documentation/accounting/cgroupstats.txt4
-rw-r--r--Documentation/cgroups/blkio-controller.txt41
-rw-r--r--Documentation/cgroups/cgroups.txt60
-rw-r--r--Documentation/cgroups/cpuacct.txt23
-rw-r--r--Documentation/cgroups/cpusets.txt30
-rw-r--r--Documentation/cgroups/devices.txt6
-rw-r--r--Documentation/cgroups/freezer-subsystem.txt20
-rw-r--r--Documentation/cgroups/memory.txt58
-rw-r--r--Documentation/development-process/4.Coding2
-rw-r--r--Documentation/devicetree/bindings/arm/primecell.txt21
-rw-r--r--Documentation/devicetree/bindings/crypto/fsl-sec2.txt (renamed from Documentation/devicetree/bindings/powerpc/fsl/sec.txt)2
-rw-r--r--Documentation/devicetree/bindings/gpio/fsl-imx-gpio.txt22
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio.txt46
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio_nvidia.txt8
-rw-r--r--Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt22
-rw-r--r--Documentation/devicetree/bindings/spi/spi_nvidia.txt5
-rw-r--r--Documentation/devicetree/bindings/tty/serial/of-serial.txt36
-rw-r--r--Documentation/feature-removal-schedule.txt49
-rw-r--r--Documentation/filesystems/Locking8
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt16
-rw-r--r--Documentation/filesystems/nilfs2.txt1
-rw-r--r--Documentation/filesystems/porting27
-rw-r--r--Documentation/filesystems/proc.txt1
-rw-r--r--Documentation/filesystems/ubifs.txt28
-rw-r--r--Documentation/filesystems/vfs.txt30
-rw-r--r--Documentation/hwmon/f71882fg4
-rw-r--r--Documentation/hwmon/k10temp8
-rw-r--r--Documentation/ja_JP/SubmitChecklist2
-rw-r--r--Documentation/kernel-parameters.txt4
-rw-r--r--Documentation/kmemleak.txt4
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt5
-rw-r--r--Documentation/md.txt2
-rw-r--r--Documentation/mmc/00-INDEX2
-rw-r--r--Documentation/mmc/mmc-async-req.txt87
-rw-r--r--Documentation/networking/ifenslave.c18
-rw-r--r--Documentation/networking/ip-sysctl.txt31
-rw-r--r--Documentation/networking/netdev-features.txt154
-rw-r--r--Documentation/networking/nfc.txt128
-rw-r--r--Documentation/networking/stmmac.txt200
-rw-r--r--Documentation/power/devices.txt81
-rw-r--r--Documentation/power/opp.txt2
-rw-r--r--Documentation/power/runtime_pm.txt258
-rw-r--r--Documentation/printk-formats.txt119
-rw-r--r--Documentation/scheduler/sched-design-CFS.txt7
-rw-r--r--Documentation/scheduler/sched-rt-group.txt7
-rw-r--r--Documentation/spi/ep93xx_spi10
-rw-r--r--Documentation/spi/pxa2xx5
-rw-r--r--Documentation/spinlocks.txt45
-rw-r--r--Documentation/sysctl/kernel.txt215
-rw-r--r--Documentation/trace/kprobetrace.txt9
-rw-r--r--Documentation/usb/error-codes.txt9
-rw-r--r--Documentation/vDSO/parse_vdso.c256
-rw-r--r--Documentation/vDSO/vdso_test.c111
-rw-r--r--Documentation/virtual/lguest/lguest.c47
-rw-r--r--Documentation/vm/hwpoison.txt6
-rw-r--r--Documentation/x86/boot.txt2
-rw-r--r--Documentation/x86/entry_64.txt98
-rw-r--r--Documentation/zh_CN/SubmitChecklist2
69 files changed, 2233 insertions, 653 deletions
diff --git a/Documentation/ABI/stable/firewire-cdev b/Documentation/ABI/stable/firewire-cdev
new file mode 100644
index 000000000000..16d030827368
--- /dev/null
+++ b/Documentation/ABI/stable/firewire-cdev
@@ -0,0 +1,103 @@
1What: /dev/fw[0-9]+
2Date: May 2007
3KernelVersion: 2.6.22
4Contact: linux1394-devel@lists.sourceforge.net
5Description:
6 The character device files /dev/fw* are the interface between
7 firewire-core and IEEE 1394 device drivers implemented in
8 userspace. The ioctl(2)- and read(2)-based ABI is defined and
9 documented in <linux/firewire-cdev.h>.
10
11 This ABI offers most of the features which firewire-core also
12 exposes to kernelspace IEEE 1394 drivers.
13
14 Each /dev/fw* is associated with one IEEE 1394 node, which can
15 be remote or local nodes. Operations on a /dev/fw* file have
16 different scope:
17 - The 1394 node which is associated with the file:
18 - Asynchronous request transmission
19 - Get the Configuration ROM
20 - Query node ID
21 - Query maximum speed of the path between this node
22 and local node
23 - The 1394 bus (i.e. "card") to which the node is attached:
24 - Isochronous stream transmission and reception
25 - Asynchronous stream transmission and reception
26 - Asynchronous broadcast request transmission
27 - PHY packet transmission and reception
28 - Allocate, reallocate, deallocate isochronous
29 resources (channels, bandwidth) at the bus's IRM
30 - Query node IDs of local node, root node, IRM, bus
31 manager
32 - Query cycle time
33 - Bus reset initiation, bus reset event reception
34 - All 1394 buses:
35 - Allocation of IEEE 1212 address ranges on the local
36 link layers, reception of inbound requests to such
37 an address range, asynchronous response transmission
38 to inbound requests
39 - Addition of descriptors or directories to the local
40 nodes' Configuration ROM
41
42 Due to the different scope of operations and in order to let
43 userland implement different access permission models, some
44 operations are restricted to /dev/fw* files that are associated
45 with a local node:
46 - Addition of descriptors or directories to the local
47 nodes' Configuration ROM
48 - PHY packet transmission and reception
49
50 A /dev/fw* file remains associated with one particular node
51 during its entire life time. Bus topology changes, and hence
52 node ID changes, are tracked by firewire-core. ABI users do not
53 need to be aware of topology.
54
55 The following file operations are supported:
56
57 open(2)
58 Currently the only useful flags are O_RDWR.
59
60 ioctl(2)
61 Initiate various actions. Some take immediate effect, others
62 are performed asynchronously while or after the ioctl returns.
63 See the inline documentation in <linux/firewire-cdev.h> for
64 descriptions of all ioctls.
65
66 poll(2), select(2), epoll_wait(2) etc.
67 Watch for events to become available to be read.
68
69 read(2)
70 Receive various events. There are solicited events like
71 outbound asynchronous transaction completion or isochronous
72 buffer completion, and unsolicited events such as bus resets,
73 request reception, or PHY packet reception. Always use a read
74 buffer which is large enough to receive the largest event that
75 could ever arrive. See <linux/firewire-cdev.h> for descriptions
76 of all event types and for which ioctls affect reception of
77 events.
78
79 mmap(2)
80 Allocate a DMA buffer for isochronous reception or transmission
81 and map it into the process address space. The arguments should
82 be used as follows: addr = NULL, length = the desired buffer
83 size, i.e. number of packets times size of largest packet,
84 prot = at least PROT_READ for reception and at least PROT_WRITE
85 for transmission, flags = MAP_SHARED, fd = the handle to the
86 /dev/fw*, offset = 0.
87
88 Isochronous reception works in packet-per-buffer fashion except
89 for multichannel reception which works in buffer-fill mode.
90
91 munmap(2)
92 Unmap the isochronous I/O buffer from the process address space.
93
94 close(2)
95 Besides stopping and freeing I/O contexts that were associated
96 with the file descriptor, back out any changes to the local
97 nodes' Configuration ROM. Deallocate isochronous channels and
98 bandwidth at the IRM that were marked for kernel-assisted
99 re- and deallocation.
100
101Users: libraw1394
102 libdc1394
103 tools like jujuutils, fwhack, ...
diff --git a/Documentation/ABI/stable/sysfs-bus-firewire b/Documentation/ABI/stable/sysfs-bus-firewire
new file mode 100644
index 000000000000..3d484e5dc846
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-bus-firewire
@@ -0,0 +1,122 @@
1What: /sys/bus/firewire/devices/fw[0-9]+/
2Date: May 2007
3KernelVersion: 2.6.22
4Contact: linux1394-devel@lists.sourceforge.net
5Description:
6 IEEE 1394 node device attributes.
7 Read-only. Mutable during the node device's lifetime.
8 See IEEE 1212 for semantic definitions.
9
10 config_rom
11 Contents of the Configuration ROM register.
12 Binary attribute; an array of host-endian u32.
13
14 guid
15 The node's EUI-64 in the bus information block of
16 Configuration ROM.
17 Hexadecimal string representation of an u64.
18
19
20What: /sys/bus/firewire/devices/fw[0-9]+/units
21Date: June 2009
22KernelVersion: 2.6.31
23Contact: linux1394-devel@lists.sourceforge.net
24Description:
25 IEEE 1394 node device attribute.
26 Read-only. Mutable during the node device's lifetime.
27 See IEEE 1212 for semantic definitions.
28
29 units
30 Summary of all units present in an IEEE 1394 node.
31 Contains space-separated tuples of specifier_id and
32 version of each unit present in the node. Specifier_id
33 and version are hexadecimal string representations of
34 u24 of the respective unit directory entries.
35 Specifier_id and version within each tuple are separated
36 by a colon.
37
38Users: udev rules to set ownership and access permissions or ACLs of
39 /dev/fw[0-9]+ character device files
40
41
42What: /sys/bus/firewire/devices/fw[0-9]+[.][0-9]+/
43Date: May 2007
44KernelVersion: 2.6.22
45Contact: linux1394-devel@lists.sourceforge.net
46Description:
47 IEEE 1394 unit device attributes.
48 Read-only. Immutable during the unit device's lifetime.
49 See IEEE 1212 for semantic definitions.
50
51 modalias
52 Same as MODALIAS in the uevent at device creation.
53
54 rom_index
55 Offset of the unit directory within the parent device's
56 (node device's) Configuration ROM, in quadlets.
57 Decimal string representation.
58
59
60What: /sys/bus/firewire/devices/*/
61Date: May 2007
62KernelVersion: 2.6.22
63Contact: linux1394-devel@lists.sourceforge.net
64Description:
65 Attributes common to IEEE 1394 node devices and unit devices.
66 Read-only. Mutable during the node device's lifetime.
67 Immutable during the unit device's lifetime.
68 See IEEE 1212 for semantic definitions.
69
70 These attributes are only created if the root directory of an
71 IEEE 1394 node or the unit directory of an IEEE 1394 unit
72 actually contains according entries.
73
74 hardware_version
75 Hexadecimal string representation of an u24.
76
77 hardware_version_name
78 Contents of a respective textual descriptor leaf.
79
80 model
81 Hexadecimal string representation of an u24.
82
83 model_name
84 Contents of a respective textual descriptor leaf.
85
86 specifier_id
87 Hexadecimal string representation of an u24.
88 Mandatory in unit directories according to IEEE 1212.
89
90 vendor
91 Hexadecimal string representation of an u24.
92 Mandatory in the root directory according to IEEE 1212.
93
94 vendor_name
95 Contents of a respective textual descriptor leaf.
96
97 version
98 Hexadecimal string representation of an u24.
99 Mandatory in unit directories according to IEEE 1212.
100
101
102What: /sys/bus/firewire/drivers/sbp2/fw*/host*/target*/*:*:*:*/ieee1394_id
103 formerly
104 /sys/bus/ieee1394/drivers/sbp2/fw*/host*/target*/*:*:*:*/ieee1394_id
105Date: Feb 2004
106KernelVersion: 2.6.4
107Contact: linux1394-devel@lists.sourceforge.net
108Description:
109 SCSI target port identifier and logical unit identifier of a
110 logical unit of an SBP-2 target. The identifiers are specified
111 in SAM-2...SAM-4 annex A. They are persistent and world-wide
112 unique properties of the SBP-2 attached target.
113
114 Read-only attribute, immutable during the target's lifetime.
115 Format, as exposed by firewire-sbp2 since 2.6.22, May 2007:
116 Colon-separated hexadecimal string representations of
117 u64 EUI-64 : u24 directory_ID : u16 LUN
118 without 0x prefixes, without whitespace. The former sbp2 driver
119 (removed in 2.6.37 after being superseded by firewire-sbp2) used
120 a somewhat shorter format which was not as close to SAM.
121
122Users: udev rules to create /dev/disk/by-id/ symlinks
diff --git a/Documentation/ABI/stable/vdso b/Documentation/ABI/stable/vdso
new file mode 100644
index 000000000000..8a1cbb594497
--- /dev/null
+++ b/Documentation/ABI/stable/vdso
@@ -0,0 +1,27 @@
1On some architectures, when the kernel loads any userspace program it
2maps an ELF DSO into that program's address space. This DSO is called
3the vDSO and it often contains useful and highly-optimized alternatives
4to real syscalls.
5
6These functions are called just like ordinary C functions according to
7your platform's ABI. Call them from a sensible context. (For example,
8if you set CS on x86 to something strange, the vDSO functions are
9within their rights to crash.) In addition, if you pass a bad
10pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT.
11
12To find the DSO, parse the auxiliary vector passed to the program's
13entry point. The AT_SYSINFO_EHDR entry will point to the vDSO.
14
15The vDSO uses symbol versioning; whenever you request a symbol from the
16vDSO, specify the version you are expecting.
17
18Programs that dynamically link to glibc will use the vDSO automatically.
19Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c.
20
21Unless otherwise noted, the set of symbols with any given version and the
22ABI of those symbols is considered stable. It may vary across architectures,
23though.
24
25(As of this writing, this ABI documentation has been confirmed for x86_64.
26 The maintainers of the other vDSO-using architectures should confirm
27 that it is correct for their architecture.) \ No newline at end of file
diff --git a/Documentation/ABI/testing/sysfs-class-backlight-driver-adp8870 b/Documentation/ABI/testing/sysfs-class-backlight-driver-adp8870
new file mode 100644
index 000000000000..aa11dbdd794b
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-backlight-driver-adp8870
@@ -0,0 +1,56 @@
1What: /sys/class/backlight/<backlight>/<ambient light zone>_max
2What: /sys/class/backlight/<backlight>/l1_daylight_max
3What: /sys/class/backlight/<backlight>/l2_bright_max
4What: /sys/class/backlight/<backlight>/l3_office_max
5What: /sys/class/backlight/<backlight>/l4_indoor_max
6What: /sys/class/backlight/<backlight>/l5_dark_max
7Date: May 2011
8KernelVersion: 2.6.40
9Contact: device-drivers-devel@blackfin.uclinux.org
10Description:
11 Control the maximum brightness for <ambient light zone>
12 on this <backlight>. Values are between 0 and 127. This file
13 will also show the brightness level stored for this
14 <ambient light zone>.
15
16What: /sys/class/backlight/<backlight>/<ambient light zone>_dim
17What: /sys/class/backlight/<backlight>/l2_bright_dim
18What: /sys/class/backlight/<backlight>/l3_office_dim
19What: /sys/class/backlight/<backlight>/l4_indoor_dim
20What: /sys/class/backlight/<backlight>/l5_dark_dim
21Date: May 2011
22KernelVersion: 2.6.40
23Contact: device-drivers-devel@blackfin.uclinux.org
24Description:
25 Control the dim brightness for <ambient light zone>
26 on this <backlight>. Values are between 0 and 127, typically
27 set to 0. Full off when the backlight is disabled.
28 This file will also show the dim brightness level stored for
29 this <ambient light zone>.
30
31What: /sys/class/backlight/<backlight>/ambient_light_level
32Date: May 2011
33KernelVersion: 2.6.40
34Contact: device-drivers-devel@blackfin.uclinux.org
35Description:
36 Get conversion value of the light sensor.
37 This value is updated every 80 ms (when the light sensor
38 is enabled). Returns integer between 0 (dark) and
39 8000 (max ambient brightness)
40
41What: /sys/class/backlight/<backlight>/ambient_light_zone
42Date: May 2011
43KernelVersion: 2.6.40
44Contact: device-drivers-devel@blackfin.uclinux.org
45Description:
46 Get/Set current ambient light zone. Reading returns
47 integer between 1..5 (1 = daylight, 2 = bright, ..., 5 = dark).
48 Writing a value between 1..5 forces the backlight controller
49 to enter the corresponding ambient light zone.
50 Writing 0 returns to normal/automatic ambient light level
51 operation. The ambient light sensing feature on these devices
52 is an extension to the API documented in
53 Documentation/ABI/stable/sysfs-class-backlight.
54 It can be enabled by writing the value stored in
55 /sys/class/backlight/<backlight>/max_brightness to
56 /sys/class/backlight/<backlight>/brightness. \ No newline at end of file
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-roccat-koneplus b/Documentation/ABI/testing/sysfs-driver-hid-roccat-koneplus
index c1b53b8bc2ae..65e6e5dd67e8 100644
--- a/Documentation/ABI/testing/sysfs-driver-hid-roccat-koneplus
+++ b/Documentation/ABI/testing/sysfs-driver-hid-roccat-koneplus
@@ -92,6 +92,14 @@ Description: The mouse has a tracking- and a distance-control-unit. These
92 This file is writeonly. 92 This file is writeonly.
93Users: http://roccat.sourceforge.net 93Users: http://roccat.sourceforge.net
94 94
95What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/koneplus/roccatkoneplus<minor>/talk
96Date: May 2011
97Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
98Description: Used to activate some easy* functions of the mouse from outside.
99 The data has to be 16 bytes long.
100 This file is writeonly.
101Users: http://roccat.sourceforge.net
102
95What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/koneplus/roccatkoneplus<minor>/tcu 103What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/koneplus/roccatkoneplus<minor>/tcu
96Date: October 2010 104Date: October 2010
97Contact: Stefan Achatz <erazor_de@users.sourceforge.net> 105Contact: Stefan Achatz <erazor_de@users.sourceforge.net>
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-wiimote b/Documentation/ABI/testing/sysfs-driver-hid-wiimote
new file mode 100644
index 000000000000..5d5a16ea57c6
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-hid-wiimote
@@ -0,0 +1,10 @@
1What: /sys/bus/hid/drivers/wiimote/<dev>/led1
2What: /sys/bus/hid/drivers/wiimote/<dev>/led2
3What: /sys/bus/hid/drivers/wiimote/<dev>/led3
4What: /sys/bus/hid/drivers/wiimote/<dev>/led4
5Date: July 2011
6KernelVersion: 3.1
7Contact: David Herrmann <dh.herrmann@googlemail.com>
8Description: Make it possible to set/get current led state. Reading from it
9 returns 0 if led is off and 1 if it is on. Writing 0 to it
10 disables the led, writing 1 enables it.
diff --git a/Documentation/Changes b/Documentation/Changes
index 5f4828a034e3..b17580885273 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -2,13 +2,7 @@ Intro
2===== 2=====
3 3
4This document is designed to provide a list of the minimum levels of 4This document is designed to provide a list of the minimum levels of
5software necessary to run the 2.6 kernels, as well as provide brief 5software necessary to run the 3.0 kernels.
6instructions regarding any other "Gotchas" users may encounter when
7trying life on the Bleeding Edge. If upgrading from a pre-2.4.x
8kernel, please consult the Changes file included with 2.4.x kernels for
9additional information; most of that information will not be repeated
10here. Basically, this document assumes that your system is already
11functional and running at least 2.4.x kernels.
12 6
13This document is originally based on my "Changes" file for 2.0.x kernels 7This document is originally based on my "Changes" file for 2.0.x kernels
14and therefore owes credit to the same people as that file (Jared Mauch, 8and therefore owes credit to the same people as that file (Jared Mauch,
@@ -22,11 +16,10 @@ Upgrade to at *least* these software revisions before thinking you've
22encountered a bug! If you're unsure what version you're currently 16encountered a bug! If you're unsure what version you're currently
23running, the suggested command should tell you. 17running, the suggested command should tell you.
24 18
25Again, keep in mind that this list assumes you are already 19Again, keep in mind that this list assumes you are already functionally
26functionally running a Linux 2.4 kernel. Also, not all tools are 20running a Linux kernel. Also, not all tools are necessary on all
27necessary on all systems; obviously, if you don't have any ISDN 21systems; obviously, if you don't have any ISDN hardware, for example,
28hardware, for example, you probably needn't concern yourself with 22you probably needn't concern yourself with isdn4k-utils.
29isdn4k-utils.
30 23
31o Gnu C 3.2 # gcc --version 24o Gnu C 3.2 # gcc --version
32o Gnu make 3.80 # make --version 25o Gnu make 3.80 # make --version
@@ -114,12 +107,12 @@ Ksymoops
114 107
115If the unthinkable happens and your kernel oopses, you may need the 108If the unthinkable happens and your kernel oopses, you may need the
116ksymoops tool to decode it, but in most cases you don't. 109ksymoops tool to decode it, but in most cases you don't.
117In the 2.6 kernel it is generally preferred to build the kernel with 110It is generally preferred to build the kernel with CONFIG_KALLSYMS so
118CONFIG_KALLSYMS so that it produces readable dumps that can be used as-is 111that it produces readable dumps that can be used as-is (this also
119(this also produces better output than ksymoops). 112produces better output than ksymoops). If for some reason your kernel
120If for some reason your kernel is not build with CONFIG_KALLSYMS and 113is not build with CONFIG_KALLSYMS and you have no way to rebuild and
121you have no way to rebuild and reproduce the Oops with that option, then 114reproduce the Oops with that option, then you can still decode that Oops
122you can still decode that Oops with ksymoops. 115with ksymoops.
123 116
124Module-Init-Tools 117Module-Init-Tools
125----------------- 118-----------------
@@ -261,8 +254,8 @@ needs to be recompiled or (preferably) upgraded.
261NFS-utils 254NFS-utils
262--------- 255---------
263 256
264In 2.4 and earlier kernels, the nfs server needed to know about any 257In ancient (2.4 and earlier) kernels, the nfs server needed to know
265client that expected to be able to access files via NFS. This 258about any client that expected to be able to access files via NFS. This
266information would be given to the kernel by "mountd" when the client 259information would be given to the kernel by "mountd" when the client
267mounted the filesystem, or by "exportfs" at system startup. exportfs 260mounted the filesystem, or by "exportfs" at system startup. exportfs
268would take information about active clients from /var/lib/nfs/rmtab. 261would take information about active clients from /var/lib/nfs/rmtab.
@@ -272,11 +265,11 @@ which is not always easy, particularly when trying to implement
272fail-over. Even when the system is working well, rmtab suffers from 265fail-over. Even when the system is working well, rmtab suffers from
273getting lots of old entries that never get removed. 266getting lots of old entries that never get removed.
274 267
275With 2.6 we have the option of having the kernel tell mountd when it 268With modern kernels we have the option of having the kernel tell mountd
276gets a request from an unknown host, and mountd can give appropriate 269when it gets a request from an unknown host, and mountd can give
277export information to the kernel. This removes the dependency on 270appropriate export information to the kernel. This removes the
278rmtab and means that the kernel only needs to know about currently 271dependency on rmtab and means that the kernel only needs to know about
279active clients. 272currently active clients.
280 273
281To enable this new functionality, you need to: 274To enable this new functionality, you need to:
282 275
diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle
index 58b0bf917834..fa6e25b94a54 100644
--- a/Documentation/CodingStyle
+++ b/Documentation/CodingStyle
@@ -680,8 +680,8 @@ ones already enabled by DEBUG.
680 Chapter 14: Allocating memory 680 Chapter 14: Allocating memory
681 681
682The kernel provides the following general purpose memory allocators: 682The kernel provides the following general purpose memory allocators:
683kmalloc(), kzalloc(), kcalloc(), and vmalloc(). Please refer to the API 683kmalloc(), kzalloc(), kcalloc(), vmalloc(), and vzalloc(). Please refer to
684documentation for further information about them. 684the API documentation for further information about them.
685 685
686The preferred form for passing a size of a struct is the following: 686The preferred form for passing a size of a struct is the following:
687 687
diff --git a/Documentation/DocBook/80211.tmpl b/Documentation/DocBook/80211.tmpl
index 8906648f962b..445289cd0e65 100644
--- a/Documentation/DocBook/80211.tmpl
+++ b/Documentation/DocBook/80211.tmpl
@@ -402,8 +402,9 @@
402!Finclude/net/mac80211.h set_key_cmd 402!Finclude/net/mac80211.h set_key_cmd
403!Finclude/net/mac80211.h ieee80211_key_conf 403!Finclude/net/mac80211.h ieee80211_key_conf
404!Finclude/net/mac80211.h ieee80211_key_flags 404!Finclude/net/mac80211.h ieee80211_key_flags
405!Finclude/net/mac80211.h ieee80211_tkip_key_type 405!Finclude/net/mac80211.h ieee80211_get_tkip_p1k
406!Finclude/net/mac80211.h ieee80211_get_tkip_key 406!Finclude/net/mac80211.h ieee80211_get_tkip_p1k_iv
407!Finclude/net/mac80211.h ieee80211_get_tkip_p2k
407!Finclude/net/mac80211.h ieee80211_key_removed 408!Finclude/net/mac80211.h ieee80211_key_removed
408 </chapter> 409 </chapter>
409 410
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
index 7b3f49363413..07a9c48de5a2 100644
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -409,7 +409,7 @@ cond_resched(); /* Will sleep */
409 409
410 <para> 410 <para>
411 You should always compile your kernel 411 You should always compile your kernel
412 <symbol>CONFIG_DEBUG_SPINLOCK_SLEEP</symbol> on, and it will warn 412 <symbol>CONFIG_DEBUG_ATOMIC_SLEEP</symbol> on, and it will warn
413 you if you break these rules. If you <emphasis>do</emphasis> break 413 you if you break these rules. If you <emphasis>do</emphasis> break
414 the rules, you will eventually lock up your box. 414 the rules, you will eventually lock up your box.
415 </para> 415 </para>
diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index da0382daa395..7b13be41c085 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -53,7 +53,7 @@ kernel patches.
53 53
5412: Has been tested with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, 5412: Has been tested with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT,
55 CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, 55 CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES,
56 CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP all simultaneously 56 CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP all simultaneously
57 enabled. 57 enabled.
58 58
5913: Has been build- and runtime tested with and without CONFIG_SMP and 5913: Has been build- and runtime tested with and without CONFIG_SMP and
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt
index eda40fd39cad..d16a9849e60e 100644
--- a/Documentation/accounting/cgroupstats.txt
+++ b/Documentation/accounting/cgroupstats.txt
@@ -21,7 +21,7 @@ information will not be available.
21To extract cgroup statistics a utility very similar to getdelays.c 21To extract cgroup statistics a utility very similar to getdelays.c
22has been developed, the sample output of the utility is shown below 22has been developed, the sample output of the utility is shown below
23 23
24~/balbir/cgroupstats # ./getdelays -C "/cgroup/a" 24~/balbir/cgroupstats # ./getdelays -C "/sys/fs/cgroup/a"
25sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0 25sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
26~/balbir/cgroupstats # ./getdelays -C "/cgroup" 26~/balbir/cgroupstats # ./getdelays -C "/sys/fs/cgroup"
27sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2 27sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 465351d4cf85..84f0a15fc210 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -28,16 +28,19 @@ cgroups. Here is what you can do.
28- Enable group scheduling in CFQ 28- Enable group scheduling in CFQ
29 CONFIG_CFQ_GROUP_IOSCHED=y 29 CONFIG_CFQ_GROUP_IOSCHED=y
30 30
31- Compile and boot into kernel and mount IO controller (blkio). 31- Compile and boot into kernel and mount IO controller (blkio); see
32 cgroups.txt, Why are cgroups needed?.
32 33
33 mount -t cgroup -o blkio none /cgroup 34 mount -t tmpfs cgroup_root /sys/fs/cgroup
35 mkdir /sys/fs/cgroup/blkio
36 mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
34 37
35- Create two cgroups 38- Create two cgroups
36 mkdir -p /cgroup/test1/ /cgroup/test2 39 mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2
37 40
38- Set weights of group test1 and test2 41- Set weights of group test1 and test2
39 echo 1000 > /cgroup/test1/blkio.weight 42 echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight
40 echo 500 > /cgroup/test2/blkio.weight 43 echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight
41 44
42- Create two same size files (say 512MB each) on same disk (file1, file2) and 45- Create two same size files (say 512MB each) on same disk (file1, file2) and
43 launch two dd threads in different cgroup to read those files. 46 launch two dd threads in different cgroup to read those files.
@@ -46,12 +49,12 @@ cgroups. Here is what you can do.
46 echo 3 > /proc/sys/vm/drop_caches 49 echo 3 > /proc/sys/vm/drop_caches
47 50
48 dd if=/mnt/sdb/zerofile1 of=/dev/null & 51 dd if=/mnt/sdb/zerofile1 of=/dev/null &
49 echo $! > /cgroup/test1/tasks 52 echo $! > /sys/fs/cgroup/blkio/test1/tasks
50 cat /cgroup/test1/tasks 53 cat /sys/fs/cgroup/blkio/test1/tasks
51 54
52 dd if=/mnt/sdb/zerofile2 of=/dev/null & 55 dd if=/mnt/sdb/zerofile2 of=/dev/null &
53 echo $! > /cgroup/test2/tasks 56 echo $! > /sys/fs/cgroup/blkio/test2/tasks
54 cat /cgroup/test2/tasks 57 cat /sys/fs/cgroup/blkio/test2/tasks
55 58
56- At macro level, first dd should finish first. To get more precise data, keep 59- At macro level, first dd should finish first. To get more precise data, keep
57 on looking at (with the help of script), at blkio.disk_time and 60 on looking at (with the help of script), at blkio.disk_time and
@@ -68,13 +71,13 @@ Throttling/Upper Limit policy
68- Enable throttling in block layer 71- Enable throttling in block layer
69 CONFIG_BLK_DEV_THROTTLING=y 72 CONFIG_BLK_DEV_THROTTLING=y
70 73
71- Mount blkio controller 74- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)
72 mount -t cgroup -o blkio none /cgroup/blkio 75 mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
73 76
74- Specify a bandwidth rate on particular device for root group. The format 77- Specify a bandwidth rate on particular device for root group. The format
75 for policy is "<major>:<minor> <bytes_per_second>". 78 for policy is "<major>:<minor> <bytes_per_second>".
76 79
77 echo "8:16 1048576" > /cgroup/blkio/blkio.read_bps_device 80 echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
78 81
79 Above will put a limit of 1MB/second on reads happening for root group 82 Above will put a limit of 1MB/second on reads happening for root group
80 on device having major/minor number 8:16. 83 on device having major/minor number 8:16.
@@ -87,7 +90,7 @@ Throttling/Upper Limit policy
87 1024+0 records out 90 1024+0 records out
88 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s 91 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
89 92
90 Limits for writes can be put using blkio.write_bps_device file. 93 Limits for writes can be put using blkio.throttle.write_bps_device file.
91 94
92Hierarchical Cgroups 95Hierarchical Cgroups
93==================== 96====================
@@ -108,7 +111,7 @@ Hierarchical Cgroups
108 CFQ and throttling will practically treat all groups at same level. 111 CFQ and throttling will practically treat all groups at same level.
109 112
110 pivot 113 pivot
111 / | \ \ 114 / / \ \
112 root test1 test2 test3 115 root test1 test2 test3
113 116
114 Down the line we can implement hierarchical accounting/control support 117 Down the line we can implement hierarchical accounting/control support
@@ -149,7 +152,7 @@ Proportional weight policy files
149 152
150 Following is the format. 153 Following is the format.
151 154
152 #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device 155 # echo dev_maj:dev_minor weight > blkio.weight_device
153 Configure weight=300 on /dev/sdb (8:16) in this cgroup 156 Configure weight=300 on /dev/sdb (8:16) in this cgroup
154 # echo 8:16 300 > blkio.weight_device 157 # echo 8:16 300 > blkio.weight_device
155 # cat blkio.weight_device 158 # cat blkio.weight_device
@@ -283,28 +286,28 @@ Throttling/Upper limit policy files
283 specified in bytes per second. Rules are per device. Following is 286 specified in bytes per second. Rules are per device. Following is
284 the format. 287 the format.
285 288
286 echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.read_bps_device 289 echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
287 290
288- blkio.throttle.write_bps_device 291- blkio.throttle.write_bps_device
289 - Specifies upper limit on WRITE rate to the device. IO rate is 292 - Specifies upper limit on WRITE rate to the device. IO rate is
290 specified in bytes per second. Rules are per device. Following is 293 specified in bytes per second. Rules are per device. Following is
291 the format. 294 the format.
292 295
293 echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.write_bps_device 296 echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
294 297
295- blkio.throttle.read_iops_device 298- blkio.throttle.read_iops_device
296 - Specifies upper limit on READ rate from the device. IO rate is 299 - Specifies upper limit on READ rate from the device. IO rate is
297 specified in IO per second. Rules are per device. Following is 300 specified in IO per second. Rules are per device. Following is
298 the format. 301 the format.
299 302
300 echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.read_iops_device 303 echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
301 304
302- blkio.throttle.write_iops_device 305- blkio.throttle.write_iops_device
303 - Specifies upper limit on WRITE rate to the device. IO rate is 306 - Specifies upper limit on WRITE rate to the device. IO rate is
304 specified in io per second. Rules are per device. Following is 307 specified in io per second. Rules are per device. Following is
305 the format. 308 the format.
306 309
307 echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.write_iops_device 310 echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
308 311
309Note: If both BW and IOPS rules are specified for a device, then IO is 312Note: If both BW and IOPS rules are specified for a device, then IO is
310 subjected to both the constraints. 313 subjected to both the constraints.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 0ed99f08f1f3..cd67e90003c0 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -138,11 +138,11 @@ With the ability to classify tasks differently for different resources
138the admin can easily set up a script which receives exec notifications 138the admin can easily set up a script which receives exec notifications
139and depending on who is launching the browser he can 139and depending on who is launching the browser he can
140 140
141 # echo browser_pid > /mnt/<restype>/<userclass>/tasks 141 # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
142 142
143With only a single hierarchy, he now would potentially have to create 143With only a single hierarchy, he now would potentially have to create
144a separate cgroup for every browser launched and associate it with 144a separate cgroup for every browser launched and associate it with
145approp network and other resource class. This may lead to 145appropriate network and other resource class. This may lead to
146proliferation of such cgroups. 146proliferation of such cgroups.
147 147
148Also let's say that the administrator would like to give enhanced network 148Also let's say that the administrator would like to give enhanced network
@@ -153,9 +153,9 @@ apps enhanced CPU power,
153With ability to write pids directly to resource classes, it's just a 153With ability to write pids directly to resource classes, it's just a
154matter of : 154matter of :
155 155
156 # echo pid > /mnt/network/<new_class>/tasks 156 # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
157 (after some time) 157 (after some time)
158 # echo pid > /mnt/network/<orig_class>/tasks 158 # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
159 159
160Without this ability, he would have to split the cgroup into 160Without this ability, he would have to split the cgroup into
161multiple separate ones and then associate the new cgroups with the 161multiple separate ones and then associate the new cgroups with the
@@ -310,21 +310,24 @@ subsystem, this is the case for the cpuset.
310To start a new job that is to be contained within a cgroup, using 310To start a new job that is to be contained within a cgroup, using
311the "cpuset" cgroup subsystem, the steps are something like: 311the "cpuset" cgroup subsystem, the steps are something like:
312 312
313 1) mkdir /dev/cgroup 313 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
314 2) mount -t cgroup -ocpuset cpuset /dev/cgroup 314 2) mkdir /sys/fs/cgroup/cpuset
315 3) Create the new cgroup by doing mkdir's and write's (or echo's) in 315 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
316 the /dev/cgroup virtual file system. 316 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
317 4) Start a task that will be the "founding father" of the new job. 317 the /sys/fs/cgroup virtual file system.
318 5) Attach that task to the new cgroup by writing its pid to the 318 5) Start a task that will be the "founding father" of the new job.
319 /dev/cgroup tasks file for that cgroup. 319 6) Attach that task to the new cgroup by writing its pid to the
320 6) fork, exec or clone the job tasks from this founding father task. 320 /sys/fs/cgroup/cpuset/tasks file for that cgroup.
321 7) fork, exec or clone the job tasks from this founding father task.
321 322
322For example, the following sequence of commands will setup a cgroup 323For example, the following sequence of commands will setup a cgroup
323named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, 324named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
324and then start a subshell 'sh' in that cgroup: 325and then start a subshell 'sh' in that cgroup:
325 326
326 mount -t cgroup cpuset -ocpuset /dev/cgroup 327 mount -t tmpfs cgroup_root /sys/fs/cgroup
327 cd /dev/cgroup 328 mkdir /sys/fs/cgroup/cpuset
329 mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
330 cd /sys/fs/cgroup/cpuset
328 mkdir Charlie 331 mkdir Charlie
329 cd Charlie 332 cd Charlie
330 /bin/echo 2-3 > cpuset.cpus 333 /bin/echo 2-3 > cpuset.cpus
@@ -345,7 +348,7 @@ Creating, modifying, using the cgroups can be done through the cgroup
345virtual filesystem. 348virtual filesystem.
346 349
347To mount a cgroup hierarchy with all available subsystems, type: 350To mount a cgroup hierarchy with all available subsystems, type:
348# mount -t cgroup xxx /dev/cgroup 351# mount -t cgroup xxx /sys/fs/cgroup
349 352
350The "xxx" is not interpreted by the cgroup code, but will appear in 353The "xxx" is not interpreted by the cgroup code, but will appear in
351/proc/mounts so may be any useful identifying string that you like. 354/proc/mounts so may be any useful identifying string that you like.
@@ -354,23 +357,32 @@ Note: Some subsystems do not work without some user input first. For instance,
354if cpusets are enabled the user will have to populate the cpus and mems files 357if cpusets are enabled the user will have to populate the cpus and mems files
355for each new cgroup created before that group can be used. 358for each new cgroup created before that group can be used.
356 359
360As explained in section `1.2 Why are cgroups needed?' you should create
361different hierarchies of cgroups for each single resource or group of
362resources you want to control. Therefore, you should mount a tmpfs on
363/sys/fs/cgroup and create directories for each cgroup resource or resource
364group.
365
366# mount -t tmpfs cgroup_root /sys/fs/cgroup
367# mkdir /sys/fs/cgroup/rg1
368
357To mount a cgroup hierarchy with just the cpuset and memory 369To mount a cgroup hierarchy with just the cpuset and memory
358subsystems, type: 370subsystems, type:
359# mount -t cgroup -o cpuset,memory hier1 /dev/cgroup 371# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
360 372
361To change the set of subsystems bound to a mounted hierarchy, just 373To change the set of subsystems bound to a mounted hierarchy, just
362remount with different options: 374remount with different options:
363# mount -o remount,cpuset,blkio hier1 /dev/cgroup 375# mount -o remount,cpuset,blkio hier1 /sys/fs/cgroup/rg1
364 376
365Now memory is removed from the hierarchy and blkio is added. 377Now memory is removed from the hierarchy and blkio is added.
366 378
367Note this will add blkio to the hierarchy but won't remove memory or 379Note this will add blkio to the hierarchy but won't remove memory or
368cpuset, because the new options are appended to the old ones: 380cpuset, because the new options are appended to the old ones:
369# mount -o remount,blkio /dev/cgroup 381# mount -o remount,blkio /sys/fs/cgroup/rg1
370 382
371To specify a hierarchy's release_agent: 383To specify a hierarchy's release_agent:
372# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \ 384# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
373 xxx /dev/cgroup 385 xxx /sys/fs/cgroup/rg1
374 386
375Note that specifying 'release_agent' more than once will return failure. 387Note that specifying 'release_agent' more than once will return failure.
376 388
@@ -379,17 +391,17 @@ when the hierarchy consists of a single (root) cgroup. Supporting
379the ability to arbitrarily bind/unbind subsystems from an existing 391the ability to arbitrarily bind/unbind subsystems from an existing
380cgroup hierarchy is intended to be implemented in the future. 392cgroup hierarchy is intended to be implemented in the future.
381 393
382Then under /dev/cgroup you can find a tree that corresponds to the 394Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
383tree of the cgroups in the system. For instance, /dev/cgroup 395tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
384is the cgroup that holds the whole system. 396is the cgroup that holds the whole system.
385 397
386If you want to change the value of release_agent: 398If you want to change the value of release_agent:
387# echo "/sbin/new_release_agent" > /dev/cgroup/release_agent 399# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
388 400
389It can also be changed via remount. 401It can also be changed via remount.
390 402
391If you want to create a new cgroup under /dev/cgroup: 403If you want to create a new cgroup under /sys/fs/cgroup/rg1:
392# cd /dev/cgroup 404# cd /sys/fs/cgroup/rg1
393# mkdir my_cgroup 405# mkdir my_cgroup
394 406
395Now you want to do something with this cgroup. 407Now you want to do something with this cgroup.
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt
index 8b930946c52a..9d73cc0cadb9 100644
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -10,26 +10,25 @@ directly present in its group.
10 10
11Accounting groups can be created by first mounting the cgroup filesystem. 11Accounting groups can be created by first mounting the cgroup filesystem.
12 12
13# mkdir /cgroups 13# mount -t cgroup -ocpuacct none /sys/fs/cgroup
14# mount -t cgroup -ocpuacct none /cgroups 14
15 15With the above step, the initial or the parent accounting group becomes
16With the above step, the initial or the parent accounting group 16visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
17becomes visible at /cgroups. At bootup, this group includes all the 17the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
18tasks in the system. /cgroups/tasks lists the tasks in this cgroup. 18/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
19/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by 19by this group which is essentially the CPU time obtained by all the tasks
20this group which is essentially the CPU time obtained by all the tasks
21in the system. 20in the system.
22 21
23New accounting groups can be created under the parent group /cgroups. 22New accounting groups can be created under the parent group /sys/fs/cgroup.
24 23
25# cd /cgroups 24# cd /sys/fs/cgroup
26# mkdir g1 25# mkdir g1
27# echo $$ > g1 26# echo $$ > g1/tasks
28 27
29The above steps create a new group g1 and move the current shell 28The above steps create a new group g1 and move the current shell
30process (bash) into it. CPU time consumed by this bash and its children 29process (bash) into it. CPU time consumed by this bash and its children
31can be obtained from g1/cpuacct.usage and the same is accumulated in 30can be obtained from g1/cpuacct.usage and the same is accumulated in
32/cgroups/cpuacct.usage also. 31/sys/fs/cgroup/cpuacct.usage also.
33 32
34cpuacct.stat file lists a few statistics which further divide the 33cpuacct.stat file lists a few statistics which further divide the
35CPU time obtained by the cgroup into user and system times. Currently 34CPU time obtained by the cgroup into user and system times. Currently
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 98a30829af7a..5c51ed406d1d 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -180,7 +180,7 @@ files describing that cpuset:
180 - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset 180 - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
181 - cpuset.sched_relax_domain_level: the searching range when migrating tasks 181 - cpuset.sched_relax_domain_level: the searching range when migrating tasks
182 182
183In addition, the root cpuset only has the following file: 183In addition, only the root cpuset has the following file:
184 - cpuset.memory_pressure_enabled flag: compute memory_pressure? 184 - cpuset.memory_pressure_enabled flag: compute memory_pressure?
185 185
186New cpusets are created using the mkdir system call or shell 186New cpusets are created using the mkdir system call or shell
@@ -661,21 +661,21 @@ than stress the kernel.
661 661
662To start a new job that is to be contained within a cpuset, the steps are: 662To start a new job that is to be contained within a cpuset, the steps are:
663 663
664 1) mkdir /dev/cpuset 664 1) mkdir /sys/fs/cgroup/cpuset
665 2) mount -t cgroup -ocpuset cpuset /dev/cpuset 665 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
666 3) Create the new cpuset by doing mkdir's and write's (or echo's) in 666 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
667 the /dev/cpuset virtual file system. 667 the /sys/fs/cgroup/cpuset virtual file system.
668 4) Start a task that will be the "founding father" of the new job. 668 4) Start a task that will be the "founding father" of the new job.
669 5) Attach that task to the new cpuset by writing its pid to the 669 5) Attach that task to the new cpuset by writing its pid to the
670 /dev/cpuset tasks file for that cpuset. 670 /sys/fs/cgroup/cpuset tasks file for that cpuset.
671 6) fork, exec or clone the job tasks from this founding father task. 671 6) fork, exec or clone the job tasks from this founding father task.
672 672
673For example, the following sequence of commands will setup a cpuset 673For example, the following sequence of commands will setup a cpuset
674named "Charlie", containing just CPUs 2 and 3, and Memory Node 1, 674named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
675and then start a subshell 'sh' in that cpuset: 675and then start a subshell 'sh' in that cpuset:
676 676
677 mount -t cgroup -ocpuset cpuset /dev/cpuset 677 mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
678 cd /dev/cpuset 678 cd /sys/fs/cgroup/cpuset
679 mkdir Charlie 679 mkdir Charlie
680 cd Charlie 680 cd Charlie
681 /bin/echo 2-3 > cpuset.cpus 681 /bin/echo 2-3 > cpuset.cpus
@@ -710,14 +710,14 @@ Creating, modifying, using the cpusets can be done through the cpuset
710virtual filesystem. 710virtual filesystem.
711 711
712To mount it, type: 712To mount it, type:
713# mount -t cgroup -o cpuset cpuset /dev/cpuset 713# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
714 714
715Then under /dev/cpuset you can find a tree that corresponds to the 715Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
716tree of the cpusets in the system. For instance, /dev/cpuset 716tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
717is the cpuset that holds the whole system. 717is the cpuset that holds the whole system.
718 718
719If you want to create a new cpuset under /dev/cpuset: 719If you want to create a new cpuset under /sys/fs/cgroup/cpuset:
720# cd /dev/cpuset 720# cd /sys/fs/cgroup/cpuset
721# mkdir my_cpuset 721# mkdir my_cpuset
722 722
723Now you want to do something with this cpuset. 723Now you want to do something with this cpuset.
@@ -765,12 +765,12 @@ wrapper around the cgroup filesystem.
765 765
766The command 766The command
767 767
768mount -t cpuset X /dev/cpuset 768mount -t cpuset X /sys/fs/cgroup/cpuset
769 769
770is equivalent to 770is equivalent to
771 771
772mount -t cgroup -ocpuset,noprefix X /dev/cpuset 772mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
773echo "/sbin/cpuset_release_agent" > /dev/cpuset/release_agent 773echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
774 774
7752.2 Adding/removing cpus 7752.2 Adding/removing cpus
776------------------------ 776------------------------
diff --git a/Documentation/cgroups/devices.txt b/Documentation/cgroups/devices.txt
index 57ca4c89fe5c..16624a7f8222 100644
--- a/Documentation/cgroups/devices.txt
+++ b/Documentation/cgroups/devices.txt
@@ -22,16 +22,16 @@ removed from the child(ren).
22An entry is added using devices.allow, and removed using 22An entry is added using devices.allow, and removed using
23devices.deny. For instance 23devices.deny. For instance
24 24
25 echo 'c 1:3 mr' > /cgroups/1/devices.allow 25 echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
26 26
27allows cgroup 1 to read and mknod the device usually known as 27allows cgroup 1 to read and mknod the device usually known as
28/dev/null. Doing 28/dev/null. Doing
29 29
30 echo a > /cgroups/1/devices.deny 30 echo a > /sys/fs/cgroup/1/devices.deny
31 31
32will remove the default 'a *:* rwm' entry. Doing 32will remove the default 'a *:* rwm' entry. Doing
33 33
34 echo a > /cgroups/1/devices.allow 34 echo a > /sys/fs/cgroup/1/devices.allow
35 35
36will add the 'a *:* rwm' entry to the whitelist. 36will add the 'a *:* rwm' entry to the whitelist.
37 37
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt
index 41f37fea1276..c21d77742a07 100644
--- a/Documentation/cgroups/freezer-subsystem.txt
+++ b/Documentation/cgroups/freezer-subsystem.txt
@@ -59,28 +59,28 @@ is non-freezable.
59 59
60* Examples of usage : 60* Examples of usage :
61 61
62 # mkdir /containers 62 # mkdir /sys/fs/cgroup/freezer
63 # mount -t cgroup -ofreezer freezer /containers 63 # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
64 # mkdir /containers/0 64 # mkdir /sys/fs/cgroup/freezer/0
65 # echo $some_pid > /containers/0/tasks 65 # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
66 66
67to get status of the freezer subsystem : 67to get status of the freezer subsystem :
68 68
69 # cat /containers/0/freezer.state 69 # cat /sys/fs/cgroup/freezer/0/freezer.state
70 THAWED 70 THAWED
71 71
72to freeze all tasks in the container : 72to freeze all tasks in the container :
73 73
74 # echo FROZEN > /containers/0/freezer.state 74 # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
75 # cat /containers/0/freezer.state 75 # cat /sys/fs/cgroup/freezer/0/freezer.state
76 FREEZING 76 FREEZING
77 # cat /containers/0/freezer.state 77 # cat /sys/fs/cgroup/freezer/0/freezer.state
78 FROZEN 78 FROZEN
79 79
80to unfreeze all tasks in the container : 80to unfreeze all tasks in the container :
81 81
82 # echo THAWED > /containers/0/freezer.state 82 # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
83 # cat /containers/0/freezer.state 83 # cat /sys/fs/cgroup/freezer/0/freezer.state
84 THAWED 84 THAWED
85 85
86This is the basic mechanism which should do the right thing for user space task 86This is the basic mechanism which should do the right thing for user space task
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 7c163477fcd8..06eb6d957c83 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -1,8 +1,8 @@
1Memory Resource Controller 1Memory Resource Controller
2 2
3NOTE: The Memory Resource Controller has been generically been referred 3NOTE: The Memory Resource Controller has generically been referred to as the
4 to as the memory controller in this document. Do not confuse memory 4 memory controller in this document. Do not confuse memory controller
5 controller used here with the memory controller that is used in hardware. 5 used here with the memory controller that is used in hardware.
6 6
7(For editors) 7(For editors)
8In this document: 8In this document:
@@ -70,6 +70,7 @@ Brief summary of control files.
70 (See sysctl's vm.swappiness) 70 (See sysctl's vm.swappiness)
71 memory.move_charge_at_immigrate # set/show controls of moving charges 71 memory.move_charge_at_immigrate # set/show controls of moving charges
72 memory.oom_control # set/show oom controls. 72 memory.oom_control # set/show oom controls.
73 memory.numa_stat # show the number of memory usage per numa node
73 74
741. History 751. History
75 76
@@ -181,7 +182,7 @@ behind this approach is that a cgroup that aggressively uses a shared
181page will eventually get charged for it (once it is uncharged from 182page will eventually get charged for it (once it is uncharged from
182the cgroup that brought it in -- this will happen on memory pressure). 183the cgroup that brought it in -- this will happen on memory pressure).
183 184
184Exception: If CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not used.. 185Exception: If CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not used.
185When you do swapoff and make swapped-out pages of shmem(tmpfs) to 186When you do swapoff and make swapped-out pages of shmem(tmpfs) to
186be backed into memory in force, charges for pages are accounted against the 187be backed into memory in force, charges for pages are accounted against the
187caller of swapoff rather than the users of shmem. 188caller of swapoff rather than the users of shmem.
@@ -213,7 +214,7 @@ affecting global LRU, memory+swap limit is better than just limiting swap from
213OS point of view. 214OS point of view.
214 215
215* What happens when a cgroup hits memory.memsw.limit_in_bytes 216* What happens when a cgroup hits memory.memsw.limit_in_bytes
216When a cgroup his memory.memsw.limit_in_bytes, it's useless to do swap-out 217When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
217in this cgroup. Then, swap-out will not be done by cgroup routine and file 218in this cgroup. Then, swap-out will not be done by cgroup routine and file
218caches are dropped. But as mentioned above, global LRU can do swapout memory 219caches are dropped. But as mentioned above, global LRU can do swapout memory
219from it for sanity of the system's memory management state. You can't forbid 220from it for sanity of the system's memory management state. You can't forbid
@@ -263,16 +264,17 @@ b. Enable CONFIG_RESOURCE_COUNTERS
263c. Enable CONFIG_CGROUP_MEM_RES_CTLR 264c. Enable CONFIG_CGROUP_MEM_RES_CTLR
264d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension) 265d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension)
265 266
2661. Prepare the cgroups 2671. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
267# mkdir -p /cgroups 268# mount -t tmpfs none /sys/fs/cgroup
268# mount -t cgroup none /cgroups -o memory 269# mkdir /sys/fs/cgroup/memory
270# mount -t cgroup none /sys/fs/cgroup/memory -o memory
269 271
2702. Make the new group and move bash into it 2722. Make the new group and move bash into it
271# mkdir /cgroups/0 273# mkdir /sys/fs/cgroup/memory/0
272# echo $$ > /cgroups/0/tasks 274# echo $$ > /sys/fs/cgroup/memory/0/tasks
273 275
274Since now we're in the 0 cgroup, we can alter the memory limit: 276Since now we're in the 0 cgroup, we can alter the memory limit:
275# echo 4M > /cgroups/0/memory.limit_in_bytes 277# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
276 278
277NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, 279NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
278mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.) 280mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.)
@@ -280,11 +282,11 @@ mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.)
280NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited). 282NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
281NOTE: We cannot set limits on the root cgroup any more. 283NOTE: We cannot set limits on the root cgroup any more.
282 284
283# cat /cgroups/0/memory.limit_in_bytes 285# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
2844194304 2864194304
285 287
286We can check the usage: 288We can check the usage:
287# cat /cgroups/0/memory.usage_in_bytes 289# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
2881216512 2901216512
289 291
290A successful write to this file does not guarantee a successful set of 292A successful write to this file does not guarantee a successful set of
@@ -464,6 +466,24 @@ value for efficient access. (Of course, when necessary, it's synchronized.)
464If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP) 466If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
465value in memory.stat(see 5.2). 467value in memory.stat(see 5.2).
466 468
4695.6 numa_stat
470
471This is similar to numa_maps but operates on a per-memcg basis. This is
472useful for providing visibility into the numa locality information within
473a memcg since the pages are allowed to be allocated from any physical
474node. One of the usecases is evaluating application performance by
475combining this information with the application's cpu allocation.
476
477We export "total", "file", "anon" and "unevictable" pages per-node for
478each memcg. The output format of memory.numa_stat is:
479
480total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
481file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
482anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
483unevictable=<total unevictable pages> N0=<node 0 pages> N1=<node 1 pages> ...
484
485And we have total = file + anon + unevictable.
486
4676. Hierarchy support 4876. Hierarchy support
468 488
469The memory controller supports a deep hierarchy and hierarchical accounting. 489The memory controller supports a deep hierarchy and hierarchical accounting.
@@ -471,13 +491,13 @@ The hierarchy is created by creating the appropriate cgroups in the
471cgroup filesystem. Consider for example, the following cgroup filesystem 491cgroup filesystem. Consider for example, the following cgroup filesystem
472hierarchy 492hierarchy
473 493
474 root 494 root
475 / | \ 495 / | \
476 / | \ 496 / | \
477 a b c 497 a b c
478 | \ 498 | \
479 | \ 499 | \
480 d e 500 d e
481 501
482In the diagram above, with hierarchical accounting enabled, all memory 502In the diagram above, with hierarchical accounting enabled, all memory
483usage of e, is accounted to its ancestors up until the root (i.e, c and root), 503usage of e, is accounted to its ancestors up until the root (i.e, c and root),
diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding
index f3f1a469443c..83f5f5b365a3 100644
--- a/Documentation/development-process/4.Coding
+++ b/Documentation/development-process/4.Coding
@@ -244,7 +244,7 @@ testing purposes. In particular, you should turn on:
244 - DEBUG_SLAB can find a variety of memory allocation and use errors; it 244 - DEBUG_SLAB can find a variety of memory allocation and use errors; it
245 should be used on most development kernels. 245 should be used on most development kernels.
246 246
247 - DEBUG_SPINLOCK, DEBUG_SPINLOCK_SLEEP, and DEBUG_MUTEXES will find a 247 - DEBUG_SPINLOCK, DEBUG_ATOMIC_SLEEP, and DEBUG_MUTEXES will find a
248 number of common locking errors. 248 number of common locking errors.
249 249
250There are quite a few other debugging options, some of which will be 250There are quite a few other debugging options, some of which will be
diff --git a/Documentation/devicetree/bindings/arm/primecell.txt b/Documentation/devicetree/bindings/arm/primecell.txt
new file mode 100644
index 000000000000..1d5d7a870ec7
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/primecell.txt
@@ -0,0 +1,21 @@
1* ARM Primecell Peripherals
2
3ARM, Ltd. Primecell peripherals have a standard id register that can be used to
4identify the peripheral type, vendor, and revision. This value can be used for
5driver matching.
6
7Required properties:
8
9- compatible : should be a specific value for peripheral and "arm,primecell"
10
11Optional properties:
12
13- arm,primecell-periphid : Value to override the h/w value with
14
15Example:
16
17serial@fff36000 {
18 compatible = "arm,pl011", "arm,primecell";
19 arm,primecell-periphid = <0x00341011>;
20};
21
diff --git a/Documentation/devicetree/bindings/powerpc/fsl/sec.txt b/Documentation/devicetree/bindings/crypto/fsl-sec2.txt
index 2b6f2d45c45a..38988ef1336b 100644
--- a/Documentation/devicetree/bindings/powerpc/fsl/sec.txt
+++ b/Documentation/devicetree/bindings/crypto/fsl-sec2.txt
@@ -1,4 +1,4 @@
1Freescale SoC SEC Security Engines 1Freescale SoC SEC Security Engines versions 2.x-3.x
2 2
3Required properties: 3Required properties:
4 4
diff --git a/Documentation/devicetree/bindings/gpio/fsl-imx-gpio.txt b/Documentation/devicetree/bindings/gpio/fsl-imx-gpio.txt
new file mode 100644
index 000000000000..4363ae4b3c14
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/fsl-imx-gpio.txt
@@ -0,0 +1,22 @@
1* Freescale i.MX/MXC GPIO controller
2
3Required properties:
4- compatible : Should be "fsl,<soc>-gpio"
5- reg : Address and length of the register set for the device
6- interrupts : Should be the port interrupt shared by all 32 pins, if
7 one number. If two numbers, the first one is the interrupt shared
8 by low 16 pins and the second one is for high 16 pins.
9- gpio-controller : Marks the device node as a gpio controller.
10- #gpio-cells : Should be two. The first cell is the pin number and
11 the second cell is used to specify optional parameters (currently
12 unused).
13
14Example:
15
16gpio0: gpio@73f84000 {
17 compatible = "fsl,imx51-gpio", "fsl,imx31-gpio";
18 reg = <0x73f84000 0x4000>;
19 interrupts = <50 51>;
20 gpio-controller;
21 #gpio-cells = <2>;
22};
diff --git a/Documentation/devicetree/bindings/gpio/gpio.txt b/Documentation/devicetree/bindings/gpio/gpio.txt
index edaa84d288a1..4e16ba4feab0 100644
--- a/Documentation/devicetree/bindings/gpio/gpio.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio.txt
@@ -4,17 +4,45 @@ Specifying GPIO information for devices
41) gpios property 41) gpios property
5----------------- 5-----------------
6 6
7Nodes that make use of GPIOs should define them using `gpios' property, 7Nodes that make use of GPIOs should specify them using one or more
8format of which is: <&gpio-controller1-phandle gpio1-specifier 8properties, each containing a 'gpio-list':
9 &gpio-controller2-phandle gpio2-specifier
10 0 /* holes are permitted, means no GPIO 3 */
11 &gpio-controller4-phandle gpio4-specifier
12 ...>;
13 9
14Note that gpio-specifier length is controller dependent. 10 gpio-list ::= <single-gpio> [gpio-list]
11 single-gpio ::= <gpio-phandle> <gpio-specifier>
12 gpio-phandle : phandle to gpio controller node
13 gpio-specifier : Array of #gpio-cells specifying specific gpio
14 (controller specific)
15
16GPIO properties should be named "[<name>-]gpios". Exact
17meaning of each gpios property must be documented in the device tree
18binding for each device.
19
20For example, the following could be used to describe gpios pins to use
21as chip select lines; with chip selects 0, 1 and 3 populated, and chip
22select 2 left empty:
23
24 gpio1: gpio1 {
25 gpio-controller;
26 #gpio-cells = <2>;
27 };
28 gpio2: gpio2 {
29 gpio-controller;
30 #gpio-cells = <1>;
31 };
32 [...]
33 chipsel-gpios = <&gpio1 12 0>,
34 <&gpio1 13 0>,
35 <0>, /* holes are permitted, means no GPIO 2 */
36 <&gpio2 2>;
37
38Note that gpio-specifier length is controller dependent. In the
39above example, &gpio1 uses 2 cells to specify a gpio, while &gpio2
40only uses one.
15 41
16gpio-specifier may encode: bank, pin position inside the bank, 42gpio-specifier may encode: bank, pin position inside the bank,
17whether pin is open-drain and whether pin is logically inverted. 43whether pin is open-drain and whether pin is logically inverted.
44Exact meaning of each specifier cell is controller specific, and must
45be documented in the device tree binding for the device.
18 46
19Example of the node using GPIOs: 47Example of the node using GPIOs:
20 48
@@ -28,8 +56,8 @@ and empty GPIO flags as accepted by the "qe_pio_e" gpio-controller.
282) gpio-controller nodes 562) gpio-controller nodes
29------------------------ 57------------------------
30 58
31Every GPIO controller node must have #gpio-cells property defined, 59Every GPIO controller node must have both an empty "gpio-controller"
32this information will be used to translate gpio-specifiers. 60property, and have #gpio-cells contain the size of the gpio-specifier.
33 61
34Example of two SOC GPIO banks defined as gpio-controller nodes: 62Example of two SOC GPIO banks defined as gpio-controller nodes:
35 63
diff --git a/Documentation/devicetree/bindings/gpio/gpio_nvidia.txt b/Documentation/devicetree/bindings/gpio/gpio_nvidia.txt
new file mode 100644
index 000000000000..eb4b530d64e1
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/gpio_nvidia.txt
@@ -0,0 +1,8 @@
1NVIDIA Tegra 2 GPIO controller
2
3Required properties:
4- compatible : "nvidia,tegra20-gpio"
5- #gpio-cells : Should be two. The first cell is the pin number and the
6 second cell is used to specify optional parameters:
7 - bit 0 specifies polarity (0 for normal, 1 for inverted)
8- gpio-controller : Marks the device node as a GPIO controller.
diff --git a/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt b/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
new file mode 100644
index 000000000000..9841057d112b
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
@@ -0,0 +1,22 @@
1* Freescale (Enhanced) Configurable Serial Peripheral Interface
2 (CSPI/eCSPI) for i.MX
3
4Required properties:
5- compatible : Should be "fsl,<soc>-cspi" or "fsl,<soc>-ecspi"
6- reg : Offset and length of the register set for the device
7- interrupts : Should contain CSPI/eCSPI interrupt
8- fsl,spi-num-chipselects : Contains the number of chipselects
9- cs-gpios : Specifies the gpio pins to be used for chipselects.
10
11Example:
12
13ecspi@70010000 {
14 #address-cells = <1>;
15 #size-cells = <0>;
16 compatible = "fsl,imx51-ecspi";
17 reg = <0x70010000 0x4000>;
18 interrupts = <36>;
19 fsl,spi-num-chipselects = <2>;
20 cs-gpios = <&gpio3 24 0>, /* GPIO4_24 */
21 <&gpio3 25 0>; /* GPIO4_25 */
22};
diff --git a/Documentation/devicetree/bindings/spi/spi_nvidia.txt b/Documentation/devicetree/bindings/spi/spi_nvidia.txt
new file mode 100644
index 000000000000..6b9e51896693
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/spi_nvidia.txt
@@ -0,0 +1,5 @@
1NVIDIA Tegra 2 SPI device
2
3Required properties:
4- compatible : should be "nvidia,tegra20-spi".
5- gpios : should specify GPIOs used for chipselect.
diff --git a/Documentation/devicetree/bindings/tty/serial/of-serial.txt b/Documentation/devicetree/bindings/tty/serial/of-serial.txt
new file mode 100644
index 000000000000..b8b27b0aca10
--- /dev/null
+++ b/Documentation/devicetree/bindings/tty/serial/of-serial.txt
@@ -0,0 +1,36 @@
1* UART (Universal Asynchronous Receiver/Transmitter)
2
3Required properties:
4- compatible : one of:
5 - "ns8250"
6 - "ns16450"
7 - "ns16550a"
8 - "ns16550"
9 - "ns16750"
10 - "ns16850"
11 - "nvidia,tegra20-uart"
12 - "ibm,qpace-nwp-serial"
13 - "serial" if the port type is unknown.
14- reg : offset and length of the register set for the device.
15- interrupts : should contain uart interrupt.
16- clock-frequency : the input clock frequency for the UART.
17
18Optional properties:
19- current-speed : the current active speed of the UART.
20- reg-offset : offset to apply to the mapbase from the start of the registers.
21- reg-shift : quantity to shift the register offsets by.
22- reg-io-width : the size (in bytes) of the IO accesses that should be
23 performed on the device. There are some systems that require 32-bit
24 accesses to the UART (e.g. TI davinci).
25- used-by-rtas : set to indicate that the port is in use by the OpenFirmware
26 RTAS and should not be registered.
27
28Example:
29
30 uart@80230000 {
31 compatible = "ns8250";
32 reg = <0x80230000 0x100>;
33 clock-frequency = <3686400>;
34 interrupts = <10>;
35 reg-shift = <2>;
36 };
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 1a9446b59153..d59e71df5c5c 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -481,23 +481,6 @@ Who: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
481 481
482---------------------------- 482----------------------------
483 483
484What: namespace cgroup (ns_cgroup)
485When: 2.6.38
486Why: The ns_cgroup leads to some problems:
487 * cgroup creation is out-of-control
488 * cgroup name can conflict when pids are looping
489 * it is not possible to have a single process handling
490 a lot of namespaces without falling in a exponential creation time
491 * we may want to create a namespace without creating a cgroup
492
493 The ns_cgroup is replaced by a compatibility flag 'clone_children',
494 where a newly created cgroup will copy the parent cgroup values.
495 The userspace has to manually create a cgroup and add a task to
496 the 'tasks' file.
497Who: Daniel Lezcano <daniel.lezcano@free.fr>
498
499----------------------------
500
501What: iwlwifi disable_hw_scan module parameters 484What: iwlwifi disable_hw_scan module parameters
502When: 2.6.40 485When: 2.6.40
503Why: Hardware scan is the preferred method for iwlwifi devices for 486Why: Hardware scan is the preferred method for iwlwifi devices for
@@ -518,16 +501,6 @@ Who: NeilBrown <neilb@suse.de>
518 501
519---------------------------- 502----------------------------
520 503
521What: cancel_rearming_delayed_work[queue]()
522When: 2.6.39
523
524Why: The functions have been superceded by cancel_delayed_work_sync()
525 quite some time ago. The conversion is trivial and there is no
526 in-kernel user left.
527Who: Tejun Heo <tj@kernel.org>
528
529----------------------------
530
531What: Legacy, non-standard chassis intrusion detection interface. 504What: Legacy, non-standard chassis intrusion detection interface.
532When: June 2011 505When: June 2011
533Why: The adm9240, w83792d and w83793 hardware monitoring drivers have 506Why: The adm9240, w83792d and w83793 hardware monitoring drivers have
@@ -600,3 +573,25 @@ Why: Superseded by the UVCIOC_CTRL_QUERY ioctl.
600Who: Laurent Pinchart <laurent.pinchart@ideasonboard.com> 573Who: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
601 574
602---------------------------- 575----------------------------
576
577What: For VIDIOC_S_FREQUENCY the type field must match the device node's type.
578 If not, return -EINVAL.
579When: 3.2
580Why: It makes no sense to switch the tuner to radio mode by calling
581 VIDIOC_S_FREQUENCY on a video node, or to switch the tuner to tv mode by
582 calling VIDIOC_S_FREQUENCY on a radio node. This is the first step of a
583 move to more consistent handling of tv and radio tuners.
584Who: Hans Verkuil <hans.verkuil@cisco.com>
585
586----------------------------
587
588What: Opening a radio device node will no longer automatically switch the
589 tuner mode from tv to radio.
590When: 3.3
591Why: Just opening a V4L device should not change the state of the hardware
592 like that. It's very unexpected and against the V4L spec. Instead, you
593 switch to radio mode by calling VIDIOC_S_FREQUENCY. This is the second
594 and last step of the move to consistent handling of tv and radio tuners.
595Who: Hans Verkuil <hans.verkuil@cisco.com>
596
597----------------------------
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 57d827d6071d..ca7e25292542 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -52,7 +52,7 @@ ata *);
52 void (*put_link) (struct dentry *, struct nameidata *, void *); 52 void (*put_link) (struct dentry *, struct nameidata *, void *);
53 void (*truncate) (struct inode *); 53 void (*truncate) (struct inode *);
54 int (*permission) (struct inode *, int, unsigned int); 54 int (*permission) (struct inode *, int, unsigned int);
55 int (*check_acl)(struct inode *, int, unsigned int); 55 int (*check_acl)(struct inode *, int);
56 int (*setattr) (struct dentry *, struct iattr *); 56 int (*setattr) (struct dentry *, struct iattr *);
57 int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); 57 int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
58 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); 58 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
@@ -412,7 +412,7 @@ prototypes:
412 int (*open) (struct inode *, struct file *); 412 int (*open) (struct inode *, struct file *);
413 int (*flush) (struct file *); 413 int (*flush) (struct file *);
414 int (*release) (struct inode *, struct file *); 414 int (*release) (struct inode *, struct file *);
415 int (*fsync) (struct file *, int datasync); 415 int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
416 int (*aio_fsync) (struct kiocb *, int datasync); 416 int (*aio_fsync) (struct kiocb *, int datasync);
417 int (*fasync) (int, struct file *, int); 417 int (*fasync) (int, struct file *, int);
418 int (*lock) (struct file *, int, struct file_lock *); 418 int (*lock) (struct file *, int, struct file_lock *);
@@ -438,9 +438,7 @@ prototypes:
438 438
439locking rules: 439locking rules:
440 All may block except for ->setlease. 440 All may block except for ->setlease.
441 No VFS locks held on entry except for ->fsync and ->setlease. 441 No VFS locks held on entry except for ->setlease.
442
443->fsync() has i_mutex on inode.
444 442
445->setlease has the file_list_lock held and must not sleep. 443->setlease has the file_list_lock held and must not sleep.
446 444
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index a167ab876c35..7cc6bf2871eb 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -673,6 +673,22 @@ storage request to complete, or it may attempt to cancel the storage request -
673in which case the page will not be stored in the cache this time. 673in which case the page will not be stored in the cache this time.
674 674
675 675
676BULK INODE PAGE UNCACHE
677-----------------------
678
679A convenience routine is provided to perform an uncache on all the pages
680attached to an inode. This assumes that the pages on the inode correspond on a
6811:1 basis with the pages in the cache.
682
683 void fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
684 struct inode *inode);
685
686This takes the netfs cookie that the pages were cached with and the inode that
687the pages are attached to. This function will wait for pages to finish being
688written to the cache and for the cache to finish with the page generally. No
689error is returned.
690
691
676========================== 692==========================
677INDEX AND DATA FILE UPDATE 693INDEX AND DATA FILE UPDATE
678========================== 694==========================
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index d5c0cef38a71..873a2ab2e9f8 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -40,7 +40,6 @@ Features which NILFS2 does not support yet:
40 - POSIX ACLs 40 - POSIX ACLs
41 - quotas 41 - quotas
42 - fsck 42 - fsck
43 - resize
44 - defragmentation 43 - defragmentation
45 44
46Mount options 45Mount options
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 6e29954851a2..7f8861d341ea 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -400,10 +400,31 @@ a file off.
400 400
401-- 401--
402[mandatory] 402[mandatory]
403
404--
405[mandatory]
406 ->get_sb() is gone. Switch to use of ->mount(). Typically it's just 403 ->get_sb() is gone. Switch to use of ->mount(). Typically it's just
407a matter of switching from calling get_sb_... to mount_... and changing the 404a matter of switching from calling get_sb_... to mount_... and changing the
408function type. If you were doing it manually, just switch from setting ->mnt_root 405function type. If you were doing it manually, just switch from setting ->mnt_root
409to some pointer to returning that pointer. On errors return ERR_PTR(...). 406to some pointer to returning that pointer. On errors return ERR_PTR(...).
407
408--
409[mandatory]
410 ->permission(), generic_permission() and ->check_acl() have lost flags
411argument; instead of passing IPERM_FLAG_RCU we add MAY_NOT_BLOCK into mask.
412 generic_permission() has also lost the check_acl argument; if you want
413non-NULL to be used for that inode, put it into ->i_op->check_acl.
414
415--
416[mandatory]
417 If you implement your own ->llseek() you must handle SEEK_HOLE and
418SEEK_DATA. You can handle this by returning -EINVAL, but it would be nicer to
419support it in some way. The generic handler assumes that the entire file is
420data and there is a virtual hole at the end of the file. So if the provided
421offset is less than i_size and SEEK_DATA is specified, return the same offset.
422If the above is true for the offset and you are given SEEK_HOLE, return the end
423of the file. If the offset is i_size or greater return -ENXIO in either case.
424
425[mandatory]
426 If you have your own ->fsync() you must make sure to call
427filemap_write_and_wait_range() so that all dirty pages are synced out properly.
428You must also keep in mind that ->fsync() is not called with i_mutex held
429anymore, so if you require i_mutex locking you must make sure to take it and
430release it yourself.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index f48178024067..db3b1aba32a3 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -843,6 +843,7 @@ Provides counts of softirq handlers serviced since boot time, for each cpu.
843 TASKLET: 0 0 0 290 843 TASKLET: 0 0 0 290
844 SCHED: 27035 26983 26971 26746 844 SCHED: 27035 26983 26971 26746
845 HRTIMER: 0 0 0 0 845 HRTIMER: 0 0 0 0
846 RCU: 1678 1769 2178 2250
846 847
847 848
8481.3 IDE devices in /proc/ide 8491.3 IDE devices in /proc/ide
diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt
index 8e4fab639d9c..a0a61d2f389f 100644
--- a/Documentation/filesystems/ubifs.txt
+++ b/Documentation/filesystems/ubifs.txt
@@ -111,34 +111,6 @@ The following is an example of the kernel boot arguments to attach mtd0
111to UBI and mount volume "rootfs": 111to UBI and mount volume "rootfs":
112ubi.mtd=0 root=ubi0:rootfs rootfstype=ubifs 112ubi.mtd=0 root=ubi0:rootfs rootfstype=ubifs
113 113
114
115Module Parameters for Debugging
116===============================
117
118When UBIFS has been compiled with debugging enabled, there are 2 module
119parameters that are available to control aspects of testing and debugging.
120
121debug_chks Selects extra checks that UBIFS can do while running:
122
123 Check Flag value
124
125 General checks 1
126 Check Tree Node Cache (TNC) 2
127 Check indexing tree size 4
128 Check orphan area 8
129 Check old indexing tree 16
130 Check LEB properties (lprops) 32
131 Check leaf nodes and inodes 64
132
133debug_tsts Selects a mode of testing, as follows:
134
135 Test mode Flag value
136
137 Failure mode for recovery testing 4
138
139For example, set debug_chks to 3 to enable general and TNC checks.
140
141
142References 114References
143========== 115==========
144 116
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 88b9f5519af9..eff6617c9a0f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -229,6 +229,8 @@ struct super_operations {
229 229
230 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); 230 ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
231 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); 231 ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
232 int (*nr_cached_objects)(struct super_block *);
233 void (*free_cached_objects)(struct super_block *, int);
232}; 234};
233 235
234All methods are called without any locks being held, unless otherwise 236All methods are called without any locks being held, unless otherwise
@@ -301,6 +303,26 @@ or bottom half).
301 303
302 quota_write: called by the VFS to write to filesystem quota file. 304 quota_write: called by the VFS to write to filesystem quota file.
303 305
306 nr_cached_objects: called by the sb cache shrinking function for the
307 filesystem to return the number of freeable cached objects it contains.
308 Optional.
309
310 free_cached_objects: called by the sb cache shrinking function for the
311 filesystem to scan the number of objects indicated to try to free them.
312 Optional, but any filesystem implementing this method needs to also
313 implement ->nr_cached_objects for it to be called correctly.
314
315 We can't do anything with any errors that the filesystem might
316 have encountered, hence the void return type. This will never be called if
317 the VM is trying to reclaim under GFP_NOFS conditions, hence this
318 method does not need to handle that situation itself.
319
320 Implementations must include conditional reschedule calls inside any
321 scanning loop that is done. This allows the VFS to determine
322 appropriate scan batch sizes without having to worry about whether
323 implementations will cause holdoff problems due to large scan batch
324 sizes.
325
304Whoever sets up the inode is responsible for filling in the "i_op" field. This 326Whoever sets up the inode is responsible for filling in the "i_op" field. This
305is a pointer to a "struct inode_operations" which describes the methods that 327is a pointer to a "struct inode_operations" which describes the methods that
306can be performed on individual inodes. 328can be performed on individual inodes.
@@ -333,8 +355,8 @@ struct inode_operations {
333 void * (*follow_link) (struct dentry *, struct nameidata *); 355 void * (*follow_link) (struct dentry *, struct nameidata *);
334 void (*put_link) (struct dentry *, struct nameidata *, void *); 356 void (*put_link) (struct dentry *, struct nameidata *, void *);
335 void (*truncate) (struct inode *); 357 void (*truncate) (struct inode *);
336 int (*permission) (struct inode *, int, unsigned int); 358 int (*permission) (struct inode *, int);
337 int (*check_acl)(struct inode *, int, unsigned int); 359 int (*check_acl)(struct inode *, int);
338 int (*setattr) (struct dentry *, struct iattr *); 360 int (*setattr) (struct dentry *, struct iattr *);
339 int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); 361 int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
340 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); 362 int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
@@ -423,7 +445,7 @@ otherwise noted.
423 permission: called by the VFS to check for access rights on a POSIX-like 445 permission: called by the VFS to check for access rights on a POSIX-like
424 filesystem. 446 filesystem.
425 447
426 May be called in rcu-walk mode (flags & IPERM_FLAG_RCU). If in rcu-walk 448 May be called in rcu-walk mode (mask & MAY_NOT_BLOCK). If in rcu-walk
427 mode, the filesystem must check the permission without blocking or 449 mode, the filesystem must check the permission without blocking or
428 storing to the inode. 450 storing to the inode.
429 451
@@ -755,7 +777,7 @@ struct file_operations {
755 int (*open) (struct inode *, struct file *); 777 int (*open) (struct inode *, struct file *);
756 int (*flush) (struct file *); 778 int (*flush) (struct file *);
757 int (*release) (struct inode *, struct file *); 779 int (*release) (struct inode *, struct file *);
758 int (*fsync) (struct file *, int datasync); 780 int (*fsync) (struct file *, loff_t, loff_t, int datasync);
759 int (*aio_fsync) (struct kiocb *, int datasync); 781 int (*aio_fsync) (struct kiocb *, int datasync);
760 int (*fasync) (int, struct file *, int); 782 int (*fasync) (int, struct file *, int);
761 int (*lock) (struct file *, int, struct file_lock *); 783 int (*lock) (struct file *, int, struct file_lock *);
diff --git a/Documentation/hwmon/f71882fg b/Documentation/hwmon/f71882fg
index 84d2623810f3..de91c0db5846 100644
--- a/Documentation/hwmon/f71882fg
+++ b/Documentation/hwmon/f71882fg
@@ -22,6 +22,10 @@ Supported chips:
22 Prefix: 'f71869' 22 Prefix: 'f71869'
23 Addresses scanned: none, address read from Super I/O config space 23 Addresses scanned: none, address read from Super I/O config space
24 Datasheet: Available from the Fintek website 24 Datasheet: Available from the Fintek website
25 * Fintek F71869A
26 Prefix: 'f71869a'
27 Addresses scanned: none, address read from Super I/O config space
28 Datasheet: Not public
25 * Fintek F71882FG and F71883FG 29 * Fintek F71882FG and F71883FG
26 Prefix: 'f71882fg' 30 Prefix: 'f71882fg'
27 Addresses scanned: none, address read from Super I/O config space 31 Addresses scanned: none, address read from Super I/O config space
diff --git a/Documentation/hwmon/k10temp b/Documentation/hwmon/k10temp
index 0393c89277c0..a10f73624ad3 100644
--- a/Documentation/hwmon/k10temp
+++ b/Documentation/hwmon/k10temp
@@ -9,8 +9,8 @@ Supported chips:
9 Socket S1G3: Athlon II, Sempron, Turion II 9 Socket S1G3: Athlon II, Sempron, Turion II
10* AMD Family 11h processors: 10* AMD Family 11h processors:
11 Socket S1G2: Athlon (X2), Sempron (X2), Turion X2 (Ultra) 11 Socket S1G2: Athlon (X2), Sempron (X2), Turion X2 (Ultra)
12* AMD Family 12h processors: "Llano" 12* AMD Family 12h processors: "Llano" (E2/A4/A6/A8-Series)
13* AMD Family 14h processors: "Brazos" (C/E/G-Series) 13* AMD Family 14h processors: "Brazos" (C/E/G/Z-Series)
14* AMD Family 15h processors: "Bulldozer" 14* AMD Family 15h processors: "Bulldozer"
15 15
16 Prefix: 'k10temp' 16 Prefix: 'k10temp'
@@ -20,12 +20,16 @@ Supported chips:
20 http://support.amd.com/us/Processor_TechDocs/31116.pdf 20 http://support.amd.com/us/Processor_TechDocs/31116.pdf
21 BIOS and Kernel Developer's Guide (BKDG) for AMD Family 11h Processors: 21 BIOS and Kernel Developer's Guide (BKDG) for AMD Family 11h Processors:
22 http://support.amd.com/us/Processor_TechDocs/41256.pdf 22 http://support.amd.com/us/Processor_TechDocs/41256.pdf
23 BIOS and Kernel Developer's Guide (BKDG) for AMD Family 12h Processors:
24 http://support.amd.com/us/Processor_TechDocs/41131.pdf
23 BIOS and Kernel Developer's Guide (BKDG) for AMD Family 14h Models 00h-0Fh Processors: 25 BIOS and Kernel Developer's Guide (BKDG) for AMD Family 14h Models 00h-0Fh Processors:
24 http://support.amd.com/us/Processor_TechDocs/43170.pdf 26 http://support.amd.com/us/Processor_TechDocs/43170.pdf
25 Revision Guide for AMD Family 10h Processors: 27 Revision Guide for AMD Family 10h Processors:
26 http://support.amd.com/us/Processor_TechDocs/41322.pdf 28 http://support.amd.com/us/Processor_TechDocs/41322.pdf
27 Revision Guide for AMD Family 11h Processors: 29 Revision Guide for AMD Family 11h Processors:
28 http://support.amd.com/us/Processor_TechDocs/41788.pdf 30 http://support.amd.com/us/Processor_TechDocs/41788.pdf
31 Revision Guide for AMD Family 12h Processors:
32 http://support.amd.com/us/Processor_TechDocs/44739.pdf
29 Revision Guide for AMD Family 14h Models 00h-0Fh Processors: 33 Revision Guide for AMD Family 14h Models 00h-0Fh Processors:
30 http://support.amd.com/us/Processor_TechDocs/47534.pdf 34 http://support.amd.com/us/Processor_TechDocs/47534.pdf
31 AMD Family 11h Processor Power and Thermal Data Sheet for Notebooks: 35 AMD Family 11h Processor Power and Thermal Data Sheet for Notebooks:
diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist
index 2df4576f1173..cb5507b1ac81 100644
--- a/Documentation/ja_JP/SubmitChecklist
+++ b/Documentation/ja_JP/SubmitChecklist
@@ -68,7 +68,7 @@ Linux カーネルパッチ投稿者向けチェックリスト
68 68
6912: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, 6912: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB,
70 CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK, 70 CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK,
71 CONFIG_DEBUG_SPINLOCK_SLEEP これら全てを同時に有効にして動作確認を 71 CONFIG_DEBUG_ATOMIC_SLEEP これら全てを同時に有効にして動作確認を
72 行ってください。 72 行ってください。
73 73
7413: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で 7413: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d9a203b058f1..aa47be71df4c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2015,6 +2015,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2015 the default. 2015 the default.
2016 off: Turn ECRC off 2016 off: Turn ECRC off
2017 on: Turn ECRC on. 2017 on: Turn ECRC on.
2018 realloc reallocate PCI resources if allocations done by BIOS
2019 are erroneous.
2018 2020
2019 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power 2021 pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power
2020 Management. 2022 Management.
@@ -2598,6 +2600,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2598 unlock ejectable media); 2600 unlock ejectable media);
2599 m = MAX_SECTORS_64 (don't transfer more 2601 m = MAX_SECTORS_64 (don't transfer more
2600 than 64 sectors = 32 KB at a time); 2602 than 64 sectors = 32 KB at a time);
2603 n = INITIAL_READ10 (force a retry of the
2604 initial READ(10) command);
2601 o = CAPACITY_OK (accept the capacity 2605 o = CAPACITY_OK (accept the capacity
2602 reported by the device); 2606 reported by the device);
2603 r = IGNORE_RESIDUE (the device reports 2607 r = IGNORE_RESIDUE (the device reports
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt
index 090e6ee04536..51063e681ca4 100644
--- a/Documentation/kmemleak.txt
+++ b/Documentation/kmemleak.txt
@@ -11,7 +11,9 @@ with the difference that the orphan objects are not freed but only
11reported via /sys/kernel/debug/kmemleak. A similar method is used by the 11reported via /sys/kernel/debug/kmemleak. A similar method is used by the
12Valgrind tool (memcheck --leak-check) to detect the memory leaks in 12Valgrind tool (memcheck --leak-check) to detect the memory leaks in
13user-space applications. 13user-space applications.
14Kmemleak is supported on x86, arm, powerpc, sparc, sh, microblaze and tile. 14
15Please check DEBUG_KMEMLEAK dependencies in lib/Kconfig.debug for supported
16architectures.
15 17
16Usage 18Usage
17----- 19-----
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index 1565eefd6fd5..61815483efa3 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -534,6 +534,8 @@ Events that are never propagated by the driver:
5340x2404 System is waking up from hibernation to undock 5340x2404 System is waking up from hibernation to undock
5350x2405 System is waking up from hibernation to eject bay 5350x2405 System is waking up from hibernation to eject bay
5360x5010 Brightness level changed/control event 5360x5010 Brightness level changed/control event
5370x6000 KEYBOARD: Numlock key pressed
5380x6005 KEYBOARD: Fn key pressed (TO BE VERIFIED)
537 539
538Events that are propagated by the driver to userspace: 540Events that are propagated by the driver to userspace:
539 541
@@ -545,6 +547,8 @@ Events that are propagated by the driver to userspace:
5450x3006 Bay hotplug request (hint to power up SATA link when 5470x3006 Bay hotplug request (hint to power up SATA link when
546 the optical drive tray is ejected) 548 the optical drive tray is ejected)
5470x4003 Undocked (see 0x2x04), can sleep again 5490x4003 Undocked (see 0x2x04), can sleep again
5500x4010 Docked into hotplug port replicator (non-ACPI dock)
5510x4011 Undocked from hotplug port replicator (non-ACPI dock)
5480x500B Tablet pen inserted into its storage bay 5520x500B Tablet pen inserted into its storage bay
5490x500C Tablet pen removed from its storage bay 5530x500C Tablet pen removed from its storage bay
5500x6011 ALARM: battery is too hot 5540x6011 ALARM: battery is too hot
@@ -552,6 +556,7 @@ Events that are propagated by the driver to userspace:
5520x6021 ALARM: a sensor is too hot 5560x6021 ALARM: a sensor is too hot
5530x6022 ALARM: a sensor is extremely hot 5570x6022 ALARM: a sensor is extremely hot
5540x6030 System thermal table changed 5580x6030 System thermal table changed
5590x6040 Nvidia Optimus/AC adapter related (TO BE VERIFIED)
555 560
556Battery nearly empty alarms are a last resort attempt to get the 561Battery nearly empty alarms are a last resort attempt to get the
557operating system to hibernate or shutdown cleanly (0x2313), or shutdown 562operating system to hibernate or shutdown cleanly (0x2313), or shutdown
diff --git a/Documentation/md.txt b/Documentation/md.txt
index 2366b1c8cf19..f0eee83ff78a 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -555,7 +555,7 @@ also have
555 sync_min 555 sync_min
556 sync_max 556 sync_max
557 The two values, given as numbers of sectors, indicate a range 557 The two values, given as numbers of sectors, indicate a range
558 withing the array where 'check'/'repair' will operate. Must be 558 within the array where 'check'/'repair' will operate. Must be
559 a multiple of chunk_size. When it reaches "sync_max" it will 559 a multiple of chunk_size. When it reaches "sync_max" it will
560 pause, rather than complete. 560 pause, rather than complete.
561 You can use 'select' or 'poll' on "sync_completed" to wait for 561 You can use 'select' or 'poll' on "sync_completed" to wait for
diff --git a/Documentation/mmc/00-INDEX b/Documentation/mmc/00-INDEX
index 93dd7a714075..a9ba6720ffdf 100644
--- a/Documentation/mmc/00-INDEX
+++ b/Documentation/mmc/00-INDEX
@@ -4,3 +4,5 @@ mmc-dev-attrs.txt
4 - info on SD and MMC device attributes 4 - info on SD and MMC device attributes
5mmc-dev-parts.txt 5mmc-dev-parts.txt
6 - info on SD and MMC device partitions 6 - info on SD and MMC device partitions
7mmc-async-req.txt
8 - info on mmc asynchronous requests
diff --git a/Documentation/mmc/mmc-async-req.txt b/Documentation/mmc/mmc-async-req.txt
new file mode 100644
index 000000000000..ae1907b10e4a
--- /dev/null
+++ b/Documentation/mmc/mmc-async-req.txt
@@ -0,0 +1,87 @@
1Rationale
2=========
3
4How significant is the cache maintenance overhead?
5It depends. Fast eMMC and multiple cache levels with speculative cache
6pre-fetch make the cache overhead relatively significant. If the DMA
7preparations for the next request are done in parallel with the current
8transfer, the DMA preparation overhead would not affect the MMC performance.
9The intention of non-blocking (asynchronous) MMC requests is to minimize the
10time between when an MMC request ends and another MMC request begins.
11Using mmc_wait_for_req(), the MMC controller is idle while dma_map_sg and
12dma_unmap_sg are processing. Using non-blocking MMC requests makes it
13possible to prepare the caches for next job in parallel with an active
14MMC request.
15
16MMC block driver
17================
18
19The mmc_blk_issue_rw_rq() in the MMC block driver is made non-blocking.
20The increase in throughput is proportional to the time it takes to
21prepare (major part of preparations are dma_map_sg() and dma_unmap_sg())
22a request and how fast the memory is. The faster the MMC/SD is the
23more significant the prepare request time becomes. Roughly the expected
24performance gain is 5% for large writes and 10% on large reads on a L2 cache
25platform. In power save mode, when clocks run on a lower frequency, the DMA
26preparation may cost even more. As long as these slower preparations are run
27in parallel with the transfer performance won't be affected.
28
29Details on measurements from IOZone and mmc_test
30================================================
31
32https://wiki.linaro.org/WorkingGroups/Kernel/Specs/StoragePerfMMC-async-req
33
34MMC core API extension
35======================
36
37There is one new public function mmc_start_req().
38It starts a new MMC command request for a host. The function isn't
39truly non-blocking. If there is an ongoing async request it waits
40for completion of that request and starts the new one and returns. It
41doesn't wait for the new request to complete. If there is no ongoing
42request it starts the new request and returns immediately.
43
44MMC host extensions
45===================
46
47There are two optional members in the mmc_host_ops -- pre_req() and
48post_req() -- that the host driver may implement in order to move work
49to before and after the actual mmc_host_ops.request() function is called.
50In the DMA case pre_req() may do dma_map_sg() and prepare the DMA
51descriptor, and post_req() runs the dma_unmap_sg().
52
53Optimize for the first request
54==============================
55
56The first request in a series of requests can't be prepared in parallel
57with the previous transfer, since there is no previous request.
58The argument is_first_req in pre_req() indicates that there is no previous
59request. The host driver may optimize for this scenario to minimize
60the performance loss. A way to optimize for this is to split the current
61request in two chunks, prepare the first chunk and start the request,
62and finally prepare the second chunk and start the transfer.
63
64Pseudocode to handle is_first_req scenario with minimal prepare overhead:
65
66if (is_first_req && req->size > threshold)
67 /* start MMC transfer for the complete transfer size */
68 mmc_start_command(MMC_CMD_TRANSFER_FULL_SIZE);
69
70 /*
71 * Begin to prepare DMA while cmd is being processed by MMC.
72 * The first chunk of the request should take the same time
73 * to prepare as the "MMC process command time".
74 * If prepare time exceeds MMC cmd time
75 * the transfer is delayed, guesstimate max 4k as first chunk size.
76 */
77 prepare_1st_chunk_for_dma(req);
78 /* flush pending desc to the DMAC (dmaengine.h) */
79 dma_issue_pending(req->dma_desc);
80
81 prepare_2nd_chunk_for_dma(req);
82 /*
83 * The second issue_pending should be called before MMC runs out
84 * of the first chunk. If the MMC runs out of the first data chunk
85 * before this call, the transfer is delayed.
86 */
87 dma_issue_pending(req->dma_desc);
diff --git a/Documentation/networking/ifenslave.c b/Documentation/networking/ifenslave.c
index 2bac9618c345..65968fbf1e49 100644
--- a/Documentation/networking/ifenslave.c
+++ b/Documentation/networking/ifenslave.c
@@ -260,7 +260,7 @@ int main(int argc, char *argv[])
260 case 'V': opt_V++; exclusive++; break; 260 case 'V': opt_V++; exclusive++; break;
261 261
262 case '?': 262 case '?':
263 fprintf(stderr, usage_msg); 263 fprintf(stderr, "%s", usage_msg);
264 res = 2; 264 res = 2;
265 goto out; 265 goto out;
266 } 266 }
@@ -268,13 +268,13 @@ int main(int argc, char *argv[])
268 268
269 /* options check */ 269 /* options check */
270 if (exclusive > 1) { 270 if (exclusive > 1) {
271 fprintf(stderr, usage_msg); 271 fprintf(stderr, "%s", usage_msg);
272 res = 2; 272 res = 2;
273 goto out; 273 goto out;
274 } 274 }
275 275
276 if (opt_v || opt_V) { 276 if (opt_v || opt_V) {
277 printf(version); 277 printf("%s", version);
278 if (opt_V) { 278 if (opt_V) {
279 res = 0; 279 res = 0;
280 goto out; 280 goto out;
@@ -282,14 +282,14 @@ int main(int argc, char *argv[])
282 } 282 }
283 283
284 if (opt_u) { 284 if (opt_u) {
285 printf(usage_msg); 285 printf("%s", usage_msg);
286 res = 0; 286 res = 0;
287 goto out; 287 goto out;
288 } 288 }
289 289
290 if (opt_h) { 290 if (opt_h) {
291 printf(usage_msg); 291 printf("%s", usage_msg);
292 printf(help_msg); 292 printf("%s", help_msg);
293 res = 0; 293 res = 0;
294 goto out; 294 goto out;
295 } 295 }
@@ -309,7 +309,7 @@ int main(int argc, char *argv[])
309 goto out; 309 goto out;
310 } else { 310 } else {
311 /* Just show usage */ 311 /* Just show usage */
312 fprintf(stderr, usage_msg); 312 fprintf(stderr, "%s", usage_msg);
313 res = 2; 313 res = 2;
314 goto out; 314 goto out;
315 } 315 }
@@ -320,7 +320,7 @@ int main(int argc, char *argv[])
320 master_ifname = *spp++; 320 master_ifname = *spp++;
321 321
322 if (master_ifname == NULL) { 322 if (master_ifname == NULL) {
323 fprintf(stderr, usage_msg); 323 fprintf(stderr, "%s", usage_msg);
324 res = 2; 324 res = 2;
325 goto out; 325 goto out;
326 } 326 }
@@ -339,7 +339,7 @@ int main(int argc, char *argv[])
339 339
340 if (slave_ifname == NULL) { 340 if (slave_ifname == NULL) {
341 if (opt_d || opt_c) { 341 if (opt_d || opt_c) {
342 fprintf(stderr, usage_msg); 342 fprintf(stderr, "%s", usage_msg);
343 res = 2; 343 res = 2;
344 goto out; 344 goto out;
345 } 345 }
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index d3d653a5f9b9..db2a4067013c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -106,16 +106,6 @@ inet_peer_maxttl - INTEGER
106 when the number of entries in the pool is very small). 106 when the number of entries in the pool is very small).
107 Measured in seconds. 107 Measured in seconds.
108 108
109inet_peer_gc_mintime - INTEGER
110 Minimum interval between garbage collection passes. This interval is
111 in effect under high memory pressure on the pool.
112 Measured in seconds.
113
114inet_peer_gc_maxtime - INTEGER
115 Minimum interval between garbage collection passes. This interval is
116 in effect under low (or absent) memory pressure on the pool.
117 Measured in seconds.
118
119TCP variables: 109TCP variables:
120 110
121somaxconn - INTEGER 111somaxconn - INTEGER
@@ -346,7 +336,7 @@ tcp_orphan_retries - INTEGER
346 when RTO retransmissions remain unacknowledged. 336 when RTO retransmissions remain unacknowledged.
347 See tcp_retries2 for more details. 337 See tcp_retries2 for more details.
348 338
349 The default value is 7. 339 The default value is 8.
350 If your machine is a loaded WEB server, 340 If your machine is a loaded WEB server,
351 you should think about lowering this value, such sockets 341 you should think about lowering this value, such sockets
352 may consume significant resources. Cf. tcp_max_orphans. 342 may consume significant resources. Cf. tcp_max_orphans.
@@ -394,7 +384,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
394 min: Minimal size of receive buffer used by TCP sockets. 384 min: Minimal size of receive buffer used by TCP sockets.
395 It is guaranteed to each TCP socket, even under moderate memory 385 It is guaranteed to each TCP socket, even under moderate memory
396 pressure. 386 pressure.
397 Default: 8K 387 Default: 1 page
398 388
399 default: initial size of receive buffer used by TCP sockets. 389 default: initial size of receive buffer used by TCP sockets.
400 This value overrides net.core.rmem_default used by other protocols. 390 This value overrides net.core.rmem_default used by other protocols.
@@ -483,7 +473,7 @@ tcp_window_scaling - BOOLEAN
483tcp_wmem - vector of 3 INTEGERs: min, default, max 473tcp_wmem - vector of 3 INTEGERs: min, default, max
484 min: Amount of memory reserved for send buffers for TCP sockets. 474 min: Amount of memory reserved for send buffers for TCP sockets.
485 Each TCP socket has rights to use it due to fact of its birth. 475 Each TCP socket has rights to use it due to fact of its birth.
486 Default: 4K 476 Default: 1 page
487 477
488 default: initial size of send buffer used by TCP sockets. This 478 default: initial size of send buffer used by TCP sockets. This
489 value overrides net.core.wmem_default used by other protocols. 479 value overrides net.core.wmem_default used by other protocols.
@@ -553,13 +543,13 @@ udp_rmem_min - INTEGER
553 Minimal size of receive buffer used by UDP sockets in moderation. 543 Minimal size of receive buffer used by UDP sockets in moderation.
554 Each UDP socket is able to use the size for receiving data, even if 544 Each UDP socket is able to use the size for receiving data, even if
555 total pages of UDP sockets exceed udp_mem pressure. The unit is byte. 545 total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
556 Default: 4096 546 Default: 1 page
557 547
558udp_wmem_min - INTEGER 548udp_wmem_min - INTEGER
559 Minimal size of send buffer used by UDP sockets in moderation. 549 Minimal size of send buffer used by UDP sockets in moderation.
560 Each UDP socket is able to use the size for sending data, even if 550 Each UDP socket is able to use the size for sending data, even if
561 total pages of UDP sockets exceed udp_mem pressure. The unit is byte. 551 total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
562 Default: 4096 552 Default: 1 page
563 553
564CIPSOv4 Variables: 554CIPSOv4 Variables:
565 555
@@ -1465,10 +1455,17 @@ sctp_mem - vector of 3 INTEGERs: min, pressure, max
1465 Default is calculated at boot time from amount of available memory. 1455 Default is calculated at boot time from amount of available memory.
1466 1456
1467sctp_rmem - vector of 3 INTEGERs: min, default, max 1457sctp_rmem - vector of 3 INTEGERs: min, default, max
1468 See tcp_rmem for a description. 1458 Only the first value ("min") is used, "default" and "max" are
1459 ignored.
1460
1461 min: Minimal size of receive buffer used by SCTP socket.
1462 It is guaranteed to each SCTP socket (but not association) even
1463 under moderate memory pressure.
1464
1465 Default: 1 page
1469 1466
1470sctp_wmem - vector of 3 INTEGERs: min, default, max 1467sctp_wmem - vector of 3 INTEGERs: min, default, max
1471 See tcp_wmem for a description. 1468 Currently this tunable has no effect.
1472 1469
1473addr_scope_policy - INTEGER 1470addr_scope_policy - INTEGER
1474 Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00 1471 Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00
diff --git a/Documentation/networking/netdev-features.txt b/Documentation/networking/netdev-features.txt
new file mode 100644
index 000000000000..4b1c0dcef84c
--- /dev/null
+++ b/Documentation/networking/netdev-features.txt
@@ -0,0 +1,154 @@
1Netdev features mess and how to get out from it alive
2=====================================================
3
4Author:
5 Michał Mirosław <mirq-linux@rere.qmqm.pl>
6
7
8
9 Part I: Feature sets
10======================
11
12Long gone are the days when a network card would just take and give packets
13verbatim. Today's devices add multiple features and bugs (read: offloads)
14that relieve an OS of various tasks like generating and checking checksums,
15splitting packets, classifying them. Those capabilities and their state
16are commonly referred to as netdev features in Linux kernel world.
17
18There are currently three sets of features relevant to the driver, and
19one used internally by network core:
20
21 1. netdev->hw_features set contains features whose state may possibly
22 be changed (enabled or disabled) for a particular device by user's
23 request. This set should be initialized in ndo_init callback and not
24 changed later.
25
26 2. netdev->features set contains features which are currently enabled
27 for a device. This should be changed only by network core or in
28 error paths of ndo_set_features callback.
29
30 3. netdev->vlan_features set contains features whose state is inherited
31 by child VLAN devices (limits netdev->features set). This is currently
32 used for all VLAN devices whether tags are stripped or inserted in
33 hardware or software.
34
35 4. netdev->wanted_features set contains feature set requested by user.
36 This set is filtered by ndo_fix_features callback whenever it or
37 some device-specific conditions change. This set is internal to
38 networking core and should not be referenced in drivers.
39
40
41
42 Part II: Controlling enabled features
43=======================================
44
45When current feature set (netdev->features) is to be changed, new set
46is calculated and filtered by calling ndo_fix_features callback
47and netdev_fix_features(). If the resulting set differs from current
48set, it is passed to ndo_set_features callback and (if the callback
49returns success) replaces value stored in netdev->features.
50NETDEV_FEAT_CHANGE notification is issued after that whenever current
51set might have changed.
52
53The following events trigger recalculation:
54 1. device's registration, after ndo_init returned success
55 2. user requested changes in features state
56 3. netdev_update_features() is called
57
58ndo_*_features callbacks are called with rtnl_lock held. Missing callbacks
59are treated as always returning success.
60
61A driver that wants to trigger recalculation must do so by calling
62netdev_update_features() while holding rtnl_lock. This should not be done
63from ndo_*_features callbacks. netdev->features should not be modified by
64driver except by means of ndo_fix_features callback.
65
66
67
68 Part III: Implementation hints
69================================
70
71 * ndo_fix_features:
72
73All dependencies between features should be resolved here. The resulting
74set can be reduced further by networking core imposed limitations (as coded
75in netdev_fix_features()). For this reason it is safer to disable a feature
76when its dependencies are not met instead of forcing the dependency on.
77
78This callback should not modify hardware nor driver state (should be
79stateless). It can be called multiple times between successive
80ndo_set_features calls.
81
82Callback must not alter features contained in NETIF_F_SOFT_FEATURES or
83NETIF_F_NEVER_CHANGE sets. The exception is NETIF_F_VLAN_CHALLENGED but
84care must be taken as the change won't affect already configured VLANs.
85
86 * ndo_set_features:
87
88Hardware should be reconfigured to match passed feature set. The set
89should not be altered unless some error condition happens that can't
90be reliably detected in ndo_fix_features. In this case, the callback
91should update netdev->features to match resulting hardware state.
92Errors returned are not (and cannot be) propagated anywhere except dmesg.
93(Note: successful return is zero, >0 means silent error.)
94
95
96
97 Part IV: Features
98===================
99
100For current list of features, see include/linux/netdev_features.h.
101This section describes semantics of some of them.
102
103 * Transmit checksumming
104
105For complete description, see comments near the top of include/linux/skbuff.h.
106
107Note: NETIF_F_HW_CSUM is a superset of NETIF_F_IP_CSUM + NETIF_F_IPV6_CSUM.
108It means that device can fill TCP/UDP-like checksum anywhere in the packets
109whatever headers there might be.
110
111 * Transmit TCP segmentation offload
112
113NETIF_F_TSO_ECN means that hardware can properly split packets with CWR bit
114set, be it TCPv4 (when NETIF_F_TSO is enabled) or TCPv6 (NETIF_F_TSO6).
115
116 * Transmit DMA from high memory
117
118On platforms where this is relevant, NETIF_F_HIGHDMA signals that
119ndo_start_xmit can handle skbs with frags in high memory.
120
121 * Transmit scatter-gather
122
123Those features say that ndo_start_xmit can handle fragmented skbs:
124NETIF_F_SG --- paged skbs (skb_shinfo()->frags), NETIF_F_FRAGLIST ---
125chained skbs (skb->next/prev list).
126
127 * Software features
128
129Features contained in NETIF_F_SOFT_FEATURES are features of networking
130stack. Driver should not change behaviour based on them.
131
132 * LLTX driver (deprecated for hardware drivers)
133
134NETIF_F_LLTX should be set in drivers that implement their own locking in
135transmit path or don't need locking at all (e.g. software tunnels).
136In ndo_start_xmit, it is recommended to use a try_lock and return
137NETDEV_TX_LOCKED when the spin lock fails. The locking should also properly
138protect against other callbacks (the rules you need to find out).
139
140Don't use it for new drivers.
141
142 * netns-local device
143
144NETIF_F_NETNS_LOCAL is set for devices that are not allowed to move between
145network namespaces (e.g. loopback).
146
147Don't use it in drivers.
148
149 * VLAN challenged
150
151NETIF_F_VLAN_CHALLENGED should be set for devices which can't cope with VLAN
152headers. Some drivers set this because the cards can't handle the bigger MTU.
153[FIXME: Those cases could be fixed in VLAN code by allowing only reduced-MTU
154VLANs. This may be not useful, though.]
diff --git a/Documentation/networking/nfc.txt b/Documentation/networking/nfc.txt
new file mode 100644
index 000000000000..b24c29bdae27
--- /dev/null
+++ b/Documentation/networking/nfc.txt
@@ -0,0 +1,128 @@
1Linux NFC subsystem
2===================
3
4The Near Field Communication (NFC) subsystem is required to standardize the
5NFC device drivers development and to create a unified userspace interface.
6
7This document covers the architecture overview, the device driver interface
8description and the userspace interface description.
9
10Architecture overview
11---------------------
12
13The NFC subsystem is responsible for:
14 - NFC adapters management;
15 - Polling for targets;
16 - Low-level data exchange;
17
18The subsystem is divided in some parts. The 'core' is responsible for
19providing the device driver interface. On the other side, it is also
20responsible for providing an interface to control operations and low-level
21data exchange.
22
23The control operations are available to userspace via generic netlink.
24
25The low-level data exchange interface is provided by the new socket family
26PF_NFC. The NFC_SOCKPROTO_RAW performs raw communication with NFC targets.
27
28
29 +--------------------------------------+
30 | USER SPACE |
31 +--------------------------------------+
32 ^ ^
33 | low-level | control
34 | data exchange | operations
35 | |
36 | v
37 | +-----------+
38 | AF_NFC | netlink |
39 | socket +-----------+
40 | raw ^
41 | |
42 v v
43 +---------+ +-----------+
44 | rawsock | <--------> | core |
45 +---------+ +-----------+
46 ^
47 |
48 v
49 +-----------+
50 | driver |
51 +-----------+
52
53Device Driver Interface
54-----------------------
55
56When registering on the NFC subsystem, the device driver must inform the core
57of the set of supported NFC protocols and the set of ops callbacks. The ops
58callbacks that must be implemented are the following:
59
60* start_poll - setup the device to poll for targets
61* stop_poll - stop an in-progress polling operation
62* activate_target - select and initialize one of the targets found
63* deactivate_target - deselect and deinitialize the selected target
64* data_exchange - send data and receive the response (transceive operation)
65
66Userspace interface
67--------------------
68
69The userspace interface is divided in control operations and low-level data
70exchange operation.
71
72CONTROL OPERATIONS:
73
74Generic netlink is used to implement the interface to the control operations.
75The operations are composed of commands and events, all listed below:
76
77* NFC_CMD_GET_DEVICE - get specific device info or dump the device list
78* NFC_CMD_START_POLL - set up a specific device to poll for targets
79* NFC_CMD_STOP_POLL - stop the polling operation in a specific device
80* NFC_CMD_GET_TARGET - dump the list of targets found by a specific device
81
82* NFC_EVENT_DEVICE_ADDED - reports an NFC device addition
83* NFC_EVENT_DEVICE_REMOVED - reports an NFC device removal
84* NFC_EVENT_TARGETS_FOUND - reports START_POLL results when 1 or more targets
85are found
86
87The user must call START_POLL to poll for NFC targets, passing the desired NFC
88protocols through NFC_ATTR_PROTOCOLS attribute. The device remains in polling
89state until it finds any target. However, the user can stop the polling
90operation by calling STOP_POLL command. In this case, it will be checked if
91the requester of STOP_POLL is the same as that of START_POLL.
92
93If the polling operation finds one or more targets, the event TARGETS_FOUND is
94sent (including the device id). The user must call GET_TARGET to get the list of
95all targets found by such device. Each reply message has target attributes with
96relevant information such as the supported NFC protocols.
97
98All polling operations requested through one netlink socket are stopped when
99it's closed.
100
101LOW-LEVEL DATA EXCHANGE:
102
103The userspace must use PF_NFC sockets to perform any data communication with
104targets. All NFC sockets use AF_NFC:
105
106struct sockaddr_nfc {
107 sa_family_t sa_family;
108 __u32 dev_idx;
109 __u32 target_idx;
110 __u32 nfc_protocol;
111};
112
113To establish a connection with one target, the user must create an
114NFC_SOCKPROTO_RAW socket and call the 'connect' syscall with the sockaddr_nfc
115struct correctly filled. All information comes from NFC_EVENT_TARGETS_FOUND
116netlink event. As a target can support more than one NFC protocol, the user
117must inform which protocol it wants to use.
118
119Internally, 'connect' will result in an activate_target call to the driver.
120When the socket is closed, the target is deactivated.
121
122The data format exchanged through the sockets is NFC protocol dependent. For
123instance, when communicating with MIFARE tags, the data exchanged are MIFARE
124commands and their responses.
125
126The first received package is the response to the first sent package and so
127on. In order to allow valid "empty" responses, every data received has a NULL
128header of 1 byte.
diff --git a/Documentation/networking/stmmac.txt b/Documentation/networking/stmmac.txt
index 80a7a3454902..57a24108b845 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -7,7 +7,7 @@ This is the driver for the MAC 10/100/1000 on-chip Ethernet controllers
7(Synopsys IP blocks); it has been fully tested on STLinux platforms. 7(Synopsys IP blocks); it has been fully tested on STLinux platforms.
8 8
9Currently this network device driver is for all STM embedded MAC/GMAC 9Currently this network device driver is for all STM embedded MAC/GMAC
10(7xxx SoCs). Other platforms start using it i.e. ARM SPEAr. 10(i.e. 7xxx/5xxx SoCs) and it's known working on other platforms i.e. ARM SPEAr.
11 11
12DWC Ether MAC 10/100/1000 Universal version 3.41a and DWC Ether MAC 10/100 12DWC Ether MAC 10/100/1000 Universal version 3.41a and DWC Ether MAC 10/100
13Universal version 4.0 have been used for developing the first code 13Universal version 4.0 have been used for developing the first code
@@ -71,7 +71,7 @@ Several performance tests on STM platforms showed this optimisation allows to sp
71the CPU while having the maximum throughput. 71the CPU while having the maximum throughput.
72 72
734.4) WOL 734.4) WOL
74Wake up on Lan feature through Magic Frame is only supported for the GMAC 74Wake up on Lan feature through Magic and Unicast frames are supported for the GMAC
75core. 75core.
76 76
774.5) DMA descriptors 774.5) DMA descriptors
@@ -91,11 +91,15 @@ LRO is not supported.
91The driver is compatible with PAL to work with PHY and GPHY devices. 91The driver is compatible with PAL to work with PHY and GPHY devices.
92 92
934.9) Platform information 934.9) Platform information
94Several information came from the platform; please refer to the 94Several driver's information can be passed through the platform
95driver's Header file in include/linux directory. 95These are included in the include/linux/stmmac.h header file
96and detailed below as well:
96 97
97struct plat_stmmacenet_data { 98 struct plat_stmmacenet_data {
98 int bus_id; 99 int bus_id;
100 int phy_addr;
101 int interface;
102 struct stmmac_mdio_bus_data *mdio_bus_data;
99 int pbl; 103 int pbl;
100 int clk_csr; 104 int clk_csr;
101 int has_gmac; 105 int has_gmac;
@@ -103,67 +107,135 @@ struct plat_stmmacenet_data {
103 int tx_coe; 107 int tx_coe;
104 int bugged_jumbo; 108 int bugged_jumbo;
105 int pmt; 109 int pmt;
106 void (*fix_mac_speed)(void *priv, unsigned int speed); 110 int force_sf_dma_mode;
107 void (*bus_setup)(unsigned long ioaddr); 111 void (*fix_mac_speed)(void *priv, unsigned int speed);
108#ifdef CONFIG_STM_DRIVERS 112 void (*bus_setup)(void __iomem *ioaddr);
109 struct stm_pad_config *pad_config; 113 int (*init)(struct platform_device *pdev);
110#endif 114 void (*exit)(struct platform_device *pdev);
111 void *bsp_priv; 115 void *bsp_priv;
112}; 116 };
113 117
114Where: 118Where:
115- pbl (Programmable Burst Length) is maximum number of 119 o bus_id: bus identifier.
116 beats to be transferred in one DMA transaction. 120 o phy_addr: the physical address can be passed from the platform.
117 GMAC also enables the 4xPBL by default. 121 If it is set to -1 the driver will automatically
118- fix_mac_speed and bus_setup are used to configure internal target 122 detect it at run-time by probing all the 32 addresses.
119 registers (on STM platforms); 123 o interface: PHY device's interface.
120- has_gmac: GMAC core is on board (get it at run-time in the next step); 124 o mdio_bus_data: specific platform fields for the MDIO bus.
121- bus_id: bus identifier. 125 o pbl: the Programmable Burst Length is maximum number of beats to
122- tx_coe: core is able to perform the tx csum in HW. 126 be transferred in one DMA transaction.
123- enh_desc: if sets the MAC will use the enhanced descriptor structure. 127 GMAC also enables the 4xPBL by default.
124- clk_csr: CSR Clock range selection. 128 o clk_csr: CSR Clock range selection.
125- bugged_jumbo: some HWs are not able to perform the csum in HW for 129 o has_gmac: uses the GMAC core.
126 over-sized frames due to limited buffer sizes. Setting this 130 o enh_desc: if sets the MAC will use the enhanced descriptor structure.
127 flag the csum will be done in SW on JUMBO frames. 131 o tx_coe: core is able to perform the tx csum in HW.
128 132 o bugged_jumbo: some HWs are not able to perform the csum in HW for
129struct plat_stmmacphy_data { 133 over-sized frames due to limited buffer sizes.
130 int bus_id; 134 Setting this flag the csum will be done in SW on
131 int phy_addr; 135 JUMBO frames.
132 unsigned int phy_mask; 136 o pmt: core has the embedded power module (optional).
133 int interface; 137 o force_sf_dma_mode: force DMA to use the Store and Forward mode
134 int (*phy_reset)(void *priv); 138 instead of the Threshold.
135 void *priv; 139 o fix_mac_speed: this callback is used for modifying some syscfg registers
136}; 140 (on ST SoCs) according to the link speed negotiated by the
141 physical layer.
142 o bus_setup: perform HW setup of the bus. For example, on some ST platforms
143 this field is used to configure the AMBA bridge to generate more
144 efficient STBus traffic.
145 o init/exit: callbacks used for calling a custom initialisation;
146 this is sometimes necessary on some platforms (e.g. ST boxes)
147 where the HW needs to have set some PIO lines or system cfg
148 registers.
149 o custom_cfg: this is a custom configuration that can be passed while
150 initialising the resources.
151
152Then we have:
153
154 struct stmmac_mdio_bus_data {
155 int bus_id;
156 int (*phy_reset)(void *priv);
157 unsigned int phy_mask;
158 int *irqs;
159 int probed_phy_irq;
160 };
137 161
138Where: 162Where:
139- bus_id: bus identifier; 163 o bus_id: bus identifier;
140- phy_addr: physical address used for the attached phy device; 164 o phy_reset: hook to reset the phy device attached to the bus.
141 set it to -1 to get it at run-time; 165 o phy_mask: phy mask passed when register the MDIO bus within the driver.
142- interface: physical MII interface mode; 166 o irqs: list of IRQs, one per PHY.
143- phy_reset: hook to reset HW function. 167 o probed_phy_irq: if irqs is NULL, use this for probed PHY.
144 168
145SOURCES: 169Below is an example of how the structures above are used on ST platforms.
146- Kconfig 170
147- Makefile 171 static struct plat_stmmacenet_data stxYYY_ethernet_platform_data = {
148- stmmac_main.c: main network device driver; 172 .pbl = 32,
149- stmmac_mdio.c: mdio functions; 173 .has_gmac = 0,
150- stmmac_ethtool.c: ethtool support; 174 .enh_desc = 0,
151- stmmac_timer.[ch]: timer code used for mitigating the driver dma interrupts 175 .fix_mac_speed = stxYYY_ethernet_fix_mac_speed,
152 Only tested on ST40 platforms based. 176 |
153- stmmac.h: private driver structure; 177 |-> to write an internal syscfg
154- common.h: common definitions and VFTs; 178 | on this platform when the
155- descs.h: descriptor structure definitions; 179 | link speed changes from 10 to
156- dwmac1000_core.c: GMAC core functions; 180 | 100 and vice versa
157- dwmac1000_dma.c: dma functions for the GMAC chip; 181 .init = &stmmac_claim_resource,
158- dwmac1000.h: specific header file for the GMAC; 182 |
159- dwmac100_core: MAC 100 core and dma code; 183 |-> On ST SoC this calls own "PAD"
160- dwmac100_dma.c: dma funtions for the MAC chip; 184 | manager framework to claim
161- dwmac1000.h: specific header file for the MAC; 185 | all the resources necessary
162- dwmac_lib.c: generic DMA functions shared among chips 186 | (GPIO ...). The .custom_cfg field
163- enh_desc.c: functions for handling enhanced descriptors 187 | is used to pass a custom config.
164- norm_desc.c: functions for handling normal descriptors 188};
165 189
166TODO: 190Below the usage of the stmmac_mdio_bus_data: on this SoC, in fact,
167- XGMAC controller is not supported. 191there are two MAC cores: one MAC is for MDIO Bus/PHY emulation
168- Review the timer optimisation code to use an embedded device that seems to be 192with fixed_link support.
193
194static struct stmmac_mdio_bus_data stmmac1_mdio_bus = {
195 .bus_id = 1,
196 |
197 |-> phy device on the bus_id 1
198 .phy_reset = phy_reset;
199 |
200 |-> function to provide the phy_reset on this board
201 .phy_mask = 0,
202};
203
204static struct fixed_phy_status stmmac0_fixed_phy_status = {
205 .link = 1,
206 .speed = 100,
207 .duplex = 1,
208};
209
210During the board's device_init we can configure the first
211MAC for fixed_link by calling:
212 fixed_phy_add(PHY_POLL, 1, &stmmac0_fixed_phy_status);
213and the second one, with a real PHY device attached to the bus,
214by using the stmmac_mdio_bus_data structure (to provide the id, the
215reset procedure etc).
216
2174.10) List of source files:
218 o Kconfig
219 o Makefile
220 o stmmac_main.c: main network device driver;
221 o stmmac_mdio.c: mdio functions;
222 o stmmac_ethtool.c: ethtool support;
223 o stmmac_timer.[ch]: timer code used for mitigating the driver dma interrupts
224 Only tested on ST40-based platforms.
225 o stmmac.h: private driver structure;
226 o common.h: common definitions and VFTs;
227 o descs.h: descriptor structure definitions;
228 o dwmac1000_core.c: GMAC core functions;
229 o dwmac1000_dma.c: dma functions for the GMAC chip;
230 o dwmac1000.h: specific header file for the GMAC;
231 o dwmac100_core: MAC 100 core and dma code;
232 o dwmac100_dma.c: dma functions for the MAC chip;
233 o dwmac1000.h: specific header file for the MAC;
234 o dwmac_lib.c: generic DMA functions shared among chips
235 o enh_desc.c: functions for handling enhanced descriptors
236 o norm_desc.c: functions for handling normal descriptors
237
2385) TODO:
239 o XGMAC is not supported.
240 o Review the timer optimisation code to use an embedded device that will be
169 available in new chip generations. 241 available in new chip generations.
diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 88880839ece4..3384d5996be2 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -506,8 +506,8 @@ routines. Nevertheless, different callback pointers are used in case there is a
506situation where it actually matters. 506situation where it actually matters.
507 507
508 508
509Device Power Domains 509Device Power Management Domains
510-------------------- 510-------------------------------
511Sometimes devices share reference clocks or other power resources. In those 511Sometimes devices share reference clocks or other power resources. In those
512cases it generally is not possible to put devices into low-power states 512cases it generally is not possible to put devices into low-power states
513individually. Instead, a set of devices sharing a power resource can be put 513individually. Instead, a set of devices sharing a power resource can be put
@@ -516,63 +516,24 @@ power resource. Of course, they also need to be put into the full-power state
516together, by turning the shared power resource on. A set of devices with this 516together, by turning the shared power resource on. A set of devices with this
517property is often referred to as a power domain. 517property is often referred to as a power domain.
518 518
519Support for power domains is provided through the pwr_domain field of struct 519Support for power domains is provided through the pm_domain field of struct
520device. This field is a pointer to an object of type struct dev_power_domain, 520device. This field is a pointer to an object of type struct dev_pm_domain,
521defined in include/linux/pm.h, providing a set of power management callbacks 521defined in include/linux/pm.h, providing a set of power management callbacks
522analogous to the subsystem-level and device driver callbacks that are executed 522analogous to the subsystem-level and device driver callbacks that are executed
523for the given device during all power transitions, in addition to the respective 523for the given device during all power transitions, instead of the respective
524subsystem-level callbacks. Specifically, the power domain "suspend" callbacks 524subsystem-level callbacks. Specifically, if a device's pm_domain pointer is
525(i.e. ->runtime_suspend(), ->suspend(), ->freeze(), ->poweroff(), etc.) are 525not NULL, the ->suspend() callback from the object pointed to by it will be
526executed after the analogous subsystem-level callbacks, while the power domain 526executed instead of its subsystem's (e.g. bus type's) ->suspend() callback and
527"resume" callbacks (i.e. ->runtime_resume(), ->resume(), ->thaw(), ->restore, 527analogously for all of the remaining callbacks. In other words, power management
528etc.) are executed before the analogous subsystem-level callbacks. Error codes 528domain callbacks, if defined for the given device, always take precedence over
529returned by the "suspend" and "resume" power domain callbacks are ignored. 529the callbacks provided by the device's subsystem (e.g. bus type).
530 530
531Power domain ->runtime_idle() callback is executed before the subsystem-level 531The support for device power management domains is only relevant to platforms
532->runtime_idle() callback and the result returned by it is not ignored. Namely, 532needing to use the same device driver power management callbacks in many
533if it returns error code, the subsystem-level ->runtime_idle() callback will not 533different power domain configurations and wanting to avoid incorporating the
534be called and the helper function rpm_idle() executing it will return error 534support for power domains into subsystem-level callbacks, for example by
535code. This mechanism is intended to help platforms where saving device state 535modifying the platform bus type. Other platforms need not implement it or take
536is a time consuming operation and should only be carried out if all devices 536it into account in any way.
537in the power domain are idle, before turning off the shared power resource(s).
538Namely, the power domain ->runtime_idle() callback may return error code until
539the pm_runtime_idle() helper (or its asychronous version) has been called for
540all devices in the power domain (it is recommended that the returned error code
541be -EBUSY in those cases), preventing the subsystem-level ->runtime_idle()
542callback from being run prematurely.
543
544The support for device power domains is only relevant to platforms needing to
545use the same subsystem-level (e.g. platform bus type) and device driver power
546management callbacks in many different power domain configurations and wanting
547to avoid incorporating the support for power domains into the subsystem-level
548callbacks. The other platforms need not implement it or take it into account
549in any way.
550
551
552System Devices
553--------------
554System devices (sysdevs) follow a slightly different API, which can be found in
555
556 include/linux/sysdev.h
557 drivers/base/sys.c
558
559System devices will be suspended with interrupts disabled, and after all other
560devices have been suspended. On resume, they will be resumed before any other
561devices, and also with interrupts disabled. These things occur in special
562"sysdev_driver" phases, which affect only system devices.
563
564Thus, after the suspend_noirq (or freeze_noirq or poweroff_noirq) phase, when
565the non-boot CPUs are all offline and IRQs are disabled on the remaining online
566CPU, then a sysdev_driver.suspend phase is carried out, and the system enters a
567sleep state (or a system image is created). During resume (or after the image
568has been created or loaded) a sysdev_driver.resume phase is carried out, IRQs
569are enabled on the only online CPU, the non-boot CPUs are enabled, and the
570resume_noirq (or thaw_noirq or restore_noirq) phase begins.
571
572Code to actually enter and exit the system-wide low power state sometimes
573involves hardware details that are only known to the boot firmware, and
574may leave a CPU running software (from SRAM or flash memory) that monitors
575the system and manages its wakeup sequence.
576 537
577 538
578Device Low Power (suspend) States 539Device Low Power (suspend) States
@@ -643,7 +604,7 @@ state temporarily, for example so that its system wakeup capability can be
643disabled. This all depends on the hardware and the design of the subsystem and 604disabled. This all depends on the hardware and the design of the subsystem and
644device driver in question. 605device driver in question.
645 606
646During system-wide resume from a sleep state it's best to put devices into the 607During system-wide resume from a sleep state it's easiest to put devices into
647full-power state, as explained in Documentation/power/runtime_pm.txt. Refer to 608the full-power state, as explained in Documentation/power/runtime_pm.txt. Refer
648that document for more information regarding this particular issue as well as 609to that document for more information regarding this particular issue as well as
649for information on the device runtime power management framework in general. 610for information on the device runtime power management framework in general.
diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt
index 5ae70a12c1e2..3035d00757ad 100644
--- a/Documentation/power/opp.txt
+++ b/Documentation/power/opp.txt
@@ -321,6 +321,8 @@ opp_init_cpufreq_table - cpufreq framework typically is initialized with
321 addition to CONFIG_PM as power management feature is required to 321 addition to CONFIG_PM as power management feature is required to
322 dynamically scale voltage and frequency in a system. 322 dynamically scale voltage and frequency in a system.
323 323
324opp_free_cpufreq_table - Free up the table allocated by opp_init_cpufreq_table
325
3247. Data Structures 3267. Data Structures
325================== 327==================
326Typically an SoC contains multiple voltage domains which are variable. Each 328Typically an SoC contains multiple voltage domains which are variable. Each
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index 654097b130b4..14dd3c6ad97e 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -1,39 +1,39 @@
1Run-time Power Management Framework for I/O Devices 1Runtime Power Management Framework for I/O Devices
2 2
3(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. 3(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
4(C) 2010 Alan Stern <stern@rowland.harvard.edu> 4(C) 2010 Alan Stern <stern@rowland.harvard.edu>
5 5
61. Introduction 61. Introduction
7 7
8Support for run-time power management (run-time PM) of I/O devices is provided 8Support for runtime power management (runtime PM) of I/O devices is provided
9at the power management core (PM core) level by means of: 9at the power management core (PM core) level by means of:
10 10
11* The power management workqueue pm_wq in which bus types and device drivers can 11* The power management workqueue pm_wq in which bus types and device drivers can
12 put their PM-related work items. It is strongly recommended that pm_wq be 12 put their PM-related work items. It is strongly recommended that pm_wq be
13 used for queuing all work items related to run-time PM, because this allows 13 used for queuing all work items related to runtime PM, because this allows
14 them to be synchronized with system-wide power transitions (suspend to RAM, 14 them to be synchronized with system-wide power transitions (suspend to RAM,
15 hibernation and resume from system sleep states). pm_wq is declared in 15 hibernation and resume from system sleep states). pm_wq is declared in
16 include/linux/pm_runtime.h and defined in kernel/power/main.c. 16 include/linux/pm_runtime.h and defined in kernel/power/main.c.
17 17
18* A number of run-time PM fields in the 'power' member of 'struct device' (which 18* A number of runtime PM fields in the 'power' member of 'struct device' (which
19 is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can 19 is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
20 be used for synchronizing run-time PM operations with one another. 20 be used for synchronizing runtime PM operations with one another.
21 21
22* Three device run-time PM callbacks in 'struct dev_pm_ops' (defined in 22* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in
23 include/linux/pm.h). 23 include/linux/pm.h).
24 24
25* A set of helper functions defined in drivers/base/power/runtime.c that can be 25* A set of helper functions defined in drivers/base/power/runtime.c that can be
26 used for carrying out run-time PM operations in such a way that the 26 used for carrying out runtime PM operations in such a way that the
27 synchronization between them is taken care of by the PM core. Bus types and 27 synchronization between them is taken care of by the PM core. Bus types and
28 device drivers are encouraged to use these functions. 28 device drivers are encouraged to use these functions.
29 29
30The run-time PM callbacks present in 'struct dev_pm_ops', the device run-time PM 30The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM
31fields of 'struct dev_pm_info' and the core helper functions provided for 31fields of 'struct dev_pm_info' and the core helper functions provided for
32run-time PM are described below. 32runtime PM are described below.
33 33
342. Device Run-time PM Callbacks 342. Device Runtime PM Callbacks
35 35
36There are three device run-time PM callbacks defined in 'struct dev_pm_ops': 36There are three device runtime PM callbacks defined in 'struct dev_pm_ops':
37 37
38struct dev_pm_ops { 38struct dev_pm_ops {
39 ... 39 ...
@@ -72,11 +72,11 @@ knows what to do to handle the device).
72 not mean that the device has been put into a low power state. It is 72 not mean that the device has been put into a low power state. It is
73 supposed to mean, however, that the device will not process data and will 73 supposed to mean, however, that the device will not process data and will
74 not communicate with the CPU(s) and RAM until the subsystem-level resume 74 not communicate with the CPU(s) and RAM until the subsystem-level resume
75 callback is executed for it. The run-time PM status of a device after 75 callback is executed for it. The runtime PM status of a device after
76 successful execution of the subsystem-level suspend callback is 'suspended'. 76 successful execution of the subsystem-level suspend callback is 'suspended'.
77 77
78 * If the subsystem-level suspend callback returns -EBUSY or -EAGAIN, 78 * If the subsystem-level suspend callback returns -EBUSY or -EAGAIN,
79 the device's run-time PM status is 'active', which means that the device 79 the device's runtime PM status is 'active', which means that the device
80 _must_ be fully operational afterwards. 80 _must_ be fully operational afterwards.
81 81
82 * If the subsystem-level suspend callback returns an error code different 82 * If the subsystem-level suspend callback returns an error code different
@@ -104,7 +104,7 @@ the device).
104 104
105 * Once the subsystem-level resume callback has completed successfully, the PM 105 * Once the subsystem-level resume callback has completed successfully, the PM
106 core regards the device as fully operational, which means that the device 106 core regards the device as fully operational, which means that the device
107 _must_ be able to complete I/O operations as needed. The run-time PM status 107 _must_ be able to complete I/O operations as needed. The runtime PM status
108 of the device is then 'active'. 108 of the device is then 'active'.
109 109
110 * If the subsystem-level resume callback returns an error code, the PM core 110 * If the subsystem-level resume callback returns an error code, the PM core
@@ -130,7 +130,7 @@ device in that case. The value returned by this callback is ignored by the PM
130core. 130core.
131 131
132The helper functions provided by the PM core, described in Section 4, guarantee 132The helper functions provided by the PM core, described in Section 4, guarantee
133that the following constraints are met with respect to the bus type's run-time 133that the following constraints are met with respect to the bus type's runtime
134PM callbacks: 134PM callbacks:
135 135
136(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute 136(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
@@ -142,7 +142,7 @@ PM callbacks:
142 142
143(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' 143(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
144 devices (i.e. the PM core will only execute ->runtime_idle() or 144 devices (i.e. the PM core will only execute ->runtime_idle() or
145 ->runtime_suspend() for the devices the run-time PM status of which is 145 ->runtime_suspend() for the devices the runtime PM status of which is
146 'active'). 146 'active').
147 147
148(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device 148(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
@@ -151,7 +151,7 @@ PM callbacks:
151 flag of which is set. 151 flag of which is set.
152 152
153(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the 153(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the
154 PM core will only execute ->runtime_resume() for the devices the run-time 154 PM core will only execute ->runtime_resume() for the devices the runtime
155 PM status of which is 'suspended'). 155 PM status of which is 'suspended').
156 156
157Additionally, the helper functions provided by the PM core obey the following 157Additionally, the helper functions provided by the PM core obey the following
@@ -171,9 +171,9 @@ rules:
171 scheduled requests to execute the other callbacks for the same device, 171 scheduled requests to execute the other callbacks for the same device,
172 except for scheduled autosuspends. 172 except for scheduled autosuspends.
173 173
1743. Run-time PM Device Fields 1743. Runtime PM Device Fields
175 175
176The following device run-time PM fields are present in 'struct dev_pm_info', as 176The following device runtime PM fields are present in 'struct dev_pm_info', as
177defined in include/linux/pm.h: 177defined in include/linux/pm.h:
178 178
179 struct timer_list suspend_timer; 179 struct timer_list suspend_timer;
@@ -205,7 +205,7 @@ defined in include/linux/pm.h:
205 205
206 unsigned int disable_depth; 206 unsigned int disable_depth;
207 - used for disabling the helper functions (they work normally if this is 207 - used for disabling the helper functions (they work normally if this is
208 equal to zero); the initial value of it is 1 (i.e. run-time PM is 208 equal to zero); the initial value of it is 1 (i.e. runtime PM is
209 initially disabled for all devices) 209 initially disabled for all devices)
210 210
211 unsigned int runtime_error; 211 unsigned int runtime_error;
@@ -229,10 +229,10 @@ defined in include/linux/pm.h:
229 suspend to complete; means "start a resume as soon as you've suspended" 229 suspend to complete; means "start a resume as soon as you've suspended"
230 230
231 unsigned int run_wake; 231 unsigned int run_wake;
232 - set if the device is capable of generating run-time wake-up events 232 - set if the device is capable of generating runtime wake-up events
233 233
234 enum rpm_status runtime_status; 234 enum rpm_status runtime_status;
235 - the run-time PM status of the device; this field's initial value is 235 - the runtime PM status of the device; this field's initial value is
236 RPM_SUSPENDED, which means that each device is initially regarded by the 236 RPM_SUSPENDED, which means that each device is initially regarded by the
237 PM core as 'suspended', regardless of its real hardware status 237 PM core as 'suspended', regardless of its real hardware status
238 238
@@ -243,7 +243,7 @@ defined in include/linux/pm.h:
243 and pm_runtime_forbid() helper functions 243 and pm_runtime_forbid() helper functions
244 244
245 unsigned int no_callbacks; 245 unsigned int no_callbacks;
246 - indicates that the device does not use the run-time PM callbacks (see 246 - indicates that the device does not use the runtime PM callbacks (see
247 Section 8); it may be modified only by the pm_runtime_no_callbacks() 247 Section 8); it may be modified only by the pm_runtime_no_callbacks()
248 helper function 248 helper function
249 249
@@ -270,16 +270,16 @@ defined in include/linux/pm.h:
270 270
271All of the above fields are members of the 'power' member of 'struct device'. 271All of the above fields are members of the 'power' member of 'struct device'.
272 272
2734. Run-time PM Device Helper Functions 2734. Runtime PM Device Helper Functions
274 274
275The following run-time PM helper functions are defined in 275The following runtime PM helper functions are defined in
276drivers/base/power/runtime.c and include/linux/pm_runtime.h: 276drivers/base/power/runtime.c and include/linux/pm_runtime.h:
277 277
278 void pm_runtime_init(struct device *dev); 278 void pm_runtime_init(struct device *dev);
279 - initialize the device run-time PM fields in 'struct dev_pm_info' 279 - initialize the device runtime PM fields in 'struct dev_pm_info'
280 280
281 void pm_runtime_remove(struct device *dev); 281 void pm_runtime_remove(struct device *dev);
282 - make sure that the run-time PM of the device will be disabled after 282 - make sure that the runtime PM of the device will be disabled after
283 removing the device from device hierarchy 283 removing the device from device hierarchy
284 284
285 int pm_runtime_idle(struct device *dev); 285 int pm_runtime_idle(struct device *dev);
@@ -289,9 +289,10 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
289 289
290 int pm_runtime_suspend(struct device *dev); 290 int pm_runtime_suspend(struct device *dev);
291 - execute the subsystem-level suspend callback for the device; returns 0 on 291 - execute the subsystem-level suspend callback for the device; returns 0 on
292 success, 1 if the device's run-time PM status was already 'suspended', or 292 success, 1 if the device's runtime PM status was already 'suspended', or
293 error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt 293 error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
294 to suspend the device again in future 294 to suspend the device again in future and -EACCES means that
295 'power.disable_depth' is different from 0
295 296
296 int pm_runtime_autosuspend(struct device *dev); 297 int pm_runtime_autosuspend(struct device *dev);
297 - same as pm_runtime_suspend() except that the autosuspend delay is taken 298 - same as pm_runtime_suspend() except that the autosuspend delay is taken
@@ -301,10 +302,11 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
301 302
302 int pm_runtime_resume(struct device *dev); 303 int pm_runtime_resume(struct device *dev);
303 - execute the subsystem-level resume callback for the device; returns 0 on 304 - execute the subsystem-level resume callback for the device; returns 0 on
304 success, 1 if the device's run-time PM status was already 'active' or 305 success, 1 if the device's runtime PM status was already 'active' or
305 error code on failure, where -EAGAIN means it may be safe to attempt to 306 error code on failure, where -EAGAIN means it may be safe to attempt to
306 resume the device again in future, but 'power.runtime_error' should be 307 resume the device again in future, but 'power.runtime_error' should be
307 checked additionally 308 checked additionally, and -EACCES means that 'power.disable_depth' is
309 different from 0
308 310
309 int pm_request_idle(struct device *dev); 311 int pm_request_idle(struct device *dev);
310 - submit a request to execute the subsystem-level idle callback for the 312 - submit a request to execute the subsystem-level idle callback for the
@@ -321,7 +323,7 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
321 device in future, where 'delay' is the time to wait before queuing up a 323 device in future, where 'delay' is the time to wait before queuing up a
322 suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work 324 suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
323 item is queued up immediately); returns 0 on success, 1 if the device's PM 325 item is queued up immediately); returns 0 on success, 1 if the device's PM
324 run-time status was already 'suspended', or error code if the request 326 runtime status was already 'suspended', or error code if the request
325 hasn't been scheduled (or queued up if 'delay' is 0); if the execution of 327 hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
326 ->runtime_suspend() is already scheduled and not yet expired, the new 328 ->runtime_suspend() is already scheduled and not yet expired, the new
327 value of 'delay' will be used as the time to wait 329 value of 'delay' will be used as the time to wait
@@ -329,7 +331,7 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
329 int pm_request_resume(struct device *dev); 331 int pm_request_resume(struct device *dev);
330 - submit a request to execute the subsystem-level resume callback for the 332 - submit a request to execute the subsystem-level resume callback for the
331 device (the request is represented by a work item in pm_wq); returns 0 on 333 device (the request is represented by a work item in pm_wq); returns 0 on
332 success, 1 if the device's run-time PM status was already 'active', or 334 success, 1 if the device's runtime PM status was already 'active', or
333 error code if the request hasn't been queued up 335 error code if the request hasn't been queued up
334 336
335 void pm_runtime_get_noresume(struct device *dev); 337 void pm_runtime_get_noresume(struct device *dev);
@@ -367,22 +369,32 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
367 pm_runtime_autosuspend(dev) and return its result 369 pm_runtime_autosuspend(dev) and return its result
368 370
369 void pm_runtime_enable(struct device *dev); 371 void pm_runtime_enable(struct device *dev);
370 - enable the run-time PM helper functions to run the device bus type's 372 - decrement the device's 'power.disable_depth' field; if that field is equal
371 run-time PM callbacks described in Section 2 373 to zero, the runtime PM helper functions can execute subsystem-level
374 callbacks described in Section 2 for the device
372 375
373 int pm_runtime_disable(struct device *dev); 376 int pm_runtime_disable(struct device *dev);
374 - prevent the run-time PM helper functions from running subsystem-level 377 - increment the device's 'power.disable_depth' field (if the value of that
375 run-time PM callbacks for the device, make sure that all of the pending 378 field was previously zero, this prevents subsystem-level runtime PM
376 run-time PM operations on the device are either completed or canceled; 379 callbacks from being run for the device), make sure that all of the pending
380 runtime PM operations on the device are either completed or canceled;
377 returns 1 if there was a resume request pending and it was necessary to 381 returns 1 if there was a resume request pending and it was necessary to
378 execute the subsystem-level resume callback for the device to satisfy that 382 execute the subsystem-level resume callback for the device to satisfy that
379 request, otherwise 0 is returned 383 request, otherwise 0 is returned
380 384
385 int pm_runtime_barrier(struct device *dev);
386 - check if there's a resume request pending for the device and resume it
387 (synchronously) in that case, cancel any other pending runtime PM requests
388 regarding it and wait for all runtime PM operations on it in progress to
389 complete; returns 1 if there was a resume request pending and it was
390 necessary to execute the subsystem-level resume callback for the device to
391 satisfy that request, otherwise 0 is returned
392
381 void pm_suspend_ignore_children(struct device *dev, bool enable); 393 void pm_suspend_ignore_children(struct device *dev, bool enable);
382 - set/unset the power.ignore_children flag of the device 394 - set/unset the power.ignore_children flag of the device
383 395
384 int pm_runtime_set_active(struct device *dev); 396 int pm_runtime_set_active(struct device *dev);
385 - clear the device's 'power.runtime_error' flag, set the device's run-time 397 - clear the device's 'power.runtime_error' flag, set the device's runtime
386 PM status to 'active' and update its parent's counter of 'active' 398 PM status to 'active' and update its parent's counter of 'active'
387 children as appropriate (it is only valid to use this function if 399 children as appropriate (it is only valid to use this function if
388 'power.runtime_error' is set or 'power.disable_depth' is greater than 400 'power.runtime_error' is set or 'power.disable_depth' is greater than
@@ -390,7 +402,7 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
390 which is not active and the 'power.ignore_children' flag of which is unset 402 which is not active and the 'power.ignore_children' flag of which is unset
391 403
392 void pm_runtime_set_suspended(struct device *dev); 404 void pm_runtime_set_suspended(struct device *dev);
393 - clear the device's 'power.runtime_error' flag, set the device's run-time 405 - clear the device's 'power.runtime_error' flag, set the device's runtime
394 PM status to 'suspended' and update its parent's counter of 'active' 406 PM status to 'suspended' and update its parent's counter of 'active'
395 children as appropriate (it is only valid to use this function if 407 children as appropriate (it is only valid to use this function if
396 'power.runtime_error' is set or 'power.disable_depth' is greater than 408 'power.runtime_error' is set or 'power.disable_depth' is greater than
@@ -400,6 +412,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
400 - return true if the device's runtime PM status is 'suspended' and its 412 - return true if the device's runtime PM status is 'suspended' and its
401 'power.disable_depth' field is equal to zero, or false otherwise 413 'power.disable_depth' field is equal to zero, or false otherwise
402 414
415 bool pm_runtime_status_suspended(struct device *dev);
416 - return true if the device's runtime PM status is 'suspended'
417
403 void pm_runtime_allow(struct device *dev); 418 void pm_runtime_allow(struct device *dev);
404 - set the power.runtime_auto flag for the device and decrease its usage 419 - set the power.runtime_auto flag for the device and decrease its usage
405 counter (used by the /sys/devices/.../power/control interface to 420 counter (used by the /sys/devices/.../power/control interface to
@@ -411,7 +426,7 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
411 effectively prevent the device from being power managed at run time) 426 effectively prevent the device from being power managed at run time)
412 427
413 void pm_runtime_no_callbacks(struct device *dev); 428 void pm_runtime_no_callbacks(struct device *dev);
414 - set the power.no_callbacks flag for the device and remove the run-time 429 - set the power.no_callbacks flag for the device and remove the runtime
415 PM attributes from /sys/devices/.../power (or prevent them from being 430 PM attributes from /sys/devices/.../power (or prevent them from being
416 added when the device is registered) 431 added when the device is registered)
417 432
@@ -431,7 +446,7 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h:
431 446
432 void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); 447 void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
433 - set the power.autosuspend_delay value to 'delay' (expressed in 448 - set the power.autosuspend_delay value to 'delay' (expressed in
434 milliseconds); if 'delay' is negative then run-time suspends are 449 milliseconds); if 'delay' is negative then runtime suspends are
435 prevented 450 prevented
436 451
437 unsigned long pm_runtime_autosuspend_expiration(struct device *dev); 452 unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
@@ -470,76 +485,92 @@ pm_runtime_resume()
470pm_runtime_get_sync() 485pm_runtime_get_sync()
471pm_runtime_put_sync_suspend() 486pm_runtime_put_sync_suspend()
472 487
4735. Run-time PM Initialization, Device Probing and Removal 4885. Runtime PM Initialization, Device Probing and Removal
474 489
475Initially, the run-time PM is disabled for all devices, which means that the 490Initially, the runtime PM is disabled for all devices, which means that the
476majority of the run-time PM helper functions described in Section 4 will return 491majority of the runtime PM helper functions described in Section 4 will return
477-EAGAIN until pm_runtime_enable() is called for the device. 492-EAGAIN until pm_runtime_enable() is called for the device.
478 493
479In addition to that, the initial run-time PM status of all devices is 494In addition to that, the initial runtime PM status of all devices is
480'suspended', but it need not reflect the actual physical state of the device. 495'suspended', but it need not reflect the actual physical state of the device.
481Thus, if the device is initially active (i.e. it is able to process I/O), its 496Thus, if the device is initially active (i.e. it is able to process I/O), its
482run-time PM status must be changed to 'active', with the help of 497runtime PM status must be changed to 'active', with the help of
483pm_runtime_set_active(), before pm_runtime_enable() is called for the device. 498pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
484 499
485However, if the device has a parent and the parent's run-time PM is enabled, 500However, if the device has a parent and the parent's runtime PM is enabled,
486calling pm_runtime_set_active() for the device will affect the parent, unless 501calling pm_runtime_set_active() for the device will affect the parent, unless
487the parent's 'power.ignore_children' flag is set. Namely, in that case the 502the parent's 'power.ignore_children' flag is set. Namely, in that case the
488parent won't be able to suspend at run time, using the PM core's helper 503parent won't be able to suspend at run time, using the PM core's helper
489functions, as long as the child's status is 'active', even if the child's 504functions, as long as the child's status is 'active', even if the child's
490run-time PM is still disabled (i.e. pm_runtime_enable() hasn't been called for 505runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
491the child yet or pm_runtime_disable() has been called for it). For this reason, 506the child yet or pm_runtime_disable() has been called for it). For this reason,
492once pm_runtime_set_active() has been called for the device, pm_runtime_enable() 507once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
493should be called for it too as soon as reasonably possible or its run-time PM 508should be called for it too as soon as reasonably possible or its runtime PM
494status should be changed back to 'suspended' with the help of 509status should be changed back to 'suspended' with the help of
495pm_runtime_set_suspended(). 510pm_runtime_set_suspended().
496 511
497If the default initial run-time PM status of the device (i.e. 'suspended') 512If the default initial runtime PM status of the device (i.e. 'suspended')
498reflects the actual state of the device, its bus type's or its driver's 513reflects the actual state of the device, its bus type's or its driver's
499->probe() callback will likely need to wake it up using one of the PM core's 514->probe() callback will likely need to wake it up using one of the PM core's
500helper functions described in Section 4. In that case, pm_runtime_resume() 515helper functions described in Section 4. In that case, pm_runtime_resume()
501should be used. Of course, for this purpose the device's run-time PM has to be 516should be used. Of course, for this purpose the device's runtime PM has to be
502enabled earlier by calling pm_runtime_enable(). 517enabled earlier by calling pm_runtime_enable().
503 518
504If the device bus type's or driver's ->probe() or ->remove() callback runs 519If the device bus type's or driver's ->probe() callback runs
505pm_runtime_suspend() or pm_runtime_idle() or their asynchronous counterparts, 520pm_runtime_suspend() or pm_runtime_idle() or their asynchronous counterparts,
506they will fail returning -EAGAIN, because the device's usage counter is 521they will fail returning -EAGAIN, because the device's usage counter is
507incremented by the core before executing ->probe() and ->remove(). Still, it 522incremented by the driver core before executing ->probe(). Still, it may be
508may be desirable to suspend the device as soon as ->probe() or ->remove() has 523desirable to suspend the device as soon as ->probe() has finished, so the driver
509finished, so the PM core uses pm_runtime_idle_sync() to invoke the 524core uses pm_runtime_put_sync() to invoke the subsystem-level idle callback for
510subsystem-level idle callback for the device at that time. 525the device at that time.
526
527Moreover, the driver core prevents runtime PM callbacks from racing with the bus
528notifier callback in __device_release_driver(), which is necessary, because the
529notifier is used by some subsystems to carry out operations affecting the
530runtime PM functionality. It does so by calling pm_runtime_get_sync() before
531driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This
532resumes the device if it's in the suspended state and prevents it from
533being suspended again while those routines are being executed.
534
535To allow bus types and drivers to put devices into the suspended state by
536calling pm_runtime_suspend() from their ->remove() routines, the driver core
537executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
538notifications in __device_release_driver(). This requires bus types and
539drivers to make their ->remove() callbacks avoid races with runtime PM directly,
540but also it allows for more flexibility in the handling of devices during the
541removal of their drivers.
511 542
512The user space can effectively disallow the driver of the device to power manage 543The user space can effectively disallow the driver of the device to power manage
513it at run time by changing the value of its /sys/devices/.../power/control 544it at run time by changing the value of its /sys/devices/.../power/control
514attribute to "on", which causes pm_runtime_forbid() to be called. In principle, 545attribute to "on", which causes pm_runtime_forbid() to be called. In principle,
515this mechanism may also be used by the driver to effectively turn off the 546this mechanism may also be used by the driver to effectively turn off the
516run-time power management of the device until the user space turns it on. 547runtime power management of the device until the user space turns it on.
517Namely, during the initialization the driver can make sure that the run-time PM 548Namely, during the initialization the driver can make sure that the runtime PM
518status of the device is 'active' and call pm_runtime_forbid(). It should be 549status of the device is 'active' and call pm_runtime_forbid(). It should be
519noted, however, that if the user space has already intentionally changed the 550noted, however, that if the user space has already intentionally changed the
520value of /sys/devices/.../power/control to "auto" to allow the driver to power 551value of /sys/devices/.../power/control to "auto" to allow the driver to power
521manage the device at run time, the driver may confuse it by using 552manage the device at run time, the driver may confuse it by using
522pm_runtime_forbid() this way. 553pm_runtime_forbid() this way.
523 554
5246. Run-time PM and System Sleep 5556. Runtime PM and System Sleep
525 556
526Run-time PM and system sleep (i.e., system suspend and hibernation, also known 557Runtime PM and system sleep (i.e., system suspend and hibernation, also known
527as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of 558as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
528ways. If a device is active when a system sleep starts, everything is 559ways. If a device is active when a system sleep starts, everything is
529straightforward. But what should happen if the device is already suspended? 560straightforward. But what should happen if the device is already suspended?
530 561
531The device may have different wake-up settings for run-time PM and system sleep. 562The device may have different wake-up settings for runtime PM and system sleep.
532For example, remote wake-up may be enabled for run-time suspend but disallowed 563For example, remote wake-up may be enabled for runtime suspend but disallowed
533for system sleep (device_may_wakeup(dev) returns 'false'). When this happens, 564for system sleep (device_may_wakeup(dev) returns 'false'). When this happens,
534the subsystem-level system suspend callback is responsible for changing the 565the subsystem-level system suspend callback is responsible for changing the
535device's wake-up setting (it may leave that to the device driver's system 566device's wake-up setting (it may leave that to the device driver's system
536suspend routine). It may be necessary to resume the device and suspend it again 567suspend routine). It may be necessary to resume the device and suspend it again
537in order to do so. The same is true if the driver uses different power levels 568in order to do so. The same is true if the driver uses different power levels
538or other settings for run-time suspend and system sleep. 569or other settings for runtime suspend and system sleep.
539 570
540During system resume, devices generally should be brought back to full power, 571During system resume, the simplest approach is to bring all devices back to full
541even if they were suspended before the system sleep began. There are several 572power, even if they had been suspended before the system suspend began. There
542reasons for this, including: 573are several reasons for this, including:
543 574
544 * The device might need to switch power levels, wake-up settings, etc. 575 * The device might need to switch power levels, wake-up settings, etc.
545 576
@@ -554,22 +585,49 @@ reasons for this, including:
554 * The device might need to be reset. 585 * The device might need to be reset.
555 586
556 * Even though the device was suspended, if its usage counter was > 0 then most 587 * Even though the device was suspended, if its usage counter was > 0 then most
557 likely it would need a run-time resume in the near future anyway. 588 likely it would need a runtime resume in the near future anyway.
558
559 * Always going back to full power is simplest.
560 589
561If the device was suspended before the sleep began, then its run-time PM status 590If the device had been suspended before the system suspend began and it's
562will have to be updated to reflect the actual post-system sleep status. The way 591brought back to full power during resume, then its runtime PM status will have
563to do this is: 592to be updated to reflect the actual post-system sleep status. The way to do
593this is:
564 594
565 pm_runtime_disable(dev); 595 pm_runtime_disable(dev);
566 pm_runtime_set_active(dev); 596 pm_runtime_set_active(dev);
567 pm_runtime_enable(dev); 597 pm_runtime_enable(dev);
568 598
569The PM core always increments the run-time usage counter before calling the 599The PM core always increments the runtime usage counter before calling the
570->prepare() callback and decrements it after calling the ->complete() callback. 600->suspend() callback and decrements it after calling the ->resume() callback.
571Hence disabling run-time PM temporarily like this will not cause any run-time 601Hence disabling runtime PM temporarily like this will not cause any runtime
572suspend callbacks to be lost. 602suspend attempts to be permanently lost. If the usage count goes to zero
603following the return of the ->resume() callback, the ->runtime_idle() callback
604will be invoked as usual.
605
606On some systems, however, system sleep is not entered through a global firmware
607or hardware operation. Instead, all hardware components are put into low-power
608states directly by the kernel in a coordinated way. Then, the system sleep
609state effectively follows from the states the hardware components end up in
610and the system is woken up from that state by a hardware interrupt or a similar
611mechanism entirely under the kernel's control. As a result, the kernel never
612gives control away and the states of all devices during resume are precisely
613known to it. If that is the case and none of the situations listed above takes
614place (in particular, if the system is not waking up from hibernation), it may
615be more efficient to leave the devices that had been suspended before the system
616suspend began in the suspended state.
617
618The PM core does its best to reduce the probability of race conditions between
619the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
620out the following operations:
621
622 * During system suspend it calls pm_runtime_get_noresume() and
623 pm_runtime_barrier() for every device right before executing the
624 subsystem-level .suspend() callback for it. In addition to that it calls
625 pm_runtime_disable() for every device right after executing the
626 subsystem-level .suspend() callback for it.
627
628 * During system resume it calls pm_runtime_enable() and pm_runtime_put_sync()
629 for every device right before and right after executing the subsystem-level
630 .resume() callback for it, respectively.
573 631
5747. Generic subsystem callbacks 6327. Generic subsystem callbacks
575 633
@@ -595,40 +653,68 @@ driver/base/power/generic_ops.c:
595 callback provided by its driver and return its result, or return 0 if not 653 callback provided by its driver and return its result, or return 0 if not
596 defined 654 defined
597 655
656 int pm_generic_suspend_noirq(struct device *dev);
657 - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
658 callback provided by the device's driver and return its result, or return
659 0 if not defined
660
598 int pm_generic_resume(struct device *dev); 661 int pm_generic_resume(struct device *dev);
599 - invoke the ->resume() callback provided by the driver of this device and, 662 - invoke the ->resume() callback provided by the driver of this device and,
600 if successful, change the device's runtime PM status to 'active' 663 if successful, change the device's runtime PM status to 'active'
601 664
665 int pm_generic_resume_noirq(struct device *dev);
666 - invoke the ->resume_noirq() callback provided by the driver of this device
667
602 int pm_generic_freeze(struct device *dev); 668 int pm_generic_freeze(struct device *dev);
603 - if the device has not been suspended at run time, invoke the ->freeze() 669 - if the device has not been suspended at run time, invoke the ->freeze()
604 callback provided by its driver and return its result, or return 0 if not 670 callback provided by its driver and return its result, or return 0 if not
605 defined 671 defined
606 672
673 int pm_generic_freeze_noirq(struct device *dev);
674 - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
675 callback provided by the device's driver and return its result, or return
676 0 if not defined
677
607 int pm_generic_thaw(struct device *dev); 678 int pm_generic_thaw(struct device *dev);
608 - if the device has not been suspended at run time, invoke the ->thaw() 679 - if the device has not been suspended at run time, invoke the ->thaw()
609 callback provided by its driver and return its result, or return 0 if not 680 callback provided by its driver and return its result, or return 0 if not
610 defined 681 defined
611 682
683 int pm_generic_thaw_noirq(struct device *dev);
684 - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
685 callback provided by the device's driver and return its result, or return
686 0 if not defined
687
612 int pm_generic_poweroff(struct device *dev); 688 int pm_generic_poweroff(struct device *dev);
613 - if the device has not been suspended at run time, invoke the ->poweroff() 689 - if the device has not been suspended at run time, invoke the ->poweroff()
614 callback provided by its driver and return its result, or return 0 if not 690 callback provided by its driver and return its result, or return 0 if not
615 defined 691 defined
616 692
693 int pm_generic_poweroff_noirq(struct device *dev);
694 - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
695 callback provided by the device's driver and return its result, or return
696 0 if not defined
697
617 int pm_generic_restore(struct device *dev); 698 int pm_generic_restore(struct device *dev);
618 - invoke the ->restore() callback provided by the driver of this device and, 699 - invoke the ->restore() callback provided by the driver of this device and,
619 if successful, change the device's runtime PM status to 'active' 700 if successful, change the device's runtime PM status to 'active'
620 701
702 int pm_generic_restore_noirq(struct device *dev);
703 - invoke the ->restore_noirq() callback provided by the device's driver
704
621These functions can be assigned to the ->runtime_idle(), ->runtime_suspend(), 705These functions can be assigned to the ->runtime_idle(), ->runtime_suspend(),
622->runtime_resume(), ->suspend(), ->resume(), ->freeze(), ->thaw(), ->poweroff(), 706->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
623or ->restore() callback pointers in the subsystem-level dev_pm_ops structures. 707->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
708->poweroff(), ->poweroff_noirq(), ->restore(), or ->restore_noirq() callback
709pointers in the subsystem-level dev_pm_ops structures.
624 710
625If a subsystem wishes to use all of them at the same time, it can simply assign 711If a subsystem wishes to use all of them at the same time, it can simply assign
626the GENERIC_SUBSYS_PM_OPS macro, defined in include/linux/pm.h, to its 712the GENERIC_SUBSYS_PM_OPS macro, defined in include/linux/pm.h, to its
627dev_pm_ops structure pointer. 713dev_pm_ops structure pointer.
628 714
629Device drivers that wish to use the same function as a system suspend, freeze, 715Device drivers that wish to use the same function as a system suspend, freeze,
630poweroff and run-time suspend callback, and similarly for system resume, thaw, 716poweroff and runtime suspend callback, and similarly for system resume, thaw,
631restore, and run-time resume, can achieve this with the help of the 717restore, and runtime resume, can achieve this with the help of the
632UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its 718UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
633last argument to NULL). 719last argument to NULL).
634 720
@@ -638,7 +724,7 @@ Some "devices" are only logical sub-devices of their parent and cannot be
638power-managed on their own. (The prototype example is a USB interface. Entire 724power-managed on their own. (The prototype example is a USB interface. Entire
639USB devices can go into low-power mode or send wake-up requests, but neither is 725USB devices can go into low-power mode or send wake-up requests, but neither is
640possible for individual interfaces.) The drivers for these devices have no 726possible for individual interfaces.) The drivers for these devices have no
641need of run-time PM callbacks; if the callbacks did exist, ->runtime_suspend() 727need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend()
642and ->runtime_resume() would always return 0 without doing anything else and 728and ->runtime_resume() would always return 0 without doing anything else and
643->runtime_idle() would always call pm_runtime_suspend(). 729->runtime_idle() would always call pm_runtime_suspend().
644 730
@@ -646,7 +732,7 @@ Subsystems can tell the PM core about these devices by calling
646pm_runtime_no_callbacks(). This should be done after the device structure is 732pm_runtime_no_callbacks(). This should be done after the device structure is
647initialized and before it is registered (although after device registration is 733initialized and before it is registered (although after device registration is
648also okay). The routine will set the device's power.no_callbacks flag and 734also okay). The routine will set the device's power.no_callbacks flag and
649prevent the non-debugging run-time PM sysfs attributes from being created. 735prevent the non-debugging runtime PM sysfs attributes from being created.
650 736
651When power.no_callbacks is set, the PM core will not invoke the 737When power.no_callbacks is set, the PM core will not invoke the
652->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks. 738->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks.
@@ -654,7 +740,7 @@ Instead it will assume that suspends and resumes always succeed and that idle
654devices should be suspended. 740devices should be suspended.
655 741
656As a consequence, the PM core will never directly inform the device's subsystem 742As a consequence, the PM core will never directly inform the device's subsystem
657or driver about run-time power changes. Instead, the driver for the device's 743or driver about runtime power changes. Instead, the driver for the device's
658parent must take responsibility for telling the device's driver when the 744parent must take responsibility for telling the device's driver when the
659parent's power state changes. 745parent's power state changes.
660 746
@@ -665,13 +751,13 @@ A device should be put in a low-power state only when there's some reason to
665think it will remain in that state for a substantial time. A common heuristic 751think it will remain in that state for a substantial time. A common heuristic
666says that a device which hasn't been used for a while is liable to remain 752says that a device which hasn't been used for a while is liable to remain
667unused; following this advice, drivers should not allow devices to be suspended 753unused; following this advice, drivers should not allow devices to be suspended
668at run-time until they have been inactive for some minimum period. Even when 754at runtime until they have been inactive for some minimum period. Even when
669the heuristic ends up being non-optimal, it will still prevent devices from 755the heuristic ends up being non-optimal, it will still prevent devices from
670"bouncing" too rapidly between low-power and full-power states. 756"bouncing" too rapidly between low-power and full-power states.
671 757
672The term "autosuspend" is an historical remnant. It doesn't mean that the 758The term "autosuspend" is an historical remnant. It doesn't mean that the
673device is automatically suspended (the subsystem or driver still has to call 759device is automatically suspended (the subsystem or driver still has to call
674the appropriate PM routines); rather it means that run-time suspends will 760the appropriate PM routines); rather it means that runtime suspends will
675automatically be delayed until the desired period of inactivity has elapsed. 761automatically be delayed until the desired period of inactivity has elapsed.
676 762
677Inactivity is determined based on the power.last_busy field. Drivers should 763Inactivity is determined based on the power.last_busy field. Drivers should
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 1b5a5ddbc3ef..5df176ed59b8 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -9,7 +9,121 @@ If variable is of Type, use printk format specifier:
9 size_t %zu or %zx 9 size_t %zu or %zx
10 ssize_t %zd or %zx 10 ssize_t %zd or %zx
11 11
12Raw pointer value SHOULD be printed with %p. 12Raw pointer value SHOULD be printed with %p. The kernel supports
13the following extended format specifiers for pointer types:
14
15Symbols/Function Pointers:
16
17 %pF versatile_init+0x0/0x110
18 %pf versatile_init
19 %pS versatile_init+0x0/0x110
20 %ps versatile_init
21 %pB prev_fn_of_versatile_init+0x88/0x88
22
23 For printing symbols and function pointers. The 'S' and 's' specifiers
24 result in the symbol name with ('S') or without ('s') offsets. Where
25 this is used on a kernel without KALLSYMS - the symbol address is
26 printed instead.
27
28 The 'B' specifier results in the symbol name with offsets and should be
29 used when printing stack backtraces. The specifier takes into
30 consideration the effect of compiler optimisations which may occur
31 when tail-calls are used and marked with the noreturn GCC attribute.
32
33 On ia64, ppc64 and parisc64 architectures function pointers are
34 actually function descriptors which must first be resolved. The 'F' and
35 'f' specifiers perform this resolution and then provide the same
36 functionality as the 'S' and 's' specifiers.
37
38Kernel Pointers:
39
40 %pK 0x01234567 or 0x0123456789abcdef
41
42 For printing kernel pointers which should be hidden from unprivileged
43 users. The behaviour of %pK depends on the kptr_restrict sysctl - see
44 Documentation/sysctl/kernel.txt for more details.
45
46Struct Resources:
47
48 %pr [mem 0x60000000-0x6fffffff flags 0x2200] or
49 [mem 0x0000000060000000-0x000000006fffffff flags 0x2200]
50 %pR [mem 0x60000000-0x6fffffff pref] or
51 [mem 0x0000000060000000-0x000000006fffffff pref]
52
53 For printing struct resources. The 'R' and 'r' specifiers result in a
54 printed resource with ('R') or without ('r') a decoded flags member.
55
56MAC/FDDI addresses:
57
58 %pM 00:01:02:03:04:05
59 %pMF 00-01-02-03-04-05
60 %pm 000102030405
61
62 For printing 6-byte MAC/FDDI addresses in hex notation. The 'M' and 'm'
63 specifiers result in a printed address with ('M') or without ('m') byte
64 separators. The default byte separator is the colon (':').
65
66 Where FDDI addresses are concerned the 'F' specifier can be used after
67 the 'M' specifier to use dash ('-') separators instead of the default
68 separator.
69
70IPv4 addresses:
71
72 %pI4 1.2.3.4
73 %pi4 001.002.003.004
74 %p[Ii]4[hnbl]
75
76 For printing IPv4 dot-separated decimal addresses. The 'I4' and 'i4'
77 specifiers result in a printed address with ('i4') or without ('I4')
78 leading zeros.
79
80 The additional 'h', 'n', 'b', and 'l' specifiers are used to specify
81 host, network, big or little endian order addresses respectively. Where
82 no specifier is provided the default network/big endian order is used.
83
84IPv6 addresses:
85
86 %pI6 0001:0002:0003:0004:0005:0006:0007:0008
87 %pi6 00010002000300040005000600070008
88 %pI6c 1:2:3:4:5:6:7:8
89
90 For printing IPv6 network-order 16-bit hex addresses. The 'I6' and 'i6'
91 specifiers result in a printed address with ('I6') or without ('i6')
92 colon-separators. Leading zeros are always used.
93
94 The additional 'c' specifier can be used with the 'I' specifier to
95 print a compressed IPv6 address as described by
96 http://tools.ietf.org/html/rfc5952
97
98UUID/GUID addresses:
99
100 %pUb 00010203-0405-0607-0809-0a0b0c0d0e0f
101 %pUB 00010203-0405-0607-0809-0A0B0C0D0E0F
102 %pUl 03020100-0504-0706-0809-0a0b0c0d0e0f
103 %pUL 03020100-0504-0706-0809-0A0B0C0D0E0F
104
105 For printing 16-byte UUID/GUID addresses. The additional 'l', 'L',
106 'b' and 'B' specifiers are used to specify a little endian order in
107 lower ('l') or upper case ('L') hex characters - and big endian order
108 in lower ('b') or upper case ('B') hex characters.
109
110 Where no additional specifiers are used the default big endian
111 order with lower case hex characters will be printed.
112
113struct va_format:
114
115 %pV
116
117 For printing struct va_format structures. These contain a format string
118 and va_list as follows:
119
120 struct va_format {
121 const char *fmt;
122 va_list *va;
123 };
124
125 Do not use this feature without some mechanism to verify the
126 correctness of the format string and va_list arguments.
13 127
14u64 SHOULD be printed with %llu/%llx, (unsigned long long): 128u64 SHOULD be printed with %llu/%llx, (unsigned long long):
15 129
@@ -32,4 +146,5 @@ Reminder: sizeof() result is of type size_t.
32Thank you for your cooperation and attention. 146Thank you for your cooperation and attention.
33 147
34 148
35By Randy Dunlap <rdunlap@xenotime.net> 149By Randy Dunlap <rdunlap@xenotime.net> and
150Andrew Murray <amurray@mpc-data.co.uk>
diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 99961993257a..91ecff07cede 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -223,9 +223,10 @@ When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
223group created using the pseudo filesystem. See example steps below to create 223group created using the pseudo filesystem. See example steps below to create
224task groups and modify their CPU share using the "cgroups" pseudo filesystem. 224task groups and modify their CPU share using the "cgroups" pseudo filesystem.
225 225
226 # mkdir /dev/cpuctl 226 # mount -t tmpfs cgroup_root /sys/fs/cgroup
227 # mount -t cgroup -ocpu none /dev/cpuctl 227 # mkdir /sys/fs/cgroup/cpu
228 # cd /dev/cpuctl 228 # mount -t cgroup -ocpu none /sys/fs/cgroup/cpu
229 # cd /sys/fs/cgroup/cpu
229 230
230 # mkdir multimedia # create "multimedia" group of tasks 231 # mkdir multimedia # create "multimedia" group of tasks
231 # mkdir browser # create "browser" group of tasks 232 # mkdir browser # create "browser" group of tasks
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 605b0d40329d..71b54d549987 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -129,9 +129,8 @@ priority!
129Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real 129Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
130CPU bandwidth to task groups. 130CPU bandwidth to task groups.
131 131
132This uses the /cgroup virtual file system and 132This uses the cgroup virtual file system and "<cgroup>/cpu.rt_runtime_us"
133"/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each 133to control the CPU time reserved for each control group.
134control group.
135 134
136For more information on working with control groups, you should read 135For more information on working with control groups, you should read
137Documentation/cgroups/cgroups.txt as well. 136Documentation/cgroups/cgroups.txt as well.
@@ -150,7 +149,7 @@ For now, this can be simplified to just the following (but see Future plans):
150=============== 149===============
151 150
152There is work in progress to make the scheduling period for each group 151There is work in progress to make the scheduling period for each group
153("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well. 152("<cgroup>/cpu.rt_period_us") configurable as well.
154 153
155The constraint on the period is that a subgroup must have a smaller or 154The constraint on the period is that a subgroup must have a smaller or
156equal period to its parent. But realistically it's not very useful _yet_ 155equal period to its parent. But realistically it's not very useful _yet_
diff --git a/Documentation/spi/ep93xx_spi b/Documentation/spi/ep93xx_spi
index 6325f5b48635..d8eb01c15db1 100644
--- a/Documentation/spi/ep93xx_spi
+++ b/Documentation/spi/ep93xx_spi
@@ -88,6 +88,16 @@ static void __init ts72xx_init_machine(void)
88 ARRAY_SIZE(ts72xx_spi_devices)); 88 ARRAY_SIZE(ts72xx_spi_devices));
89} 89}
90 90
91The driver can use DMA for the transfers also. In this case ts72xx_spi_info
92becomes:
93
94static struct ep93xx_spi_info ts72xx_spi_info = {
95 .num_chipselect = ARRAY_SIZE(ts72xx_spi_devices),
96 .use_dma = true,
97};
98
99Note that CONFIG_EP93XX_DMA should be enabled as well.
100
91Thanks to 101Thanks to
92========= 102=========
93Martin Guy, H. Hartley Sweeten and others who helped me during development of 103Martin Guy, H. Hartley Sweeten and others who helped me during development of
diff --git a/Documentation/spi/pxa2xx b/Documentation/spi/pxa2xx
index 493dada57372..00511e08db78 100644
--- a/Documentation/spi/pxa2xx
+++ b/Documentation/spi/pxa2xx
@@ -22,15 +22,11 @@ Typically a SPI master is defined in the arch/.../mach-*/board-*.c as a
22found in include/linux/spi/pxa2xx_spi.h: 22found in include/linux/spi/pxa2xx_spi.h:
23 23
24struct pxa2xx_spi_master { 24struct pxa2xx_spi_master {
25 enum pxa_ssp_type ssp_type;
26 u32 clock_enable; 25 u32 clock_enable;
27 u16 num_chipselect; 26 u16 num_chipselect;
28 u8 enable_dma; 27 u8 enable_dma;
29}; 28};
30 29
31The "pxa2xx_spi_master.ssp_type" field must have a value between 1 and 3 and
32informs the driver which features a particular SSP supports.
33
34The "pxa2xx_spi_master.clock_enable" field is used to enable/disable the 30The "pxa2xx_spi_master.clock_enable" field is used to enable/disable the
35corresponding SSP peripheral block in the "Clock Enable Register (CKEN"). See 31corresponding SSP peripheral block in the "Clock Enable Register (CKEN"). See
36the "PXA2xx Developer Manual" section "Clocks and Power Management". 32the "PXA2xx Developer Manual" section "Clocks and Power Management".
@@ -61,7 +57,6 @@ static struct resource pxa_spi_nssp_resources[] = {
61}; 57};
62 58
63static struct pxa2xx_spi_master pxa_nssp_master_info = { 59static struct pxa2xx_spi_master pxa_nssp_master_info = {
64 .ssp_type = PXA25x_NSSP, /* Type of SSP */
65 .clock_enable = CKEN_NSSP, /* NSSP Peripheral clock */ 60 .clock_enable = CKEN_NSSP, /* NSSP Peripheral clock */
66 .num_chipselect = 1, /* Matches the number of chips attached to NSSP */ 61 .num_chipselect = 1, /* Matches the number of chips attached to NSSP */
67 .enable_dma = 1, /* Enables NSSP DMA */ 62 .enable_dma = 1, /* Enables NSSP DMA */
diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt
index 2e3c64b1a6a5..9dbe885ecd8d 100644
--- a/Documentation/spinlocks.txt
+++ b/Documentation/spinlocks.txt
@@ -13,18 +13,8 @@ static DEFINE_SPINLOCK(xxx_lock);
13The above is always safe. It will disable interrupts _locally_, but the 13The above is always safe. It will disable interrupts _locally_, but the
14spinlock itself will guarantee the global lock, so it will guarantee that 14spinlock itself will guarantee the global lock, so it will guarantee that
15there is only one thread-of-control within the region(s) protected by that 15there is only one thread-of-control within the region(s) protected by that
16lock. This works well even under UP. The above sequence under UP 16lock. This works well even under UP, so the code does _not_ need to
17essentially is just the same as doing 17worry about UP vs SMP issues: the spinlocks work correctly under both.
18
19 unsigned long flags;
20
21 save_flags(flags); cli();
22 ... critical section ...
23 restore_flags(flags);
24
25so the code does _not_ need to worry about UP vs SMP issues: the spinlocks
26work correctly under both (and spinlocks are actually more efficient on
27architectures that allow doing the "save_flags + cli" in one operation).
28 18
29 NOTE! Implications of spin_locks for memory are further described in: 19 NOTE! Implications of spin_locks for memory are further described in:
30 20
@@ -36,27 +26,7 @@ The above is usually pretty simple (you usually need and want only one
36spinlock for most things - using more than one spinlock can make things a 26spinlock for most things - using more than one spinlock can make things a
37lot more complex and even slower and is usually worth it only for 27lot more complex and even slower and is usually worth it only for
38sequences that you _know_ need to be split up: avoid it at all cost if you 28sequences that you _know_ need to be split up: avoid it at all cost if you
39aren't sure). HOWEVER, it _does_ mean that if you have some code that does 29aren't sure).
40
41 cli();
42 .. critical section ..
43 sti();
44
45and another sequence that does
46
47 spin_lock_irqsave(flags);
48 .. critical section ..
49 spin_unlock_irqrestore(flags);
50
51then they are NOT mutually exclusive, and the critical regions can happen
52at the same time on two different CPU's. That's fine per se, but the
53critical regions had better be critical for different things (ie they
54can't stomp on each other).
55
56The above is a problem mainly if you end up mixing code - for example the
57routines in ll_rw_block() tend to use cli/sti to protect the atomicity of
58their actions, and if a driver uses spinlocks instead then you should
59think about issues like the above.
60 30
61This is really the only really hard part about spinlocks: once you start 31This is really the only really hard part about spinlocks: once you start
62using spinlocks they tend to expand to areas you might not have noticed 32using spinlocks they tend to expand to areas you might not have noticed
@@ -120,11 +90,10 @@ Lesson 3: spinlocks revisited.
120 90
121The single spin-lock primitives above are by no means the only ones. They 91The single spin-lock primitives above are by no means the only ones. They
122are the most safe ones, and the ones that work under all circumstances, 92are the most safe ones, and the ones that work under all circumstances,
123but partly _because_ they are safe they are also fairly slow. They are 93but partly _because_ they are safe they are also fairly slow. They are slower
124much faster than a generic global cli/sti pair, but slower than they'd 94than they'd need to be, because they do have to disable interrupts
125need to be, because they do have to disable interrupts (which is just a 95(which is just a single instruction on a x86, but it's an expensive one -
126single instruction on a x86, but it's an expensive one - and on other 96and on other architectures it can be worse).
127architectures it can be worse).
128 97
129If you have a case where you have to protect a data structure across 98If you have a case where you have to protect a data structure across
130several CPU's and you want to use spinlocks you can potentially use 99several CPU's and you want to use spinlocks you can potentially use
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 5e7cb39ad195..1c7fb0a94e28 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -17,23 +17,21 @@ before actually making adjustments.
17 17
18Currently, these files might (depending on your configuration) 18Currently, these files might (depending on your configuration)
19show up in /proc/sys/kernel: 19show up in /proc/sys/kernel:
20- acpi_video_flags 20
21- acct 21- acct
22- acpi_video_flags
23- auto_msgmni
22- bootloader_type [ X86 only ] 24- bootloader_type [ X86 only ]
23- bootloader_version [ X86 only ] 25- bootloader_version [ X86 only ]
24- callhome [ S390 only ] 26- callhome [ S390 only ]
25- auto_msgmni
26- core_pattern 27- core_pattern
27- core_pipe_limit 28- core_pipe_limit
28- core_uses_pid 29- core_uses_pid
29- ctrl-alt-del 30- ctrl-alt-del
30- dentry-state
31- dmesg_restrict 31- dmesg_restrict
32- domainname 32- domainname
33- hostname 33- hostname
34- hotplug 34- hotplug
35- java-appletviewer [ binfmt_java, obsolete ]
36- java-interpreter [ binfmt_java, obsolete ]
37- kptr_restrict 35- kptr_restrict
38- kstack_depth_to_print [ X86 only ] 36- kstack_depth_to_print [ X86 only ]
39- l2cr [ PPC only ] 37- l2cr [ PPC only ]
@@ -48,10 +46,14 @@ show up in /proc/sys/kernel:
48- overflowgid 46- overflowgid
49- overflowuid 47- overflowuid
50- panic 48- panic
49- panic_on_oops
50- panic_on_unrecovered_nmi
51- pid_max 51- pid_max
52- powersave-nap [ PPC only ] 52- powersave-nap [ PPC only ]
53- panic_on_unrecovered_nmi
54- printk 53- printk
54- printk_delay
55- printk_ratelimit
56- printk_ratelimit_burst
55- randomize_va_space 57- randomize_va_space
56- real-root-dev ==> Documentation/initrd.txt 58- real-root-dev ==> Documentation/initrd.txt
57- reboot-cmd [ SPARC only ] 59- reboot-cmd [ SPARC only ]
@@ -62,6 +64,7 @@ show up in /proc/sys/kernel:
62- shmall 64- shmall
63- shmmax [ sysv ipc ] 65- shmmax [ sysv ipc ]
64- shmmni 66- shmmni
67- softlockup_thresh
65- stop-a [ SPARC only ] 68- stop-a [ SPARC only ]
66- sysrq ==> Documentation/sysrq.txt 69- sysrq ==> Documentation/sysrq.txt
67- tainted 70- tainted
@@ -71,15 +74,6 @@ show up in /proc/sys/kernel:
71 74
72============================================================== 75==============================================================
73 76
74acpi_video_flags:
75
76flags
77
78See Doc*/kernel/power/video.txt, it allows mode of video boot to be
79set during run time.
80
81==============================================================
82
83acct: 77acct:
84 78
85highwater lowwater frequency 79highwater lowwater frequency
@@ -97,6 +91,25 @@ valid for 30 seconds.
97 91
98============================================================== 92==============================================================
99 93
94acpi_video_flags:
95
96flags
97
98See Doc*/kernel/power/video.txt, it allows mode of video boot to be
99set during run time.
100
101==============================================================
102
103auto_msgmni:
104
105Enables/Disables automatic recomputing of msgmni upon memory add/remove
106or upon ipc namespace creation/removal (see the msgmni description
107above). Echoing "1" into this file enables msgmni automatic recomputing.
108Echoing "0" turns it off. auto_msgmni default value is 1.
109
110
111==============================================================
112
100bootloader_type: 113bootloader_type:
101 114
102x86 bootloader identification 115x86 bootloader identification
@@ -172,22 +185,24 @@ core_pattern is used to specify a core dumpfile pattern name.
172 185
173core_pipe_limit: 186core_pipe_limit:
174 187
175This sysctl is only applicable when core_pattern is configured to pipe core 188This sysctl is only applicable when core_pattern is configured to pipe
176files to a user space helper (when the first character of core_pattern is a '|', 189core files to a user space helper (when the first character of
177see above). When collecting cores via a pipe to an application, it is 190core_pattern is a '|', see above). When collecting cores via a pipe
178occasionally useful for the collecting application to gather data about the 191to an application, it is occasionally useful for the collecting
179crashing process from its /proc/pid directory. In order to do this safely, the 192application to gather data about the crashing process from its
180kernel must wait for the collecting process to exit, so as not to remove the 193/proc/pid directory. In order to do this safely, the kernel must wait
181crashing processes proc files prematurely. This in turn creates the possibility 194for the collecting process to exit, so as not to remove the crashing
182that a misbehaving userspace collecting process can block the reaping of a 195processes proc files prematurely. This in turn creates the
183crashed process simply by never exiting. This sysctl defends against that. It 196possibility that a misbehaving userspace collecting process can block
184defines how many concurrent crashing processes may be piped to user space 197the reaping of a crashed process simply by never exiting. This sysctl
185applications in parallel. If this value is exceeded, then those crashing 198defends against that. It defines how many concurrent crashing
186processes above that value are noted via the kernel log and their cores are 199processes may be piped to user space applications in parallel. If
187skipped. 0 is a special value, indicating that unlimited processes may be 200this value is exceeded, then those crashing processes above that value
188captured in parallel, but that no waiting will take place (i.e. the collecting 201are noted via the kernel log and their cores are skipped. 0 is a
189process is not guaranteed access to /proc/<crashing pid>/). This value defaults 202special value, indicating that unlimited processes may be captured in
190to 0. 203parallel, but that no waiting will take place (i.e. the collecting
204process is not guaranteed access to /proc/<crashing pid>/). This
205value defaults to 0.
191 206
192============================================================== 207==============================================================
193 208
@@ -218,14 +233,14 @@ to decide what to do with it.
218 233
219dmesg_restrict: 234dmesg_restrict:
220 235
221This toggle indicates whether unprivileged users are prevented from using 236This toggle indicates whether unprivileged users are prevented
222dmesg(8) to view messages from the kernel's log buffer. When 237from using dmesg(8) to view messages from the kernel's log buffer.
223dmesg_restrict is set to (0) there are no restrictions. When 238When dmesg_restrict is set to (0) there are no restrictions. When
224dmesg_restrict is set to (1), users must have CAP_SYSLOG to use 239dmesg_restrict is set to (1), users must have CAP_SYSLOG to use
225dmesg(8). 240dmesg(8).
226 241
227The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the default 242The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the
228value of dmesg_restrict. 243default value of dmesg_restrict.
229 244
230============================================================== 245==============================================================
231 246
@@ -256,13 +271,6 @@ Default value is "/sbin/hotplug".
256 271
257============================================================== 272==============================================================
258 273
259l2cr: (PPC only)
260
261This flag controls the L2 cache of G3 processor boards. If
2620, the cache is disabled. Enabled if nonzero.
263
264==============================================================
265
266kptr_restrict: 274kptr_restrict:
267 275
268This toggle indicates whether restrictions are placed on 276This toggle indicates whether restrictions are placed on
@@ -283,6 +291,13 @@ kernel stack.
283 291
284============================================================== 292==============================================================
285 293
294l2cr: (PPC only)
295
296This flag controls the L2 cache of G3 processor boards. If
2970, the cache is disabled. Enabled if nonzero.
298
299==============================================================
300
286modules_disabled: 301modules_disabled:
287 302
288A toggle value indicating if modules are allowed to be loaded 303A toggle value indicating if modules are allowed to be loaded
@@ -293,6 +308,21 @@ to false.
293 308
294============================================================== 309==============================================================
295 310
311nmi_watchdog:
312
313Enables/Disables the NMI watchdog on x86 systems. When the value is
314non-zero the NMI watchdog is enabled and will continuously test all
315online cpus to determine whether or not they are still functioning
316properly. Currently, passing "nmi_watchdog=" parameter at boot time is
317required for this function to work.
318
319If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel
320parameter), the NMI watchdog shares registers with oprofile. By
321disabling the NMI watchdog, oprofile may have more registers to
322utilize.
323
324==============================================================
325
296osrelease, ostype & version: 326osrelease, ostype & version:
297 327
298# cat osrelease 328# cat osrelease
@@ -312,10 +342,10 @@ The only way to tune these values is to rebuild the kernel :-)
312 342
313overflowgid & overflowuid: 343overflowgid & overflowuid:
314 344
315if your architecture did not always support 32-bit UIDs (i.e. arm, i386, 345if your architecture did not always support 32-bit UIDs (i.e. arm,
316m68k, sh, and sparc32), a fixed UID and GID will be returned to 346i386, m68k, sh, and sparc32), a fixed UID and GID will be returned to
317applications that use the old 16-bit UID/GID system calls, if the actual 347applications that use the old 16-bit UID/GID system calls, if the
318UID or GID would exceed 65535. 348actual UID or GID would exceed 65535.
319 349
320These sysctls allow you to change the value of the fixed UID and GID. 350These sysctls allow you to change the value of the fixed UID and GID.
321The default is 65534. 351The default is 65534.
@@ -324,9 +354,22 @@ The default is 65534.
324 354
325panic: 355panic:
326 356
327The value in this file represents the number of seconds the 357The value in this file represents the number of seconds the kernel
328kernel waits before rebooting on a panic. When you use the 358waits before rebooting on a panic. When you use the software watchdog,
329software watchdog, the recommended setting is 60. 359the recommended setting is 60.
360
361==============================================================
362
363panic_on_unrecovered_nmi:
364
365The default Linux behaviour on an NMI of either memory or unknown is
366to continue operation. For many environments such as scientific
367computing it is preferable that the box is taken out and the error
368dealt with than an uncorrected parity/ECC error get propagated.
369
370A small number of systems do generate NMI's for bizarre random reasons
371such as power management so the default is off. That sysctl works like
372the existing panic controls already in that directory.
330 373
331============================================================== 374==============================================================
332 375
@@ -376,6 +419,14 @@ the different loglevels.
376 419
377============================================================== 420==============================================================
378 421
422printk_delay:
423
424Delay each printk message in printk_delay milliseconds
425
426Value from 0 - 10000 is allowed.
427
428==============================================================
429
379printk_ratelimit: 430printk_ratelimit:
380 431
381Some warning messages are rate limited. printk_ratelimit specifies 432Some warning messages are rate limited. printk_ratelimit specifies
@@ -395,15 +446,7 @@ send before ratelimiting kicks in.
395 446
396============================================================== 447==============================================================
397 448
398printk_delay: 449randomize_va_space:
399
400Delay each printk message in printk_delay milliseconds
401
402Value from 0 - 10000 is allowed.
403
404==============================================================
405
406randomize-va-space:
407 450
408This option can be used to select the type of process address 451This option can be used to select the type of process address
409space randomization that is used in the system, for architectures 452space randomization that is used in the system, for architectures
@@ -466,11 +509,11 @@ are doing anyway :)
466 509
467============================================================== 510==============================================================
468 511
469shmmax: 512shmmax:
470 513
471This value can be used to query and set the run time limit 514This value can be used to query and set the run time limit
472on the maximum shared memory segment size that can be created. 515on the maximum shared memory segment size that can be created.
473Shared memory segments up to 1Gb are now supported in the 516Shared memory segments up to 1Gb are now supported in the
474kernel. This value defaults to SHMMAX. 517kernel. This value defaults to SHMMAX.
475 518
476============================================================== 519==============================================================
@@ -484,7 +527,7 @@ tunable to zero will disable the softlockup detection altogether.
484 527
485============================================================== 528==============================================================
486 529
487tainted: 530tainted:
488 531
489Non-zero if the kernel has been tainted. Numeric values, which 532Non-zero if the kernel has been tainted. Numeric values, which
490can be ORed together: 533can be ORed together:
@@ -509,49 +552,11 @@ can be ORed together:
509 552
510============================================================== 553==============================================================
511 554
512auto_msgmni:
513
514Enables/Disables automatic recomputing of msgmni upon memory add/remove or
515upon ipc namespace creation/removal (see the msgmni description above).
516Echoing "1" into this file enables msgmni automatic recomputing.
517Echoing "0" turns it off.
518auto_msgmni default value is 1.
519
520==============================================================
521
522nmi_watchdog:
523
524Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero
525the NMI watchdog is enabled and will continuously test all online cpus to
526determine whether or not they are still functioning properly. Currently,
527passing "nmi_watchdog=" parameter at boot time is required for this function
528to work.
529
530If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel parameter), the
531NMI watchdog shares registers with oprofile. By disabling the NMI watchdog,
532oprofile may have more registers to utilize.
533
534==============================================================
535
536unknown_nmi_panic: 555unknown_nmi_panic:
537 556
538The value in this file affects behavior of handling NMI. When the value is 557The value in this file affects behavior of handling NMI. When the
539non-zero, unknown NMI is trapped and then panic occurs. At that time, kernel 558value is non-zero, unknown NMI is trapped and then panic occurs. At
540debugging information is displayed on console. 559that time, kernel debugging information is displayed on console.
541
542NMI switch that most IA32 servers have fires unknown NMI up, for example.
543If a system hangs up, try pressing the NMI switch.
544
545==============================================================
546
547panic_on_unrecovered_nmi:
548
549The default Linux behaviour on an NMI of either memory or unknown is to continue
550operation. For many environments such as scientific computing it is preferable
551that the box is taken out and the error dealt with than an uncorrected
552parity/ECC error get propogated.
553
554A small number of systems do generate NMI's for bizarre random reasons such as
555power management so the default is off. That sysctl works like the existing
556panic controls already in that directory.
557 560
561NMI switch that most IA32 servers have fires unknown NMI up, for
562example. If a system hangs up, try pressing the NMI switch.
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index c83bd6b4e6e8..d0d0bb9e3e25 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -22,14 +22,15 @@ current_tracer. Instead of that, add probe points via
22 22
23Synopsis of kprobe_events 23Synopsis of kprobe_events
24------------------------- 24-------------------------
25 p[:[GRP/]EVENT] SYMBOL[+offs]|MEMADDR [FETCHARGS] : Set a probe 25 p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe
26 r[:[GRP/]EVENT] SYMBOL[+0] [FETCHARGS] : Set a return probe 26 r[:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe
27 -:[GRP/]EVENT : Clear a probe 27 -:[GRP/]EVENT : Clear a probe
28 28
29 GRP : Group name. If omitted, use "kprobes" for it. 29 GRP : Group name. If omitted, use "kprobes" for it.
30 EVENT : Event name. If omitted, the event name is generated 30 EVENT : Event name. If omitted, the event name is generated
31 based on SYMBOL+offs or MEMADDR. 31 based on SYM+offs or MEMADDR.
32 SYMBOL[+offs] : Symbol+offset where the probe is inserted. 32 MOD : Module name which has given SYM.
33 SYM[+offs] : Symbol+offset where the probe is inserted.
33 MEMADDR : Address where the probe is inserted. 34 MEMADDR : Address where the probe is inserted.
34 35
35 FETCHARGS : Arguments. Each probe can have up to 128 args. 36 FETCHARGS : Arguments. Each probe can have up to 128 args.
diff --git a/Documentation/usb/error-codes.txt b/Documentation/usb/error-codes.txt
index d83703ea74b2..b3f606b81a03 100644
--- a/Documentation/usb/error-codes.txt
+++ b/Documentation/usb/error-codes.txt
@@ -76,6 +76,13 @@ A transfer's actual_length may be positive even when an error has been
76reported. That's because transfers often involve several packets, so that 76reported. That's because transfers often involve several packets, so that
77one or more packets could finish before an error stops further endpoint I/O. 77one or more packets could finish before an error stops further endpoint I/O.
78 78
79For isochronous URBs, the urb status value is non-zero only if the URB is
80unlinked, the device is removed, the host controller is disabled, or the total
81transferred length is less than the requested length and the URB_SHORT_NOT_OK
82flag is set. Completion handlers for isochronous URBs should only see
83urb->status set to zero, -ENOENT, -ECONNRESET, -ESHUTDOWN, or -EREMOTEIO.
84Individual frame descriptor status fields may report more status codes.
85
79 86
800 Transfer completed successfully 870 Transfer completed successfully
81 88
@@ -132,7 +139,7 @@ one or more packets could finish before an error stops further endpoint I/O.
132 device removal events immediately. 139 device removal events immediately.
133 140
134-EXDEV ISO transfer only partially completed 141-EXDEV ISO transfer only partially completed
135 look at individual frame status for details 142 (only set in iso_frame_desc[n].status, not urb->status)
136 143
137-EINVAL ISO madness, if this happens: Log off and go home 144-EINVAL ISO madness, if this happens: Log off and go home
138 145
diff --git a/Documentation/vDSO/parse_vdso.c b/Documentation/vDSO/parse_vdso.c
new file mode 100644
index 000000000000..85870208edcf
--- /dev/null
+++ b/Documentation/vDSO/parse_vdso.c
@@ -0,0 +1,256 @@
1/*
2 * parse_vdso.c: Linux reference vDSO parser
3 * Written by Andrew Lutomirski, 2011.
4 *
5 * This code is meant to be linked in to various programs that run on Linux.
6 * As such, it is available with as few restrictions as possible. This file
7 * is licensed under the Creative Commons Zero License, version 1.0,
8 * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode
9 *
10 * The vDSO is a regular ELF DSO that the kernel maps into user space when
11 * it starts a program. It works equally well in statically and dynamically
12 * linked binaries.
13 *
14 * This code is tested on x86_64. In principle it should work on any 64-bit
15 * architecture that has a vDSO.
16 */
17
18#include <stdbool.h>
19#include <stdint.h>
20#include <string.h>
21#include <elf.h>
22
23/*
24 * To use this vDSO parser, first call one of the vdso_init_* functions.
25 * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR
26 * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv.
27 * Then call vdso_sym for each symbol you want. For example, to look up
28 * gettimeofday on x86_64, use:
29 *
30 * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday");
31 * or
32 * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
33 *
34 * vdso_sym will return 0 if the symbol doesn't exist or if the init function
35 * failed or was not called. vdso_sym is a little slow, so its return value
36 * should be cached.
37 *
38 * vdso_sym is threadsafe; the init functions are not.
39 *
40 * These are the prototypes:
41 */
42extern void vdso_init_from_auxv(void *auxv);
43extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
44extern void *vdso_sym(const char *version, const char *name);
45
46
47/* And here's the code. */
48
49#ifndef __x86_64__
50# error Not yet ported to non-x86_64 architectures
51#endif
52
/*
 * Parser state.  It is filled in by vdso_init_from_sysinfo_ehdr() and read
 * by vdso_sym(), which refuses to look anything up unless 'valid' is true.
 */
static struct vdso_info {
	bool valid;

	/* Where the vDSO image is mapped */
	uintptr_t load_addr;
	uintptr_t load_offset;	/* load_addr - recorded vaddr */

	/* Dynamic symbols, their string table and the DT_HASH table */
	Elf64_Sym *symtab;
	const char *symstrings;
	Elf64_Word *bucket;
	Elf64_Word *chain;
	Elf64_Word nbucket;
	Elf64_Word nchain;

	/* Symbol versioning; versym is cleared when there is no verdef */
	Elf64_Versym *versym;
	Elf64_Verdef *verdef;
} vdso_info;
71
/*
 * SysV ELF symbol hash, following the ELF specification.
 *
 * The accumulator is deliberately 32 bits wide.  With a 64-bit
 * "unsigned long", "(h << 4) + c" can carry into bit 32; because only bits
 * 28-31 are ever folded back and cleared, that carry would survive into the
 * return value and make "hash % nbucket" select the wrong bucket.
 *
 * name: NUL-terminated symbol or version name.
 * Returns the hash value; it always fits in 28 bits.
 */
static unsigned long elf_hash(const unsigned char *name)
{
	uint32_t h = 0, g;

	while (*name) {
		h = (h << 4) + *name++;
		g = h & 0xf0000000;
		if (g)
			h ^= g >> 24;
		h &= ~g;
	}
	return h;
}
85
86void vdso_init_from_sysinfo_ehdr(uintptr_t base)
87{
88 size_t i;
89 bool found_vaddr = false;
90
91 vdso_info.valid = false;
92
93 vdso_info.load_addr = base;
94
95 Elf64_Ehdr *hdr = (Elf64_Ehdr*)base;
96 Elf64_Phdr *pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff);
97 Elf64_Dyn *dyn = 0;
98
99 /*
100 * We need two things from the segment table: the load offset
101 * and the dynamic table.
102 */
103 for (i = 0; i < hdr->e_phnum; i++)
104 {
105 if (pt[i].p_type == PT_LOAD && !found_vaddr) {
106 found_vaddr = true;
107 vdso_info.load_offset = base
108 + (uintptr_t)pt[i].p_offset
109 - (uintptr_t)pt[i].p_vaddr;
110 } else if (pt[i].p_type == PT_DYNAMIC) {
111 dyn = (Elf64_Dyn*)(base + pt[i].p_offset);
112 }
113 }
114
115 if (!found_vaddr || !dyn)
116 return; /* Failed */
117
118 /*
119 * Fish out the useful bits of the dynamic table.
120 */
121 Elf64_Word *hash = 0;
122 vdso_info.symstrings = 0;
123 vdso_info.symtab = 0;
124 vdso_info.versym = 0;
125 vdso_info.verdef = 0;
126 for (i = 0; dyn[i].d_tag != DT_NULL; i++) {
127 switch (dyn[i].d_tag) {
128 case DT_STRTAB:
129 vdso_info.symstrings = (const char *)
130 ((uintptr_t)dyn[i].d_un.d_ptr
131 + vdso_info.load_offset);
132 break;
133 case DT_SYMTAB:
134 vdso_info.symtab = (Elf64_Sym *)
135 ((uintptr_t)dyn[i].d_un.d_ptr
136 + vdso_info.load_offset);
137 break;
138 case DT_HASH:
139 hash = (Elf64_Word *)
140 ((uintptr_t)dyn[i].d_un.d_ptr
141 + vdso_info.load_offset);
142 break;
143 case DT_VERSYM:
144 vdso_info.versym = (Elf64_Versym *)
145 ((uintptr_t)dyn[i].d_un.d_ptr
146 + vdso_info.load_offset);
147 break;
148 case DT_VERDEF:
149 vdso_info.verdef = (Elf64_Verdef *)
150 ((uintptr_t)dyn[i].d_un.d_ptr
151 + vdso_info.load_offset);
152 break;
153 }
154 }
155 if (!vdso_info.symstrings || !vdso_info.symtab || !hash)
156 return; /* Failed */
157
158 if (!vdso_info.verdef)
159 vdso_info.versym = 0;
160
161 /* Parse the hash table header. */
162 vdso_info.nbucket = hash[0];
163 vdso_info.nchain = hash[1];
164 vdso_info.bucket = &hash[2];
165 vdso_info.chain = &hash[vdso_info.nbucket + 2];
166
167 /* That's all we need. */
168 vdso_info.valid = true;
169}
170
171static bool vdso_match_version(Elf64_Versym ver,
172 const char *name, Elf64_Word hash)
173{
174 /*
175 * This is a helper function to check if the version indexed by
176 * ver matches name (which hashes to hash).
177 *
178 * The version definition table is a mess, and I don't know how
179 * to do this in better than linear time without allocating memory
180 * to build an index. I also don't know why the table has
181 * variable size entries in the first place.
182 *
183 * For added fun, I can't find a comprehensible specification of how
184 * to parse all the weird flags in the table.
185 *
186 * So I just parse the whole table every time.
187 */
188
189 /* First step: find the version definition */
190 ver &= 0x7fff; /* Apparently bit 15 means "hidden" */
191 Elf64_Verdef *def = vdso_info.verdef;
192 while(true) {
193 if ((def->vd_flags & VER_FLG_BASE) == 0
194 && (def->vd_ndx & 0x7fff) == ver)
195 break;
196
197 if (def->vd_next == 0)
198 return false; /* No definition. */
199
200 def = (Elf64_Verdef *)((char *)def + def->vd_next);
201 }
202
203 /* Now figure out whether it matches. */
204 Elf64_Verdaux *aux = (Elf64_Verdaux*)((char *)def + def->vd_aux);
205 return def->vd_hash == hash
206 && !strcmp(name, vdso_info.symstrings + aux->vda_name);
207}
208
209void *vdso_sym(const char *version, const char *name)
210{
211 unsigned long ver_hash;
212 if (!vdso_info.valid)
213 return 0;
214
215 ver_hash = elf_hash(version);
216 Elf64_Word chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];
217
218 for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
219 Elf64_Sym *sym = &vdso_info.symtab[chain];
220
221 /* Check for a defined global or weak function w/ right name. */
222 if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
223 continue;
224 if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
225 ELF64_ST_BIND(sym->st_info) != STB_WEAK)
226 continue;
227 if (sym->st_shndx == SHN_UNDEF)
228 continue;
229 if (strcmp(name, vdso_info.symstrings + sym->st_name))
230 continue;
231
232 /* Check symbol version. */
233 if (vdso_info.versym
234 && !vdso_match_version(vdso_info.versym[chain],
235 version, ver_hash))
236 continue;
237
238 return (void *)(vdso_info.load_offset + sym->st_value);
239 }
240
241 return 0;
242}
243
244void vdso_init_from_auxv(void *auxv)
245{
246 Elf64_auxv_t *elf_auxv = auxv;
247 for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++)
248 {
249 if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) {
250 vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val);
251 return;
252 }
253 }
254
255 vdso_info.valid = false;
256}
diff --git a/Documentation/vDSO/vdso_test.c b/Documentation/vDSO/vdso_test.c
new file mode 100644
index 000000000000..fff633432dff
--- /dev/null
+++ b/Documentation/vDSO/vdso_test.c
@@ -0,0 +1,111 @@
1/*
2 * vdso_test.c: Sample code to test parse_vdso.c on x86_64
3 * Copyright (c) 2011 Andy Lutomirski
4 * Subject to the GNU General Public License, version 2
5 *
6 * You can amuse yourself by compiling with:
7 * gcc -std=gnu99 -nostdlib
8 * -Os -fno-asynchronous-unwind-tables -flto
9 * vdso_test.c parse_vdso.c -o vdso_test
10 * to generate a small binary with no dependencies at all.
11 */
12
13#include <sys/syscall.h>
14#include <sys/time.h>
15#include <unistd.h>
16#include <stdint.h>
17
18extern void *vdso_sym(const char *version, const char *name);
19extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
20extern void vdso_init_from_auxv(void *auxv);
21
/* We need a libc function... */

/*
 * strcmp - freestanding replacement for the libc routine.
 *
 * Compares the NUL-terminated strings a and b byte by byte, treating the
 * bytes as unsigned char.
 *
 * Returns 0 when the strings are equal, a negative value when a sorts
 * before b, and a positive value when a sorts after b.
 */
int strcmp(const char *a, const char *b)
{
	/* Advance while both strings agree and neither has ended. */
	while (*a && *a == *b) {
		a++;
		b++;
	}

	/* Difference of the first mismatching bytes (or of the two NULs). */
	return (unsigned char)*a - (unsigned char)*b;
}
37
38/* ...and two syscalls. This is x86_64-specific. */
39static inline long linux_write(int fd, const void *data, size_t len)
40{
41
42 long ret;
43 asm volatile ("syscall" : "=a" (ret) : "a" (__NR_write),
44 "D" (fd), "S" (data), "d" (len) :
45 "cc", "memory", "rcx",
46 "r8", "r9", "r10", "r11" );
47 return ret;
48}
49
50static inline void linux_exit(int code)
51{
52 asm volatile ("syscall" : : "a" (__NR_exit), "D" (code));
53}
54
/*
 * to_base10 - write n in decimal, right-aligned, ending at *lastdig.
 *
 * lastdig: address that receives the least significant digit; more
 *          significant digits are stored at successively lower addresses
 * n:       value to format
 *
 * No terminator is written and no bounds are checked: the caller must
 * provide enough room before lastdig.  At least one digit is always
 * written, so n == 0 produces "0" instead of leaving the field untouched.
 */
void to_base10(char *lastdig, uint64_t n)
{
	do {
		*lastdig-- = (n % 10) + '0';
		n /= 10;
	} while (n);
}
63
/*
 * c_main - C entry point, reached from _start.
 *
 * stack: the initial stack pointer; *stack holds argc, followed by the
 *        argv pointers, a NULL, the environment pointers, a NULL, and
 *        then the auxiliary vector.
 *
 * Locates __vdso_gettimeofday through the vDSO parser, calls it, and
 * prints the result as "seconds.microseconds".  Exits with status 1 if
 * the symbol cannot be found, with the call's return value if the call
 * fails, and with 0 otherwise.
 */
__attribute__((externally_visible)) void c_main(void **stack)
{
	/* Parse the stack: skip argc, the argc argv slots and their NULL. */
	long argc = (long)*stack;
	stack += argc + 2;

	/* Now we're pointing at the environment.  Skip it and its NULL. */
	while (*stack)
		stack++;
	stack++;

	/* Now we're pointing at auxv.  Initialize the vDSO parser. */
	vdso_init_from_auxv((void *)stack);

	/* Find gettimeofday. */
	typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
	gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");

	if (!gtod)
		linux_exit(1);

	struct timeval tv;
	long ret = gtod(&tv, 0);

	if (ret == 0) {
		/*
		 * Output template.  Layout by index:
		 *   0..10  "The time is"
		 *  11..31  21 spaces; the seconds are written right-aligned
		 *          so that their last digit lands on index 31
		 *  32      '.'
		 *  33..38  six microsecond digits, pre-filled with '0'
		 *  39      '\n'
		 * The padding must stay in step with the two offsets below,
		 * otherwise to_base10() writes outside buf.
		 */
		enum { SEC_LAST_DIGIT = 31, USEC_LAST_DIGIT = 38 };
		char buf[] = "The time is"
			     "       " "       " "       "	/* 3 x 7 spaces */
			     ".000000\n";

		to_base10(buf + SEC_LAST_DIGIT, tv.tv_sec);
		to_base10(buf + USEC_LAST_DIGIT, tv.tv_usec);
		linux_write(1, buf, sizeof(buf) - 1);
	} else {
		linux_exit(ret);
	}

	linux_exit(0);
}
99
100/*
101 * This is the real entry point. It passes the initial stack into
102 * the C entry point.
103 */
104asm (
105 ".text\n"
106 ".global _start\n"
107 ".type _start,@function\n"
108 "_start:\n\t"
109 "mov %rsp,%rdi\n\t"
110 "jmp c_main"
111 );
diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c
index cd9d6af61d07..043bd7df3139 100644
--- a/Documentation/virtual/lguest/lguest.c
+++ b/Documentation/virtual/lguest/lguest.c
@@ -51,7 +51,7 @@
51#include <asm/bootparam.h> 51#include <asm/bootparam.h>
52#include "../../../include/linux/lguest_launcher.h" 52#include "../../../include/linux/lguest_launcher.h"
53/*L:110 53/*L:110
54 * We can ignore the 42 include files we need for this program, but I do want 54 * We can ignore the 43 include files we need for this program, but I do want
55 * to draw attention to the use of kernel-style types. 55 * to draw attention to the use of kernel-style types.
56 * 56 *
57 * As Linus said, "C is a Spartan language, and so should your naming be." I 57 * As Linus said, "C is a Spartan language, and so should your naming be." I
@@ -65,7 +65,6 @@ typedef uint16_t u16;
65typedef uint8_t u8; 65typedef uint8_t u8;
66/*:*/ 66/*:*/
67 67
68#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
69#define BRIDGE_PFX "bridge:" 68#define BRIDGE_PFX "bridge:"
70#ifndef SIOCBRADDIF 69#ifndef SIOCBRADDIF
71#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 70#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
@@ -861,8 +860,10 @@ static void console_output(struct virtqueue *vq)
861 /* writev can return a partial write, so we loop here. */ 860 /* writev can return a partial write, so we loop here. */
862 while (!iov_empty(iov, out)) { 861 while (!iov_empty(iov, out)) {
863 int len = writev(STDOUT_FILENO, iov, out); 862 int len = writev(STDOUT_FILENO, iov, out);
864 if (len <= 0) 863 if (len <= 0) {
865 err(1, "Write to stdout gave %i", len); 864 warn("Write to stdout gave %i (%d)", len, errno);
865 break;
866 }
866 iov_consume(iov, out, len); 867 iov_consume(iov, out, len);
867 } 868 }
868 869
@@ -898,7 +899,7 @@ static void net_output(struct virtqueue *vq)
898 * same format: what a coincidence! 899 * same format: what a coincidence!
899 */ 900 */
900 if (writev(net_info->tunfd, iov, out) < 0) 901 if (writev(net_info->tunfd, iov, out) < 0)
901 errx(1, "Write to tun failed?"); 902 warnx("Write to tun failed (%d)?", errno);
902 903
903 /* 904 /*
904 * Done with that one; wait_for_vq_desc() will send the interrupt if 905 * Done with that one; wait_for_vq_desc() will send the interrupt if
@@ -955,7 +956,7 @@ static void net_input(struct virtqueue *vq)
955 */ 956 */
956 len = readv(net_info->tunfd, iov, in); 957 len = readv(net_info->tunfd, iov, in);
957 if (len <= 0) 958 if (len <= 0)
958 err(1, "Failed to read from tun."); 959 warn("Failed to read from tun (%d).", errno);
959 960
960 /* 961 /*
961 * Mark that packet buffer as used, but don't interrupt here. We want 962 * Mark that packet buffer as used, but don't interrupt here. We want
@@ -1093,9 +1094,10 @@ static void update_device_status(struct device *dev)
1093 warnx("Device %s configuration FAILED", dev->name); 1094 warnx("Device %s configuration FAILED", dev->name);
1094 if (dev->running) 1095 if (dev->running)
1095 reset_device(dev); 1096 reset_device(dev);
1096 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { 1097 } else {
1097 if (!dev->running) 1098 if (dev->running)
1098 start_device(dev); 1099 err(1, "Device %s features finalized twice", dev->name);
1100 start_device(dev);
1099 } 1101 }
1100} 1102}
1101 1103
@@ -1120,25 +1122,11 @@ static void handle_output(unsigned long addr)
1120 return; 1122 return;
1121 } 1123 }
1122 1124
1123 /* 1125 /* Devices should not be used before features are finalized. */
1124 * Devices *can* be used before status is set to DRIVER_OK.
1125 * The original plan was that they would never do this: they
1126 * would always finish setting up their status bits before
1127 * actually touching the virtqueues. In practice, we allowed
1128 * them to, and they do (eg. the disk probes for partition
1129 * tables as part of initialization).
1130 *
1131 * If we see this, we start the device: once it's running, we
1132 * expect the device to catch all the notifications.
1133 */
1134 for (vq = i->vq; vq; vq = vq->next) { 1126 for (vq = i->vq; vq; vq = vq->next) {
1135 if (addr != vq->config.pfn*getpagesize()) 1127 if (addr != vq->config.pfn*getpagesize())
1136 continue; 1128 continue;
1137 if (i->running) 1129 errx(1, "Notification on %s before setup!", i->name);
1138 errx(1, "Notification on running %s", i->name);
1139 /* This just calls create_thread() for each virtqueue */
1140 start_device(i);
1141 return;
1142 } 1130 }
1143 } 1131 }
1144 1132
@@ -1370,7 +1358,7 @@ static void setup_console(void)
1370 * --sharenet=<name> option which opens or creates a named pipe. This can be 1358 * --sharenet=<name> option which opens or creates a named pipe. This can be
1371 * used to send packets to another guest in a 1:1 manner. 1359 * used to send packets to another guest in a 1:1 manner.
1372 * 1360 *
1373 * More sopisticated is to use one of the tools developed for project like UML 1361 * More sophisticated is to use one of the tools developed for project like UML
1374 * to do networking. 1362 * to do networking.
1375 * 1363 *
1376 * Faster is to do virtio bonding in kernel. Doing this 1:1 would be 1364 * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
@@ -1380,7 +1368,7 @@ static void setup_console(void)
1380 * multiple inter-guest channels behind one interface, although it would 1368 * multiple inter-guest channels behind one interface, although it would
1381 * require some manner of hotplugging new virtio channels. 1369 * require some manner of hotplugging new virtio channels.
1382 * 1370 *
1383 * Finally, we could implement a virtio network switch in the kernel. 1371 * Finally, we could use a virtio network switch in the kernel, ie. vhost.
1384:*/ 1372:*/
1385 1373
1386static u32 str2ip(const char *ipaddr) 1374static u32 str2ip(const char *ipaddr)
@@ -2017,10 +2005,7 @@ int main(int argc, char *argv[])
2017 /* Tell the entry path not to try to reload segment registers. */ 2005 /* Tell the entry path not to try to reload segment registers. */
2018 boot->hdr.loadflags |= KEEP_SEGMENTS; 2006 boot->hdr.loadflags |= KEEP_SEGMENTS;
2019 2007
2020 /* 2008 /* We tell the kernel to initialize the Guest. */
2021 * We tell the kernel to initialize the Guest: this returns the open
2022 * /dev/lguest file descriptor.
2023 */
2024 tell_kernel(start); 2009 tell_kernel(start);
2025 2010
2026 /* Ensure that we terminate if a device-servicing child dies. */ 2011 /* Ensure that we terminate if a device-servicing child dies. */
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
index 12f9ba20ccb7..550068466605 100644
--- a/Documentation/vm/hwpoison.txt
+++ b/Documentation/vm/hwpoison.txt
@@ -129,12 +129,12 @@ Limit injection to pages owned by memgroup. Specified by inode number
129of the memcg. 129of the memcg.
130 130
131Example: 131Example:
132 mkdir /cgroup/hwpoison 132 mkdir /sys/fs/cgroup/mem/hwpoison
133 133
134 usemem -m 100 -s 1000 & 134 usemem -m 100 -s 1000 &
135 echo `jobs -p` > /cgroup/hwpoison/tasks 135 echo `jobs -p` > /sys/fs/cgroup/mem/hwpoison/tasks
136 136
137 memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ') 137 memcg_ino=$(ls -id /sys/fs/cgroup/mem/hwpoison | cut -f1 -d' ')
138 echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg 138 echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg
139 139
140 page-types -p `pidof init` --hwpoison # shall do nothing 140 page-types -p `pidof init` --hwpoison # shall do nothing
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 9b7221a86df2..7c3a8801b7ce 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -674,7 +674,7 @@ Protocol: 2.10+
674 674
675Field name: init_size 675Field name: init_size
676Type: read 676Type: read
677Offset/size: 0x25c/4 677Offset/size: 0x260/4
678 678
679 This field indicates the amount of linear contiguous memory starting 679 This field indicates the amount of linear contiguous memory starting
680 at the kernel runtime start address that the kernel needs before it 680 at the kernel runtime start address that the kernel needs before it
diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt
new file mode 100644
index 000000000000..7869f14d055c
--- /dev/null
+++ b/Documentation/x86/entry_64.txt
@@ -0,0 +1,98 @@
1This file documents some of the kernel entries in
2arch/x86/kernel/entry_64.S. A lot of this explanation is adapted from
3an email from Ingo Molnar:
4
5http://lkml.kernel.org/r/<20110529191055.GC9835%40elte.hu>
6
7The x86 architecture has quite a few different ways to jump into
8kernel code. Most of these entry points are registered in
9arch/x86/kernel/traps.c and implemented in arch/x86/kernel/entry_64.S
10and arch/x86/ia32/ia32entry.S.
11
12The IDT vector assignments are listed in arch/x86/include/asm/irq_vectors.h.
13
14Some of these entries are:
15
16 - system_call: syscall instruction from 64-bit code.
17
18 - ia32_syscall: int 0x80 from 32-bit or 64-bit code; compat syscall
19 either way.
20
21 - ia32_cstar, ia32_sysenter: syscall and sysenter from 32-bit
22 code
23
24 - interrupt: An array of entries. Every IDT vector that doesn't
25 explicitly point somewhere else gets set to the corresponding
26 value in interrupt. These point to a whole array of
27 magically-generated functions that make their way to do_IRQ with
28 the interrupt number as a parameter.
29
30 - emulate_vsyscall: int 0xcc, a special non-ABI entry used by
31 vsyscall emulation.
32
33 - APIC interrupts: Various special-purpose interrupts for things
34 like TLB shootdown.
35
36 - Architecturally-defined exceptions like divide_error.
37
38There are a few complexities here. The different x86-64 entries
39have different calling conventions. The syscall and sysenter
40instructions have their own peculiar calling conventions. Some of
41the IDT entries push an error code onto the stack; others don't.
42IDT entries using the IST alternative stack mechanism need their own
43magic to get the stack frames right. (You can find some
44documentation in the AMD APM, Volume 2, Chapter 8 and the Intel SDM,
45Volume 3, Chapter 6.)
46
47Dealing with the swapgs instruction is especially tricky. Swapgs
48toggles whether gs is the kernel gs or the user gs. The swapgs
49instruction is rather fragile: it must nest perfectly and only in
50single depth, it should only be used if entering from user mode to
51kernel mode and then when returning to user-space, and precisely
52so. If we mess that up even slightly, we crash.
53
54So when we have a secondary entry, already in kernel mode, we *must
55not* use SWAPGS blindly - nor must we forget doing a SWAPGS when it's
56not switched/swapped yet.
57
58Now, there's a secondary complication: there's a cheap way to test
59which mode the CPU is in and an expensive way.
60
61The cheap way is to pick this info off the entry frame on the kernel
62stack, from the CS of the ptregs area of the kernel stack:
63
64 xorl %ebx,%ebx
65 testl $3,CS+8(%rsp)
66 je error_kernelspace
67 SWAPGS
68
69The expensive (paranoid) way is to read back the MSR_GS_BASE value
70(which is what SWAPGS modifies):
71
72 movl $1,%ebx
73 movl $MSR_GS_BASE,%ecx
74 rdmsr
75 testl %edx,%edx
76 js 1f /* negative -> in kernel */
77 SWAPGS
78 xorl %ebx,%ebx
791: ret
80
81and the whole paranoid/non-paranoid macro complexity is about whether
82to suffer that RDMSR cost.
83
84If we are at an interrupt or user-trap/gate-alike boundary then we can
85use the faster check: the stack will be a reliable indicator of
86whether SWAPGS was already done: if we see that we are a secondary
87entry interrupting kernel mode execution, then we know that the GS
88base has already been switched. If it says that we interrupted
89user-space execution then we must do the SWAPGS.
90
91But if we are in an NMI/MCE/DEBUG/whatever super-atomic entry context,
92which might have triggered right after a normal entry wrote CS to the
93stack but before we executed SWAPGS, then the only safe way to check
94for GS is the slower method: the RDMSR.
95
96So we try only to mark those entry methods 'paranoid' that absolutely
97need the more expensive check for the GS base - and we generate all
98'normal' entry points with the regular (faster) entry macros.
diff --git a/Documentation/zh_CN/SubmitChecklist b/Documentation/zh_CN/SubmitChecklist
index 951415bbab0c..4c741d6bc048 100644
--- a/Documentation/zh_CN/SubmitChecklist
+++ b/Documentation/zh_CN/SubmitChecklist
@@ -67,7 +67,7 @@ Linux内核提交清单
67 67
6812:已经通过CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, 6812:已经通过CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT,
69 CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, 69 CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES,
70 CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP测试,并且同时都 70 CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP测试,并且同时都
71 使能。 71 使能。
72 72
7313:已经都构建并且使用或者不使用 CONFIG_SMP 和 CONFIG_PREEMPT测试执行时间。 7313:已经都构建并且使用或者不使用 CONFIG_SMP 和 CONFIG_PREEMPT测试执行时间。