aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2009-04-21 23:02:09 -0400
committerPaul Mackerras <paulus@samba.org>2009-04-21 23:02:09 -0400
commit5bd3ef84d73c2ea7b4babbad060909753c4828d4 (patch)
treefdf2bafb48ae1ed03175f6c77a7548a181e69ee9 /Documentation
parent0658c16056660886ea2f35c4f038be70a94b1532 (diff)
parent6d25b688ecc488753af3c9e6f6a9a575b863cf37 (diff)
Merge branch 'merge' of git://git.secretlab.ca/git/linux-2.6 into merge
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/testing/debugfs-pktcdvd6
-rw-r--r--Documentation/DMA-mapping.txt18
-rw-r--r--Documentation/DocBook/Makefile11
-rw-r--r--Documentation/DocBook/kernel-api.tmpl2
-rw-r--r--Documentation/DocBook/writing-an-alsa-driver.tmpl8
-rw-r--r--Documentation/blockdev/00-INDEX2
-rw-r--r--Documentation/blockdev/mflash.txt84
-rw-r--r--Documentation/cgroups/cpuacct.txt18
-rw-r--r--Documentation/cgroups/memory.txt55
-rw-r--r--Documentation/cgroups/resource_counter.txt27
-rw-r--r--Documentation/devices.txt6
-rw-r--r--Documentation/fb/uvesafb.txt7
-rw-r--r--Documentation/feature-removal-schedule.txt12
-rw-r--r--Documentation/filesystems/00-INDEX2
-rw-r--r--Documentation/filesystems/knfsd-stats.txt159
-rw-r--r--Documentation/filesystems/nfs41-server.txt161
-rw-r--r--Documentation/filesystems/nilfs2.txt200
-rw-r--r--Documentation/hwmon/g760a36
-rw-r--r--Documentation/infiniband/ipoib.txt45
-rw-r--r--Documentation/input/rotary-encoder.txt101
-rw-r--r--Documentation/isdn/README.gigaset52
-rw-r--r--Documentation/kbuild/makefiles.txt83
-rw-r--r--Documentation/kernel-parameters.txt481
-rw-r--r--Documentation/kprobes.txt38
-rw-r--r--Documentation/powerpc/booting-without-of.txt89
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/upm-nand.txt39
-rw-r--r--Documentation/powerpc/dts-bindings/gpio/led.txt46
-rw-r--r--Documentation/powerpc/dts-bindings/mtd-physmap.txt80
-rw-r--r--Documentation/scsi/aacraid.txt15
-rw-r--r--Documentation/sound/alsa/soc/jack.txt71
-rw-r--r--Documentation/sparse.txt8
-rw-r--r--Documentation/sysctl/net.txt2
-rw-r--r--Documentation/sysctl/vm.txt28
-rw-r--r--Documentation/tomoyo.txt55
-rw-r--r--Documentation/trace/ftrace.txt (renamed from Documentation/ftrace.txt)0
-rw-r--r--Documentation/trace/kmemtrace.txt (renamed from Documentation/vm/kmemtrace.txt)0
-rw-r--r--Documentation/trace/mmiotrace.txt (renamed from Documentation/tracers/mmiotrace.txt)0
-rw-r--r--Documentation/trace/tracepoints.txt (renamed from Documentation/tracepoints.txt)0
-rw-r--r--Documentation/video4linux/pxa_camera.txt125
-rw-r--r--Documentation/video4linux/v4l2-framework.txt21
-rw-r--r--Documentation/vm/00-INDEX2
-rw-r--r--Documentation/vm/active_mm.txt83
-rw-r--r--Documentation/vm/unevictable-lru.txt1041
43 files changed, 2390 insertions, 929 deletions
diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd
index bf9c16b64c34..cf11736acb76 100644
--- a/Documentation/ABI/testing/debugfs-pktcdvd
+++ b/Documentation/ABI/testing/debugfs-pktcdvd
@@ -1,4 +1,4 @@
1What: /debug/pktcdvd/pktcdvd[0-7] 1What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7]
2Date: Oct. 2006 2Date: Oct. 2006
3KernelVersion: 2.6.20 3KernelVersion: 2.6.20
4Contact: Thomas Maier <balagi@justmail.de> 4Contact: Thomas Maier <balagi@justmail.de>
@@ -10,10 +10,10 @@ debugfs interface
10The pktcdvd module (packet writing driver) creates 10The pktcdvd module (packet writing driver) creates
11these files in debugfs: 11these files in debugfs:
12 12
13/debug/pktcdvd/pktcdvd[0-7]/ 13/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/
14 info (0444) Lots of driver statistics and infos. 14 info (0444) Lots of driver statistics and infos.
15 15
16Example: 16Example:
17------- 17-------
18 18
19cat /debug/pktcdvd/pktcdvd0/info 19cat /sys/kernel/debug/pktcdvd/pktcdvd0/info
diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt
index b2a4d6d244d9..01f24e94bdb6 100644
--- a/Documentation/DMA-mapping.txt
+++ b/Documentation/DMA-mapping.txt
@@ -136,7 +136,7 @@ exactly why.
136The standard 32-bit addressing PCI device would do something like 136The standard 32-bit addressing PCI device would do something like
137this: 137this:
138 138
139 if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { 139 if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
140 printk(KERN_WARNING 140 printk(KERN_WARNING
141 "mydev: No suitable DMA available.\n"); 141 "mydev: No suitable DMA available.\n");
142 goto ignore_this_device; 142 goto ignore_this_device;
@@ -155,9 +155,9 @@ all 64-bits when accessing streaming DMA:
155 155
156 int using_dac; 156 int using_dac;
157 157
158 if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) { 158 if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
159 using_dac = 1; 159 using_dac = 1;
160 } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { 160 } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
161 using_dac = 0; 161 using_dac = 0;
162 } else { 162 } else {
163 printk(KERN_WARNING 163 printk(KERN_WARNING
@@ -170,14 +170,14 @@ the case would look like this:
170 170
171 int using_dac, consistent_using_dac; 171 int using_dac, consistent_using_dac;
172 172
173 if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) { 173 if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
174 using_dac = 1; 174 using_dac = 1;
175 consistent_using_dac = 1; 175 consistent_using_dac = 1;
176 pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); 176 pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
177 } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { 177 } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) {
178 using_dac = 0; 178 using_dac = 0;
179 consistent_using_dac = 0; 179 consistent_using_dac = 0;
180 pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); 180 pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
181 } else { 181 } else {
182 printk(KERN_WARNING 182 printk(KERN_WARNING
183 "mydev: No suitable DMA available.\n"); 183 "mydev: No suitable DMA available.\n");
@@ -192,7 +192,7 @@ check the return value from pci_set_consistent_dma_mask().
192Finally, if your device can only drive the low 24-bits of 192Finally, if your device can only drive the low 24-bits of
193address during PCI bus mastering you might do something like: 193address during PCI bus mastering you might do something like:
194 194
195 if (pci_set_dma_mask(pdev, DMA_24BIT_MASK)) { 195 if (pci_set_dma_mask(pdev, DMA_BIT_MASK(24))) {
196 printk(KERN_WARNING 196 printk(KERN_WARNING
197 "mydev: 24-bit DMA addressing not available.\n"); 197 "mydev: 24-bit DMA addressing not available.\n");
198 goto ignore_this_device; 198 goto ignore_this_device;
@@ -213,7 +213,7 @@ most specific mask.
213 213
214Here is pseudo-code showing how this might be done: 214Here is pseudo-code showing how this might be done:
215 215
216 #define PLAYBACK_ADDRESS_BITS DMA_32BIT_MASK 216 #define PLAYBACK_ADDRESS_BITS DMA_BIT_MASK(32)
217 #define RECORD_ADDRESS_BITS 0x00ffffff 217 #define RECORD_ADDRESS_BITS 0x00ffffff
218 218
219 struct my_sound_card *card; 219 struct my_sound_card *card;
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index a3a83d38f96f..8918a32c6b3a 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -31,7 +31,7 @@ PS_METHOD = $(prefer-db2x)
31 31
32### 32###
33# The targets that may be used. 33# The targets that may be used.
34PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs 34PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs cleandocs
35 35
36BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) 36BOOKS := $(addprefix $(obj)/,$(DOCBOOKS))
37xmldocs: $(BOOKS) 37xmldocs: $(BOOKS)
@@ -213,11 +213,12 @@ silent_gen_xml = :
213dochelp: 213dochelp:
214 @echo ' Linux kernel internal documentation in different formats:' 214 @echo ' Linux kernel internal documentation in different formats:'
215 @echo ' htmldocs - HTML' 215 @echo ' htmldocs - HTML'
216 @echo ' installmandocs - install man pages generated by mandocs'
217 @echo ' mandocs - man pages'
218 @echo ' pdfdocs - PDF' 216 @echo ' pdfdocs - PDF'
219 @echo ' psdocs - Postscript' 217 @echo ' psdocs - Postscript'
220 @echo ' xmldocs - XML DocBook' 218 @echo ' xmldocs - XML DocBook'
219 @echo ' mandocs - man pages'
220 @echo ' installmandocs - install man pages generated by mandocs'
221 @echo ' cleandocs - clean all generated DocBook files'
221 222
222### 223###
223# Temporary files left by various tools 224# Temporary files left by various tools
@@ -235,6 +236,10 @@ clean-files := $(DOCBOOKS) \
235 236
236clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man 237clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man
237 238
239cleandocs:
240 $(Q)rm -f $(call objectify, $(clean-files))
241 $(Q)rm -rf $(call objectify, $(clean-dirs))
242
238# Declare the contents of the .PHONY variable as phony. We keep that 243# Declare the contents of the .PHONY variable as phony. We keep that
239# information in a variable so we can use it in if_changed and friends. 244# information in a variable so we can use it in if_changed and friends.
240 245
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index 58c194572c76..d6ac5d61820e 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -259,7 +259,7 @@ X!Earch/x86/kernel/mca_32.c
259!Eblock/blk-tag.c 259!Eblock/blk-tag.c
260!Iblock/blk-tag.c 260!Iblock/blk-tag.c
261!Eblock/blk-integrity.c 261!Eblock/blk-integrity.c
262!Iblock/blktrace.c 262!Ikernel/trace/blktrace.c
263!Iblock/genhd.c 263!Iblock/genhd.c
264!Eblock/genhd.c 264!Eblock/genhd.c
265 </chapter> 265 </chapter>
diff --git a/Documentation/DocBook/writing-an-alsa-driver.tmpl b/Documentation/DocBook/writing-an-alsa-driver.tmpl
index 46b08fef3744..7a2e0e98986a 100644
--- a/Documentation/DocBook/writing-an-alsa-driver.tmpl
+++ b/Documentation/DocBook/writing-an-alsa-driver.tmpl
@@ -1137,8 +1137,8 @@
1137 if (err < 0) 1137 if (err < 0)
1138 return err; 1138 return err;
1139 /* check PCI availability (28bit DMA) */ 1139 /* check PCI availability (28bit DMA) */
1140 if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || 1140 if (pci_set_dma_mask(pci, DMA_BIT_MASK(28)) < 0 ||
1141 pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { 1141 pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(28)) < 0) {
1142 printk(KERN_ERR "error to set 28bit mask DMA\n"); 1142 printk(KERN_ERR "error to set 28bit mask DMA\n");
1143 pci_disable_device(pci); 1143 pci_disable_device(pci);
1144 return -ENXIO; 1144 return -ENXIO;
@@ -1252,8 +1252,8 @@
1252 err = pci_enable_device(pci); 1252 err = pci_enable_device(pci);
1253 if (err < 0) 1253 if (err < 0)
1254 return err; 1254 return err;
1255 if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || 1255 if (pci_set_dma_mask(pci, DMA_BIT_MASK(28)) < 0 ||
1256 pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { 1256 pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(28)) < 0) {
1257 printk(KERN_ERR "error to set 28bit mask DMA\n"); 1257 printk(KERN_ERR "error to set 28bit mask DMA\n");
1258 pci_disable_device(pci); 1258 pci_disable_device(pci);
1259 return -ENXIO; 1259 return -ENXIO;
diff --git a/Documentation/blockdev/00-INDEX b/Documentation/blockdev/00-INDEX
index 86f054c47013..c08df56dd91b 100644
--- a/Documentation/blockdev/00-INDEX
+++ b/Documentation/blockdev/00-INDEX
@@ -8,6 +8,8 @@ cpqarray.txt
8 - info on using Compaq's SMART2 Intelligent Disk Array Controllers. 8 - info on using Compaq's SMART2 Intelligent Disk Array Controllers.
9floppy.txt 9floppy.txt
10 - notes and driver options for the floppy disk driver. 10 - notes and driver options for the floppy disk driver.
11mflash.txt
12 - info on mGine m(g)flash driver for linux.
11nbd.txt 13nbd.txt
12 - info on a TCP implementation of a network block device. 14 - info on a TCP implementation of a network block device.
13paride.txt 15paride.txt
diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt
new file mode 100644
index 000000000000..1f610ecf698a
--- /dev/null
+++ b/Documentation/blockdev/mflash.txt
@@ -0,0 +1,84 @@
1This document describes m[g]flash support in linux.
2
3Contents
4 1. Overview
5 2. Reserved area configuration
6 3. Example of mflash platform driver registration
7
81. Overview
9
10Mflash and gflash are embedded flash drives. The only difference is that mflash
11is an MCP (Multi Chip Package) device. The two devices operate in exactly the same
12way, so in the rest of this document "mflash" represents both mflash and gflash.
13
14Internally, mflash has NAND flash and other hardware logic, and supports
152 different operation modes (ATA and IO). ATA mode doesn't need any new
16driver and currently works well under the standard IDE subsystem; it is in effect
17a one-chip SSD. IO mode is an ATA-like custom mode for hosts that don't have
18an IDE interface.
19
20The following is a brief description of IO mode.
21A. IO mode is based on the ATA protocol and uses some custom commands (read
22confirm, write confirm).
23B. IO mode uses the SRAM bus interface.
24C. IO mode supports a 4kB boot area, so the host can boot from mflash.
25
262. Reserved area configuration
27If the host boots from mflash, it usually needs a raw area for the boot loader
28image. All of mflash's block device operations will take the size of this
29reserved area as their start offset. Note that the boot loader's reserved area
30size and the kernel configuration value must be the same.
31
323. Example of mflash platform driver registration
33Getting mflash working is very straightforward: all that is needed is to add the
34platform device definitions to the board configuration file. Here is a pseudo-code example.
35
36static struct mg_drv_data mflash_drv_data = {
37 /* Set to 1 if you want the driver to use polling */
38 .use_polling = 0,
39 /* device attribution */
40 .dev_attr = MG_BOOT_DEV
41};
42
43static struct resource mg_mflash_rsc[] = {
44 /* Base address of mflash */
45 [0] = {
46 .start = 0x08000000,
47 .end = 0x08000000 + SZ_64K - 1,
48 .flags = IORESOURCE_MEM
49 },
50 /* mflash interrupt pin */
51 [1] = {
52 .start = IRQ_GPIO(84),
53 .end = IRQ_GPIO(84),
54 .flags = IORESOURCE_IRQ
55 },
56 /* mflash reset pin */
57 [2] = {
58 .start = 43,
59 .end = 43,
60 .name = MG_RST_PIN,
61 .flags = IORESOURCE_IO
62 },
63 /* mflash reset-out pin
64 * If you use mflash as a storage device (i.e. other than MG_BOOT_DEV),
65 * you should assign this */
66 [3] = {
67 .start = 51,
68 .end = 51,
69 .name = MG_RSTOUT_PIN,
70 .flags = IORESOURCE_IO
71 }
72};
73
74static struct platform_device mflash_dev = {
75 .name = MG_DEV_NAME,
76 .id = -1,
77 .dev = {
78 .platform_data = &mflash_drv_data,
79 },
80 .num_resources = ARRAY_SIZE(mg_mflash_rsc),
81 .resource = mg_mflash_rsc
82};
83
84platform_device_register(&mflash_dev);
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt
index bb775fbe43d7..8b930946c52a 100644
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell
30process (bash) into it. CPU time consumed by this bash and its children 30process (bash) into it. CPU time consumed by this bash and its children
31can be obtained from g1/cpuacct.usage and the same is accumulated in 31can be obtained from g1/cpuacct.usage and the same is accumulated in
32/cgroups/cpuacct.usage also. 32/cgroups/cpuacct.usage also.
33
34cpuacct.stat file lists a few statistics which further divide the
35CPU time obtained by the cgroup into user and system times. Currently
36the following statistics are supported:
37
38user: Time spent by tasks of the cgroup in user mode.
39system: Time spent by tasks of the cgroup in kernel mode.
40
41user and system are in USER_HZ unit.
42
43cpuacct controller uses percpu_counter interface to collect user and
44system times. This has two side effects:
45
46- It is theoretically possible to see wrong values for user and system times.
47 This is because percpu_counter_read() on 32bit systems isn't safe
48 against concurrent writes.
49- It is possible to see slightly outdated values for user and system times
50 due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index a98a7fe7aabb..1a608877b14e 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -6,15 +6,14 @@ used here with the memory controller that is used in hardware.
6 6
7Salient features 7Salient features
8 8
9a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages 9a. Enable control of Anonymous, Page Cache (mapped and unmapped) and
10 Swap Cache memory pages.
10b. The infrastructure allows easy addition of other types of memory to control 11b. The infrastructure allows easy addition of other types of memory to control
11c. Provides *zero overhead* for non memory controller users 12c. Provides *zero overhead* for non memory controller users
12d. Provides a double LRU: global memory pressure causes reclaim from the 13d. Provides a double LRU: global memory pressure causes reclaim from the
13 global LRU; a cgroup on hitting a limit, reclaims from the per 14 global LRU; a cgroup on hitting a limit, reclaims from the per
14 cgroup LRU 15 cgroup LRU
15 16
16NOTE: Swap Cache (unmapped) is not accounted now.
17
18Benefits and Purpose of the memory controller 17Benefits and Purpose of the memory controller
19 18
20The memory controller isolates the memory behaviour of a group of tasks 19The memory controller isolates the memory behaviour of a group of tasks
@@ -290,34 +289,44 @@ will be charged as a new owner of it.
290 moved to the parent. If you want to avoid that, force_empty will be useful. 289 moved to the parent. If you want to avoid that, force_empty will be useful.
291 290
2925.2 stat file 2915.2 stat file
293 memory.stat file includes following statistics (now) 292
294 cache - # of pages from page-cache and shmem. 293memory.stat file includes following statistics
295 rss - # of pages from anonymous memory. 294
296 pgpgin - # of event of charging 295cache - # of bytes of page cache memory.
297 pgpgout - # of event of uncharging 296rss - # of bytes of anonymous and swap cache memory.
298 active_anon - # of pages on active lru of anon, shmem. 297pgpgin - # of pages paged in (equivalent to # of charging events).
299 inactive_anon - # of pages on active lru of anon, shmem 298pgpgout - # of pages paged out (equivalent to # of uncharging events).
300 active_file - # of pages on active lru of file-cache 299active_anon - # of bytes of anonymous and swap cache memory on active
301 inactive_file - # of pages on inactive lru of file cache 300 lru list.
302 unevictable - # of pages cannot be reclaimed.(mlocked etc) 301inactive_anon - # of bytes of anonymous memory and swap cache memory on
303 302 inactive lru list.
304 Below is depend on CONFIG_DEBUG_VM. 303active_file - # of bytes of file-backed memory on active lru list.
305 inactive_ratio - VM internal parameter. (see mm/page_alloc.c) 304inactive_file - # of bytes of file-backed memory on inactive lru list.
306 recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) 305unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc).
307 recent_rotated_file - VM internal parameter. (see mm/vmscan.c) 306
308 recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) 307The following additional stats are dependent on CONFIG_DEBUG_VM.
309 recent_scanned_file - VM internal parameter. (see mm/vmscan.c) 308
310 309inactive_ratio - VM internal parameter. (see mm/page_alloc.c)
311 Memo: 310recent_rotated_anon - VM internal parameter. (see mm/vmscan.c)
311recent_rotated_file - VM internal parameter. (see mm/vmscan.c)
312recent_scanned_anon - VM internal parameter. (see mm/vmscan.c)
313recent_scanned_file - VM internal parameter. (see mm/vmscan.c)
314
315Memo:
312 recent_rotated means recent frequency of lru rotation. 316 recent_rotated means recent frequency of lru rotation.
313 recent_scanned means recent # of scans to lru. 317 recent_scanned means recent # of scans to lru.
314 showing for better debug please see the code for meanings. 318 showing for better debug please see the code for meanings.
315 319
320Note:
321 Only anonymous and swap cache memory is listed as part of 'rss' stat.
322 This should not be confused with the true 'resident set size' or the
323 amount of physical memory used by the cgroup. Per-cgroup rss
324 accounting is not done yet.
316 325
3175.3 swappiness 3265.3 swappiness
318 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. 327 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
319 328
320 Following cgroup's swapiness can't be changed. 329 Following cgroups' swappiness can't be changed.
321 - root cgroup (uses /proc/sys/vm/swappiness). 330 - root cgroup (uses /proc/sys/vm/swappiness).
322 - a cgroup which uses hierarchy and it has child cgroup. 331 - a cgroup which uses hierarchy and it has child cgroup.
323 - a cgroup which uses hierarchy and not the root of hierarchy. 332 - a cgroup which uses hierarchy and not the root of hierarchy.
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index f196ac1d7d25..95b24d766eab 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -47,13 +47,18 @@ to work with it.
47 47
482. Basic accounting routines 482. Basic accounting routines
49 49
50 a. void res_counter_init(struct res_counter *rc) 50 a. void res_counter_init(struct res_counter *rc,
51 struct res_counter *rc_parent)
51 52
52 Initializes the resource counter. As usual, should be the first 53 Initializes the resource counter. As usual, should be the first
53 routine called for a new counter. 54 routine called for a new counter.
54 55
55 b. int res_counter_charge[_locked] 56 The struct res_counter *rc_parent can be used to define a hierarchical
56 (struct res_counter *rc, unsigned long val) 57 child -> parent relationship directly in the res_counter structure,
58 NULL can be used to define no relationship.
59
60 c. int res_counter_charge(struct res_counter *rc, unsigned long val,
61 struct res_counter **limit_fail_at)
57 62
58 When a resource is about to be allocated it has to be accounted 63 When a resource is about to be allocated it has to be accounted
59 with the appropriate resource counter (controller should determine 64 with the appropriate resource counter (controller should determine
@@ -67,15 +72,25 @@ to work with it.
67 * if the charging is performed first, then it should be uncharged 72 * if the charging is performed first, then it should be uncharged
68 on error path (if the one is called). 73 on error path (if the one is called).
69 74
70 c. void res_counter_uncharge[_locked] 75 If the charging fails and a hierarchical dependency exists, the
76 limit_fail_at parameter is set to the particular res_counter element
77 where the charging failed.
78
79 d. int res_counter_charge_locked
80 (struct res_counter *rc, unsigned long val)
81
82 The same as res_counter_charge(), but it must not acquire/release the
83 res_counter->lock internally (it must be called with res_counter->lock
84 held).
85
86 e. void res_counter_uncharge[_locked]
71 (struct res_counter *rc, unsigned long val) 87 (struct res_counter *rc, unsigned long val)
72 88
73 When a resource is released (freed) it should be de-accounted 89 When a resource is released (freed) it should be de-accounted
74 from the resource counter it was accounted to. This is called 90 from the resource counter it was accounted to. This is called
75 "uncharging". 91 "uncharging".
76 92
77 The _locked routines imply that the res_counter->lock is taken. 93 The _locked routines imply that the res_counter->lock is taken.
78
79 94
80 2.1 Other accounting routines 95 2.1 Other accounting routines
81 96
diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 327de1624759..53d64d382343 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -3,7 +3,7 @@
3 3
4 Maintained by Alan Cox <device@lanana.org> 4 Maintained by Alan Cox <device@lanana.org>
5 5
6 Last revised: 29 November 2006 6 Last revised: 6th April 2009
7 7
8This list is the Linux Device List, the official registry of allocated 8This list is the Linux Device List, the official registry of allocated
9device numbers and /dev directory nodes for the Linux operating 9device numbers and /dev directory nodes for the Linux operating
@@ -2797,6 +2797,10 @@ Your cooperation is appreciated.
2797 206 = /dev/ttySC1 SC26xx serial port 1 2797 206 = /dev/ttySC1 SC26xx serial port 1
2798 207 = /dev/ttySC2 SC26xx serial port 2 2798 207 = /dev/ttySC2 SC26xx serial port 2
2799 208 = /dev/ttySC3 SC26xx serial port 3 2799 208 = /dev/ttySC3 SC26xx serial port 3
2800 209 = /dev/ttyMAX0 MAX3100 serial port 0
2801 210 = /dev/ttyMAX1 MAX3100 serial port 1
2802 211 = /dev/ttyMAX2 MAX3100 serial port 2
2803 212 = /dev/ttyMAX3 MAX3100 serial port 3
2800 2804
2801205 char Low-density serial ports (alternate device) 2805205 char Low-density serial ports (alternate device)
2802 0 = /dev/culu0 Callout device for ttyLU0 2806 0 = /dev/culu0 Callout device for ttyLU0
diff --git a/Documentation/fb/uvesafb.txt b/Documentation/fb/uvesafb.txt
index 7ac3c4078ff9..eefdd91d298a 100644
--- a/Documentation/fb/uvesafb.txt
+++ b/Documentation/fb/uvesafb.txt
@@ -59,7 +59,8 @@ Accepted options:
59ypan Enable display panning using the VESA protected mode 59ypan Enable display panning using the VESA protected mode
60 interface. The visible screen is just a window of the 60 interface. The visible screen is just a window of the
61 video memory, console scrolling is done by changing the 61 video memory, console scrolling is done by changing the
62 start of the window. Available on x86 only. 62 start of the window. This option is available on x86
63 only and is the default option on that architecture.
63 64
64ywrap Same as ypan, but assumes your gfx board can wrap-around 65ywrap Same as ypan, but assumes your gfx board can wrap-around
65 the video memory (i.e. starts reading from top if it 66 the video memory (i.e. starts reading from top if it
@@ -67,7 +68,7 @@ ywrap Same as ypan, but assumes your gfx board can wrap-around
67 Available on x86 only. 68 Available on x86 only.
68 69
69redraw Scroll by redrawing the affected part of the screen, this 70redraw Scroll by redrawing the affected part of the screen, this
70 is the safe (and slow) default. 71 is the default on non-x86.
71 72
72(If you're using uvesafb as a module, the above three options are 73(If you're using uvesafb as a module, the above three options are
73 used as a parameter of the scroll option, e.g. scroll=ypan.) 74 used as a parameter of the scroll option, e.g. scroll=ypan.)
@@ -182,7 +183,7 @@ from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo.
182 183
183-- 184--
184 Michal Januszewski <spock@gentoo.org> 185 Michal Januszewski <spock@gentoo.org>
185 Last updated: 2007-06-16 186 Last updated: 2009-03-30
186 187
187 Documentation of the uvesafb options is loosely based on vesafb.txt. 188 Documentation of the uvesafb options is loosely based on vesafb.txt.
188 189
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 39246fc11257..de491a3e2313 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -354,7 +354,8 @@ Who: Krzysztof Piotr Oledzki <ole@ans.pl>
354 354
355--------------------------- 355---------------------------
356 356
357What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client() 357What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client(),
358 i2c_adapter->client_register(), i2c_adapter->client_unregister()
358When: 2.6.30 359When: 2.6.30
359Check: i2c_attach_client i2c_detach_client 360Check: i2c_attach_client i2c_detach_client
360Why: Deprecated by the new (standard) device driver binding model. Use 361Why: Deprecated by the new (standard) device driver binding model. Use
@@ -427,3 +428,12 @@ Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
427 After a reasonable transition period, we will remove the legacy 428 After a reasonable transition period, we will remove the legacy
428 fakephp interface. 429 fakephp interface.
429Who: Alex Chiang <achiang@hp.com> 430Who: Alex Chiang <achiang@hp.com>
431
432---------------------------
433
434What: i2c-voodoo3 driver
435When: October 2009
436Why: Superseded by tdfxfb. I2C/DDC support used to live in a separate
437 driver but this caused driver conflicts.
438Who: Jean Delvare <khali@linux-fr.org>
439 Krzysztof Helt <krzysztof.h1@wp.pl>
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 52cd611277a3..8dd6db76171d 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -68,6 +68,8 @@ ncpfs.txt
68 - info on Novell Netware(tm) filesystem using NCP protocol. 68 - info on Novell Netware(tm) filesystem using NCP protocol.
69nfsroot.txt 69nfsroot.txt
70 - short guide on setting up a diskless box with NFS root filesystem. 70 - short guide on setting up a diskless box with NFS root filesystem.
71nilfs2.txt
72 - info and mount options for the NILFS2 filesystem.
71ntfs.txt 73ntfs.txt
72 - info and mount options for the NTFS filesystem (Windows NT). 74 - info and mount options for the NTFS filesystem (Windows NT).
73ocfs2.txt 75ocfs2.txt
diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/knfsd-stats.txt
new file mode 100644
index 000000000000..64ced5149d37
--- /dev/null
+++ b/Documentation/filesystems/knfsd-stats.txt
@@ -0,0 +1,159 @@
1
2Kernel NFS Server Statistics
3============================
4
5This document describes the format and semantics of the statistics
6which the kernel NFS server makes available to userspace. These
7statistics are available in several text form pseudo files, each of
8which is described separately below.
9
10In most cases you don't need to know these formats, as the nfsstat(8)
11program from the nfs-utils distribution provides a helpful command-line
12interface for extracting and printing them.
13
14All the files described here are formatted as a sequence of text lines,
15separated by newline '\n' characters. Lines beginning with a hash
16'#' character are comments intended for humans and should be ignored
17by parsing routines. All other lines contain a sequence of fields
18separated by whitespace.
19
20/proc/fs/nfsd/pool_stats
21------------------------
22
23This file is available in kernels from 2.6.30 onwards, if the
24/proc/fs/nfsd filesystem is mounted (it almost always should be).
25
26The first line is a comment which describes the fields present in
27all the other lines. The other lines present the following data as
28a sequence of unsigned decimal numeric fields. One line is shown
29for each NFS thread pool.
30
31All counters are 64 bits wide and wrap naturally. There is no way
32to zero these counters, instead applications should do their own
33rate conversion.
34
35pool
36 The id number of the NFS thread pool to which this line applies.
37 This number does not change.
38
39 Thread pool ids are a contiguous set of small integers starting
40 at zero. The maximum value depends on the thread pool mode, but
41 currently cannot be larger than the number of CPUs in the system.
42 Note that in the default case there will be a single thread pool
43 which contains all the nfsd threads and all the CPUs in the system,
44 and thus this file will have a single line with a pool id of "0".
45
46packets-arrived
47 Counts how many NFS packets have arrived. More precisely, this
48 is the number of times that the network stack has notified the
49 sunrpc server layer that new data may be available on a transport
50 (e.g. an NFS or UDP socket or an NFS/RDMA endpoint).
51
52 Depending on the NFS workload patterns and various network stack
53 effects (such as Large Receive Offload) which can combine packets
54 on the wire, this may be either more or less than the number
55 of NFS calls received (which statistic is available elsewhere).
56 However this is a more accurate and less workload-dependent measure
57 of how much CPU load is being placed on the sunrpc server layer
58 due to NFS network traffic.
59
60sockets-enqueued
61 Counts how many times an NFS transport is enqueued to wait for
62 an nfsd thread to service it, i.e. no nfsd thread was considered
63 available.
64
65 The circumstance this statistic tracks indicates that there was NFS
66 network-facing work to be done but it couldn't be done immediately,
67 thus introducing a small delay in servicing NFS calls. The ideal
68 rate of change for this counter is zero; significantly non-zero
69 values may indicate a performance limitation.
70
71 This can happen either because there are too few nfsd threads in the
72 thread pool for the NFS workload (the workload is thread-limited),
73 or because the NFS workload needs more CPU time than is available in
74 the thread pool (the workload is CPU-limited). In the former case,
75 configuring more nfsd threads will probably improve the performance
76 of the NFS workload. In the latter case, the sunrpc server layer is
77 already choosing not to wake idle nfsd threads because there are too
78 many nfsd threads which want to run but cannot, so configuring more
79 nfsd threads will make no difference whatsoever. The overloads-avoided
80 statistic (see below) can be used to distinguish these cases.
81
82threads-woken
83 Counts how many times an idle nfsd thread is woken to try to
84 receive some data from an NFS transport.
85
86 This statistic tracks the circumstance where incoming
87 network-facing NFS work is being handled quickly, which is a good
88 thing. The ideal rate of change for this counter will be close
89 to but less than the rate of change of the packets-arrived counter.
90
91overloads-avoided
92 Counts how many times the sunrpc server layer chose not to wake an
93 nfsd thread, despite the presence of idle nfsd threads, because
94 too many nfsd threads had been recently woken but could not get
95 enough CPU time to actually run.
96
97 This statistic counts a circumstance where the sunrpc layer
98 heuristically avoids overloading the CPU scheduler with too many
99 runnable nfsd threads. The ideal rate of change for this counter
100 is zero. Significant non-zero values indicate that the workload
101 is CPU limited. Usually this is associated with heavy CPU usage
102 on all the CPUs in the nfsd thread pool.
103
104 If a sustained large overloads-avoided rate is detected on a pool,
105 the top(1) utility should be used to check for the following
106 pattern of CPU usage on all the CPUs associated with the given
107 nfsd thread pool.
108
109 - %us ~= 0 (as you're *NOT* running applications on your NFS server)
110
111 - %wa ~= 0
112
113 - %id ~= 0
114
115 - %sy + %hi + %si ~= 100
116
117 If this pattern is seen, configuring more nfsd threads will *not*
118 improve the performance of the workload. If this patten is not
119 seen, then something more subtle is wrong.
120
121threads-timedout
122 Counts how many times an nfsd thread triggered an idle timeout,
123 i.e. was not woken to handle any incoming network packets for
124 some time.
125
126 This statistic counts a circumstance where there are more nfsd
127 threads configured than can be used by the NFS workload. This is
128 a clue that the number of nfsd threads can be reduced without
129 affecting performance. Unfortunately, it's only a clue and not
130 a strong indication, for a couple of reasons:
131
132 - Currently the rate at which the counter is incremented is quite
133 slow; the idle timeout is 60 minutes. Unless the NFS workload
134 remains constant for hours at a time, this counter is unlikely
135 to be providing information that is still useful.
136
137 - It is usually a wise policy to provide some slack,
138 i.e. configure a few more nfsds than are currently needed,
139 to allow for future spikes in load.
140
141
142Note that incoming packets on NFS transports will be dealt with in
143one of three ways. An nfsd thread can be woken (threads-woken counts
144this case), or the transport can be enqueued for later attention
145(sockets-enqueued counts this case), or the packet can be temporarily
146deferred because the transport is currently being used by an nfsd
147thread. This last case is not very interesting and is not explicitly
148counted, but can be inferred from the other counters thus:
149
150packets-deferred = packets-arrived - ( sockets-enqueued + threads-woken )
151
152
153More
154----
155Descriptions of the other statistics files should go here.
156
157
158Greg Banks <gnb@sgi.com>
15926 Mar 2009
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt
new file mode 100644
index 000000000000..05d81cbcb2e1
--- /dev/null
+++ b/Documentation/filesystems/nfs41-server.txt
@@ -0,0 +1,161 @@
1NFSv4.1 Server Implementation
2
3Server support for minorversion 1 can be controlled using the
4/proc/fs/nfsd/versions control file. The string output returned
5by reading this file will contain either "+4.1" or "-4.1"
6correspondingly.
7
8Currently, server support for minorversion 1 is disabled by default.
9It can be enabled at run time by writing the string "+4.1" to
10the /proc/fs/nfsd/versions control file. Note that to write this
11control file, the nfsd service must be taken down. Use your user-mode
12nfs-utils to set this up; see rpc.nfsd(8)
13
14The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
15on the latest NFSv4.1 Internet Draft:
16http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
17
18From the many new features in NFSv4.1 the current implementation
19focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
20"exactly once" semantics and better control and throttling of the
21resources allocated for each client.
22
23Other NFSv4.1 features, Parallel NFS operations in particular,
24are still under development out of tree.
25See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
26for more information.
27
28The table below, taken from the NFSv4.1 document, lists
29the operations that are mandatory to implement (REQ), optional
30(OPT), and NFSv4.0 operations that must not be implemented (MNI)
31in minor version 1. The first column indicates the operations that
32are not supported yet by the linux server implementation.
33
34The OPTIONAL features identified and their abbreviations are as follows:
35 pNFS Parallel NFS
36 FDELG File Delegations
37 DDELG Directory Delegations
38
39The following abbreviations indicate the linux server implementation status.
40 I Implemented NFSv4.1 operations.
41 NS Not Supported.
42 NS* unimplemented optional feature.
43 P pNFS features implemented out of tree.
44 PNS pNFS features that are not supported yet (out of tree).
45
46Operations
47
48 +----------------------+------------+--------------+----------------+
49 | Operation | REQ, REC, | Feature | Definition |
50 | | OPT, or | (REQ, REC, | |
51 | | MNI | or OPT) | |
52 +----------------------+------------+--------------+----------------+
53 | ACCESS | REQ | | Section 18.1 |
54NS | BACKCHANNEL_CTL | REQ | | Section 18.33 |
55NS | BIND_CONN_TO_SESSION | REQ | | Section 18.34 |
56 | CLOSE | REQ | | Section 18.2 |
57 | COMMIT | REQ | | Section 18.3 |
58 | CREATE | REQ | | Section 18.4 |
59I | CREATE_SESSION | REQ | | Section 18.36 |
60NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 |
61 | DELEGRETURN | OPT | FDELG, | Section 18.6 |
62 | | | DDELG, pNFS | |
63 | | | (REQ) | |
64NS | DESTROY_CLIENTID | REQ | | Section 18.50 |
65I | DESTROY_SESSION | REQ | | Section 18.37 |
66I | EXCHANGE_ID | REQ | | Section 18.35 |
67NS | FREE_STATEID | REQ | | Section 18.38 |
68 | GETATTR | REQ | | Section 18.7 |
69P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 |
70P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 |
71 | GETFH | REQ | | Section 18.8 |
72NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 |
73P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 |
74P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 |
75P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 |
76 | LINK | OPT | | Section 18.9 |
77 | LOCK | REQ | | Section 18.10 |
78 | LOCKT | REQ | | Section 18.11 |
79 | LOCKU | REQ | | Section 18.12 |
80 | LOOKUP | REQ | | Section 18.13 |
81 | LOOKUPP | REQ | | Section 18.14 |
82 | NVERIFY | REQ | | Section 18.15 |
83 | OPEN | REQ | | Section 18.16 |
84NS*| OPENATTR | OPT | | Section 18.17 |
85 | OPEN_CONFIRM | MNI | | N/A |
86 | OPEN_DOWNGRADE | REQ | | Section 18.18 |
87 | PUTFH | REQ | | Section 18.19 |
88 | PUTPUBFH | REQ | | Section 18.20 |
89 | PUTROOTFH | REQ | | Section 18.21 |
90 | READ | REQ | | Section 18.22 |
91 | READDIR | REQ | | Section 18.23 |
92 | READLINK | OPT | | Section 18.24 |
93NS | RECLAIM_COMPLETE | REQ | | Section 18.51 |
94 | RELEASE_LOCKOWNER | MNI | | N/A |
95 | REMOVE | REQ | | Section 18.25 |
96 | RENAME | REQ | | Section 18.26 |
97 | RENEW | MNI | | N/A |
98 | RESTOREFH | REQ | | Section 18.27 |
99 | SAVEFH | REQ | | Section 18.28 |
100 | SECINFO | REQ | | Section 18.29 |
101NS | SECINFO_NO_NAME | REC | pNFS files | Section 18.45, |
102 | | | layout (REQ) | Section 13.12 |
103I | SEQUENCE | REQ | | Section 18.46 |
104 | SETATTR | REQ | | Section 18.30 |
105 | SETCLIENTID | MNI | | N/A |
106 | SETCLIENTID_CONFIRM | MNI | | N/A |
107NS | SET_SSV | REQ | | Section 18.47 |
108NS | TEST_STATEID | REQ | | Section 18.48 |
109 | VERIFY | REQ | | Section 18.31 |
110NS*| WANT_DELEGATION | OPT | FDELG (OPT) | Section 18.49 |
111 | WRITE | REQ | | Section 18.32 |
112
113Callback Operations
114
115 +-------------------------+-----------+-------------+---------------+
116 | Operation | REQ, REC, | Feature | Definition |
117 | | OPT, or | (REQ, REC, | |
118 | | MNI | or OPT) | |
119 +-------------------------+-----------+-------------+---------------+
120 | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 |
121P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 |
122NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 |
123P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 |
124NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 |
125NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 |
126 | CB_RECALL | OPT | FDELG, | Section 20.2 |
127 | | | DDELG, pNFS | |
128 | | | (REQ) | |
129NS*| CB_RECALL_ANY | OPT | FDELG, | Section 20.6 |
130 | | | DDELG, pNFS | |
131 | | | (REQ) | |
132NS | CB_RECALL_SLOT | REQ | | Section 20.8 |
133NS*| CB_RECALLABLE_OBJ_AVAIL | OPT | DDELG, pNFS | Section 20.7 |
134 | | | (REQ) | |
135I | CB_SEQUENCE | OPT | FDELG, | Section 20.9 |
136 | | | DDELG, pNFS | |
137 | | | (REQ) | |
138NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 |
139 | | | DDELG, pNFS | |
140 | | | (REQ) | |
141 +-------------------------+-----------+-------------+---------------+
142
143Implementation notes:
144
145EXCHANGE_ID:
146* only SP4_NONE state protection supported
147* implementation ids are ignored
148
149CREATE_SESSION:
150* backchannel attributes are ignored
151* backchannel security parameters are ignored
152
153SEQUENCE:
154* no support for dynamic slot table renegotiation (optional)
155
156nfsv4.1 COMPOUND rules:
157The following cases aren't supported yet:
158* Enforcing of NFS4ERR_NOT_ONLY_OP for: BIND_CONN_TO_SESSION, CREATE_SESSION,
159 DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID.
160* DESTROY_SESSION MUST be the final operation in the COMPOUND request.
161
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
new file mode 100644
index 000000000000..55c4300abfcb
--- /dev/null
+++ b/Documentation/filesystems/nilfs2.txt
@@ -0,0 +1,200 @@
1NILFS2
2------
3
4NILFS2 is a log-structured file system (LFS) supporting continuous
5snapshotting. In addition to versioning capability of the entire file
6system, users can even restore files mistakenly overwritten or
7destroyed just a few seconds ago. Since NILFS2 can keep consistency
8like conventional LFS, it achieves quick recovery after system
9crashes.
10
11NILFS2 creates a number of checkpoints every few seconds or per
12synchronous write basis (unless there is no change). Users can select
13significant versions among continuously created checkpoints, and can
14change them into snapshots which will be preserved until they are
15changed back to checkpoints.
16
17There is no limit on the number of snapshots until the volume gets
18full. Each snapshot is mountable as a read-only file system
19concurrently with its writable mount, and this feature is convenient
20for online backup.
21
22The userland tools are included in nilfs-utils package, which is
23available from the following download page. At least "mkfs.nilfs2",
24"mount.nilfs2", "umount.nilfs2", and "nilfs_cleanerd" (so called
25cleaner or garbage collector) are required. Details on the tools are
26described in the man pages included in the package.
27
28Project web page: http://www.nilfs.org/en/
29Download page: http://www.nilfs.org/en/download.html
30Git tree web page: http://www.nilfs.org/git/
31NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users
32
33Caveats
34=======
35
36Features which NILFS2 does not support yet:
37
38 - atime
39 - extended attributes
40 - POSIX ACLs
41 - quotas
42 - writable snapshots
43 - remote backup (CDP)
44 - data integrity
45 - defragmentation
46
47Mount options
48=============
49
50NILFS2 supports the following mount options:
51(*) == default
52
53barrier=on(*) This enables/disables barriers. barrier=off disables
54 it, barrier=on enables it.
55errors=continue(*) Keep going on a filesystem error.
56errors=remount-ro Remount the filesystem read-only on an error.
57errors=panic Panic and halt the machine if an error occurs.
58cp=n Specify the checkpoint-number of the snapshot to be
59 mounted. Checkpoints and snapshots are listed by lscp
60 user command. Only the checkpoints marked as snapshot
61 are mountable with this option. Snapshot is read-only,
62 so a read-only mount option must be specified together.
63order=relaxed(*) Apply relaxed order semantics that allows modified data
64 blocks to be written to disk without making a
65 checkpoint if no metadata update is ongoing. This mode
66 is equivalent to the ordered data mode of the ext3
67 filesystem except that the updates on data blocks still
68 conserve atomicity. This will improve synchronous
69 write performance for overwriting.
70order=strict Apply strict in-order semantics that preserves sequence
71 of all file operations including overwriting of data
72 blocks. That means, it is guaranteed that no
73 overtaking of events occurs in the recovered file
74 system after a crash.
75
76NILFS2 usage
77============
78
79To use nilfs2 as a local file system, simply:
80
81 # mkfs -t nilfs2 /dev/block_device
82 # mount -t nilfs2 /dev/block_device /dir
83
84This will also invoke the cleaner through the mount helper program
85(mount.nilfs2).
86
87Checkpoints and snapshots are managed by the following commands.
88Their manpages are included in the nilfs-utils package above.
89
90 lscp list checkpoints or snapshots.
91 mkcp make a checkpoint or a snapshot.
92 chcp change an existing checkpoint to a snapshot or vice versa.
93 rmcp invalidate specified checkpoint(s).
94
95To mount a snapshot,
96
97 # mount -t nilfs2 -r -o cp=<cno> /dev/block_device /snap_dir
98
99where <cno> is the checkpoint number of the snapshot.
100
101To unmount the NILFS2 mount point or snapshot, simply:
102
103 # umount /dir
104
105Then, the cleaner daemon is automatically shut down by the umount
106helper program (umount.nilfs2).
107
108Disk format
109===========
110
111A nilfs2 volume is equally divided into a number of segments except
112for the super block (SB) and segment #0. A segment is the container
113of logs. Each log is composed of summary information blocks, payload
114blocks, and an optional super root block (SR):
115
116 ______________________________________________________
117 | |SB| | Segment | Segment | Segment | ... | Segment | |
118 |_|__|_|____0____|____1____|____2____|_____|____N____|_|
119 0 +1K +4K +8M +16M +24M +(8MB x N)
120 . . (Typical offsets for 4KB-block)
121 . .
122 .______________________.
123 | log | log |... | log |
124 |__1__|__2__|____|__m__|
125 . .
126 . .
127 . .
128 .______________________________.
129 | Summary | Payload blocks |SR|
130 |_blocks__|_________________|__|
131
132The payload blocks are organized per file, and each file consists of
133data blocks and B-tree node blocks:
134
135 |<--- File-A --->|<--- File-B --->|
136 _______________________________________________________________
137 | Data blocks | B-tree blocks | Data blocks | B-tree blocks | ...
138 _|_____________|_______________|_____________|_______________|_
139
140
141Since only the modified blocks are written in the log, it may have
142files without data blocks or B-tree node blocks.
143
144The organization of the blocks is recorded in the summary information
145blocks, which contains a header structure (nilfs_segment_summary), per
146file structures (nilfs_finfo), and per block structures (nilfs_binfo):
147
148 _________________________________________________________________________
149 | Summary | finfo | binfo | ... | binfo | finfo | binfo | ... | binfo |...
150 |_blocks__|___A___|_(A,1)_|_____|(A,Na)_|___B___|_(B,1)_|_____|(B,Nb)_|___
151
152
153The logs include regular files, directory files, symbolic link files
154and several meta data files. The meta data files are the files used
155to maintain file system meta data. The current version of NILFS2 uses
156the following meta data files:
157
158 1) Inode file (ifile) -- Stores on-disk inodes
159 2) Checkpoint file (cpfile) -- Stores checkpoints
160 3) Segment usage file (sufile) -- Stores allocation state of segments
161 4) Data address translation file -- Maps virtual block numbers to usual
162 (DAT) block numbers. This file serves to
163 make on-disk blocks relocatable.
164
165The following figure shows a typical organization of the logs:
166
167 _________________________________________________________________________
168 | Summary | regular file | file | ... | ifile | cpfile | sufile | DAT |SR|
169 |_blocks__|_or_directory_|_______|_____|_______|________|________|_____|__|
170
171
172To stride over segment boundaries, this sequence of files may be split
173into multiple logs. The sequence of logs that should be treated as
174logically one log, is delimited with flags marked in the segment
175summary. The recovery code of nilfs2 looks at this boundary information
176to ensure atomicity of updates.
177
178The super root block is inserted for every checkpoint. It includes
179three special inodes, inodes for the DAT, cpfile, and sufile. Inodes
180of regular files, directories, symlinks and other special files, are
181included in the ifile. The inode of ifile itself is included in the
182corresponding checkpoint entry in the cpfile. Thus, the hierarchy
183among NILFS2 files can be depicted as follows:
184
185 Super block (SB)
186 |
187 v
188 Super root block (the latest cno=xx)
189 |-- DAT
190 |-- sufile
191 `-- cpfile
192 |-- ifile (cno=c1)
193 |-- ifile (cno=c2) ---- file (ino=i1)
194 : : |-- file (ino=i2)
195 `-- ifile (cno=xx) |-- file (ino=i3)
196 : :
197 `-- file (ino=yy)
198 ( regular file, directory, or symlink )
199
200For details on the format of each file, please see include/linux/nilfs2_fs.h.
diff --git a/Documentation/hwmon/g760a b/Documentation/hwmon/g760a
new file mode 100644
index 000000000000..e032eeb75629
--- /dev/null
+++ b/Documentation/hwmon/g760a
@@ -0,0 +1,36 @@
1Kernel driver g760a
2===================
3
4Supported chips:
5 * Global Mixed-mode Technology Inc. G760A
6 Prefix: 'g760a'
7 Datasheet: Publicly available at the GMT website
8 http://www.gmt.com.tw/datasheet/g760a.pdf
9
10Author: Herbert Valerio Riedel <hvr@gnu.org>
11
12Description
13-----------
14
15The GMT G760A Fan Speed PWM Controller is connected directly to a fan
16and performs closed-loop control of the fan speed.
17
18The fan speed is programmed by setting the period via 'pwm1' of two
19consecutive speed pulses. The period is defined in terms of clock
20cycle counts of an assumed 32kHz clock source.
21
22Setting a period of 0 stops the fan; setting the period to 255 sets
23fan to maximum speed.
24
25The measured fan rotation speed returned via 'fan1_input' is derived
26from the measured speed pulse period by assuming again a 32kHz clock
27source and a 2 pulse-per-revolution fan.
28
29The 'alarms' file provides access to the two alarm bits provided by
30the G760A chip's status register: Bit 0 is set when the actual fan
31speed differs more than 20% with respect to the programmed fan speed;
32bit 1 is set when fan speed is below 1920 RPM.
33
34The g760a driver will not update its values more frequently than every
35other second; reading them more often will do no harm, but will return
36'old' values.
diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt
index 864ff3283780..6d40f00b358c 100644
--- a/Documentation/infiniband/ipoib.txt
+++ b/Documentation/infiniband/ipoib.txt
@@ -24,6 +24,49 @@ Partitions and P_Keys
24 The P_Key for any interface is given by the "pkey" file, and the 24 The P_Key for any interface is given by the "pkey" file, and the
25 main interface for a subinterface is in "parent." 25 main interface for a subinterface is in "parent."
26 26
27Datagram vs Connected modes
28
29 The IPoIB driver supports two modes of operation: datagram and
30 connected. The mode is set and read through an interface's
31 /sys/class/net/<intf name>/mode file.
32
33 In datagram mode, the IB UD (Unreliable Datagram) transport is used
34 and so the interface MTU is equal to the IB L2 MTU minus the
35 IPoIB encapsulation header (4 bytes). For example, in a typical IB
36 fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes.
37
38 In connected mode, the IB RC (Reliable Connected) transport is used.
39 Connected mode takes advantage of the connected nature of the
40 IB transport and allows an MTU up to the maximal IP packet size of
41 64K, which reduces the number of IP packets needed for handling
42 large UDP datagrams, TCP segments, etc and increases the performance
43 for large messages.
44
45 In connected mode, the interface's UD QP is still used for multicast
46 and communication with peers that don't support connected mode. In
47 this case, RX emulation of ICMP PMTU packets is used to cause the
48 networking stack to use the smaller UD MTU for these neighbours.
49
50Stateless offloads
51
52 If the IB HW supports IPoIB stateless offloads, IPoIB advertises
53 TCP/IP checksum and/or Large Send (LSO) offloading capability to the
54 network stack.
55
56 Large Receive (LRO) offloading is also implemented and may be turned
57 on/off using ethtool calls. Currently LRO is supported only for
58 checksum offload capable devices.
59
60 Stateless offloads are supported only in datagram mode.
61
62Interrupt moderation
63
64 If the underlying IB device supports CQ event moderation, one can
65 use ethtool to set interrupt mitigation parameters and thus reduce
66 the overhead incurred by handling interrupts. The main code path of
67 IPoIB doesn't use events for TX completion signaling so only RX
68 moderation is supported.
69
27Debugging Information 70Debugging Information
28 71
29 By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set 72 By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
@@ -55,3 +98,5 @@ References
55 http://ietf.org/rfc/rfc4391.txt 98 http://ietf.org/rfc/rfc4391.txt
56 IP over InfiniBand (IPoIB) Architecture (RFC 4392) 99 IP over InfiniBand (IPoIB) Architecture (RFC 4392)
57 http://ietf.org/rfc/rfc4392.txt 100 http://ietf.org/rfc/rfc4392.txt
101 IP over InfiniBand: Connected Mode (RFC 4755)
102 http://ietf.org/rfc/rfc4755.txt
diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt
new file mode 100644
index 000000000000..435102a26d96
--- /dev/null
+++ b/Documentation/input/rotary-encoder.txt
@@ -0,0 +1,101 @@
1rotary-encoder - a generic driver for GPIO connected devices
2Daniel Mack <daniel@caiaq.de>, Feb 2009
3
40. Function
5-----------
6
7Rotary encoders are devices which are connected to the CPU or other
8peripherals with two wires. The outputs are phase-shifted by 90 degrees
9and by triggering on falling and rising edges, the turn direction can
10be determined.
11
12The phase diagram of these two outputs looks like this:
13
14 _____ _____ _____
15 | | | | | |
16 Channel A ____| |_____| |_____| |____
17
18 : : : : : : : : : : : :
19 __ _____ _____ _____
20 | | | | | | |
21 Channel B |_____| |_____| |_____| |__
22
23 : : : : : : : : : : : :
24 Event a b c d a b c d a b c d
25
26 |<-------->|
27 one step
28
29
30For more information, please see
31 http://en.wikipedia.org/wiki/Rotary_encoder
32
33
341. Events / state machine
35-------------------------
36
37a) Rising edge on channel A, channel B in low state
38 This state is used to recognize a clockwise turn
39
40b) Rising edge on channel B, channel A in high state
41 When entering this state, the encoder is put into 'armed' state,
42 meaning that it has seen half the way of a one-step transition.
43
44c) Falling edge on channel A, channel B in high state
45 This state is used to recognize a counter-clockwise turn
46
47d) Falling edge on channel B, channel A in low state
48 Parking position. If the encoder enters this state, a full transition
49 should have happened, unless it flipped back on half the way. The
50 'armed' state tells us about that.
51
522. Platform requirements
53------------------------
54
55As there is no hardware dependent call in this driver, the platform it is
56used with must support gpiolib. Another requirement is that IRQs must be
57able to fire on both edges.
58
59
603. Board integration
61--------------------
62
63To use this driver in your system, register a platform_device with the
64name 'rotary-encoder' and associate the IRQs and some specific platform
65data with it.
66
67struct rotary_encoder_platform_data is declared in
68include/linux/rotary_encoder.h and needs to be filled with the number of
69steps the encoder has and can carry information about externally inverted
70signals (because of a used inverting buffer or other reasons).
71
72Because GPIO to IRQ mapping is platform specific, this information must
73be given separately to the driver. See the example below.
74
75---------<snip>---------
76
77/* board support file example */
78
79#include <linux/input.h>
80#include <linux/rotary_encoder.h>
81
82#define GPIO_ROTARY_A 1
83#define GPIO_ROTARY_B 2
84
85static struct rotary_encoder_platform_data my_rotary_encoder_info = {
86 .steps = 24,
87 .axis = ABS_X,
88 .gpio_a = GPIO_ROTARY_A,
89 .gpio_b = GPIO_ROTARY_B,
90 .inverted_a = 0,
91 .inverted_b = 0,
92};
93
94static struct platform_device rotary_encoder_device = {
95 .name = "rotary-encoder",
96 .id = 0,
97 .dev = {
98 .platform_data = &my_rotary_encoder_info,
99 }
100};
101
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset
index 55b2852904a4..02c0e9341dd8 100644
--- a/Documentation/isdn/README.gigaset
+++ b/Documentation/isdn/README.gigaset
@@ -61,24 +61,28 @@ GigaSet 307x Device Driver
61 --------------------- 61 ---------------------
622.1. Modules 622.1. Modules
63 ------- 63 -------
64 To get the device working, you have to load the proper kernel module. You 64 For the devices to work, the proper kernel modules have to be loaded.
65 can do this using 65 This normally happens automatically when the system detects the USB
66 modprobe modulename 66 device (base, M105) or when the line discipline is attached (M101). It
67 where modulename is ser_gigaset (M101), usb_gigaset (M105), or 67 can also be triggered manually using the modprobe(8) command, for example
68 bas_gigaset (direct USB connection to the base). 68 for troubleshooting or to pass module parameters.
69 69
70 The module ser_gigaset provides a serial line discipline N_GIGASET_M101 70 The module ser_gigaset provides a serial line discipline N_GIGASET_M101
71 which drives the device through the regular serial line driver. To use it, 71 which drives the device through the regular serial line driver. It must
72 run the Gigaset M101 daemon "gigasetm101d" (also available from 72 be attached to the serial line to which the M101 is connected with the
73 http://sourceforge.net/projects/gigaset307x/) with the device file of the 73 ldattach(8) command (requires util-linux-ng release 2.14 or later), for
74 RS232 port to the M101 as an argument, for example: 74 example:
75 gigasetm101d /dev/ttyS1 75 ldattach GIGASET_M101 /dev/ttyS1
76 This will open the device file, set its line discipline to N_GIGASET_M101, 76 This will open the device file, attach the line discipline to it, and
77 and then sleep in the background, keeping the device open so that the 77 then sleep in the background, keeping the device open so that the line
78 line discipline remains active. To deactivate it, kill the daemon, for 78 discipline remains active. To deactivate it, kill the daemon, for example
79 example with 79 with
80 killall gigasetm101d 80 killall ldattach
81 before disconnecting the device. 81 before disconnecting the device. To have this happen automatically at
82 system startup/shutdown on an LSB compatible system, create and activate
83 an appropriate LSB startup script /etc/init.d/gigaset. (The init name
84 'gigaset' is officially assigned to this project by LANANA.)
85 Alternatively, just add the 'ldattach' command line to /etc/rc.local.
82 86
832.2. Device nodes for user space programs 872.2. Device nodes for user space programs
84 ------------------------------------ 88 ------------------------------------
@@ -194,10 +198,11 @@ GigaSet 307x Device Driver
194 operation (for wireless access to the base), but are needed for access 198 operation (for wireless access to the base), but are needed for access
195 to the M105's own configuration mode (registration to the base, baudrate 199 to the M105's own configuration mode (registration to the base, baudrate
196 and line format settings, device status queries) via the gigacontr 200 and line format settings, device status queries) via the gigacontr
197 utility. Their use is disabled in the driver by default for safety 201 utility. Their use is controlled by the kernel configuration option
198 reasons but can be enabled by setting the kernel configuration option 202 "Support for undocumented USB requests" (CONFIG_GIGASET_UNDOCREQ). If you
199 "Support for undocumented USB requests" (GIGASET_UNDOCREQ) to "Y" and 203 encounter error code -ENOTTY when trying to use some features of the
200 recompiling. 204 M105, try setting that option to "y" via 'make {x,menu}config' and
205 recompiling the driver.
201 206
202 207
2033. Troubleshooting 2083. Troubleshooting
@@ -228,6 +233,13 @@ GigaSet 307x Device Driver
228 Solution: 233 Solution:
229 Select Unimodem mode for all DECT data adapters. (see section 2.4.) 234 Select Unimodem mode for all DECT data adapters. (see section 2.4.)
230 235
236 Problem:
237 You want to configure your USB DECT data adapter (M105) but gigacontr
238 reports an error: "/dev/ttyGU0: Inappropriate ioctl for device".
239 Solution:
240 Recompile the usb_gigaset driver with the kernel configuration option
241 CONFIG_GIGASET_UNDOCREQ set to 'y'. (see section 2.6.)
242
2313.2. Telling the driver to provide more information 2433.2. Telling the driver to provide more information
232 ---------------------------------------------- 244 ----------------------------------------------
233 Building the driver with the "Gigaset debugging" kernel configuration 245 Building the driver with the "Gigaset debugging" kernel configuration
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index 51104f9194a5..d4b05672f9f7 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -40,10 +40,16 @@ This document describes the Linux kernel Makefiles.
40 --- 6.7 Custom kbuild commands 40 --- 6.7 Custom kbuild commands
41 --- 6.8 Preprocessing linker scripts 41 --- 6.8 Preprocessing linker scripts
42 42
43 === 7 Kbuild Variables 43 === 7 Kbuild syntax for exported headers
44 === 8 Makefile language 44 --- 7.1 header-y
45 === 9 Credits 45 --- 7.2 objhdr-y
46 === 10 TODO 46 --- 7.3 destination-y
47 --- 7.4 unifdef-y (deprecated)
48
49 === 8 Kbuild Variables
50 === 9 Makefile language
51 === 10 Credits
52 === 11 TODO
47 53
48=== 1 Overview 54=== 1 Overview
49 55
@@ -1143,8 +1149,69 @@ When kbuild executes, the following steps are followed (roughly):
1143 The kbuild infrastructure for *lds files is used in several 1149 The kbuild infrastructure for *lds files is used in several
1144 architecture-specific files. 1150 architecture-specific files.
1145 1151
1152=== 7 Kbuild syntax for exported headers
1153
1154The kernel include a set of headers that is exported to userspace.
1155Many headers can be exported as-is but other headers requires a
1156minimal pre-processing before they are ready for user-space.
1157The pre-processing does:
1158- drop kernel specific annotations
1159- drop include of compiler.h
1160- drop all sections that are kernel internal (guarded by ifdef __KERNEL__)
1161
1162Each relevant directory contains a file named "Kbuild" which specifies the
1163headers to be exported.
1164See the subsequent sections for the syntax of the Kbuild file.
1165
1166 --- 7.1 header-y
1167
1168 header-y specifies header files to be exported.
1169
1170 Example:
1171 #include/linux/Kbuild
1172 header-y += usb/
1173 header-y += aio_abi.h
1174
1175 The convention is to list one file per line and
1176 preferably in alphabetic order.
1177
1178 header-y also specifies which subdirectories to visit.
1179 A subdirectory is identified by a trailing '/' which
1180 can be seen in the example above for the usb subdirectory.
1181
1182 Subdirectories are visited before their parent directories.
1183
1184 --- 7.2 objhdr-y
1185
1186 objhdr-y specifies generated files to be exported.
1187 Generated files are special as they need to be looked
1188 up in another directory when doing 'make O=...' builds.
1189
1190 Example:
1191 #include/linux/Kbuild
1192 objhdr-y += version.h
1193
1194 --- 7.3 destination-y
1195
1196 When an architecture has a set of exported headers that need to be
1197 exported to a different directory, destination-y is used.
1198 destination-y specifies the destination directory for all exported
1199 headers in the file where it is present.
1200
1201 Example:
1202 #arch/xtensa/platforms/s6105/include/platform/Kbuild
1203 destination-y := include/linux
1204
1205 In the example above all exported headers in the Kbuild file
1206 will be located in the directory "include/linux" when exported.
1207
1208
1209 --- 7.4 unifdef-y (deprecated)
1210
1211 unifdef-y is deprecated. A direct replacement is header-y.
1212
1146 1213
1147=== 7 Kbuild Variables 1214=== 8 Kbuild Variables
1148 1215
1149The top Makefile exports the following variables: 1216The top Makefile exports the following variables:
1150 1217
@@ -1206,7 +1273,7 @@ The top Makefile exports the following variables:
1206 INSTALL_MOD_STRIP will be used as the option(s) to the strip command. 1273 INSTALL_MOD_STRIP will be used as the option(s) to the strip command.
1207 1274
1208 1275
1209=== 8 Makefile language 1276=== 9 Makefile language
1210 1277
1211The kernel Makefiles are designed to be run with GNU Make. The Makefiles 1278The kernel Makefiles are designed to be run with GNU Make. The Makefiles
1212use only the documented features of GNU Make, but they do use many 1279use only the documented features of GNU Make, but they do use many
@@ -1225,14 +1292,14 @@ time the left-hand side is used.
1225There are some cases where "=" is appropriate. Usually, though, ":=" 1292There are some cases where "=" is appropriate. Usually, though, ":="
1226is the right choice. 1293is the right choice.
1227 1294
1228=== 9 Credits 1295=== 10 Credits
1229 1296
1230Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net> 1297Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net>
1231Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de> 1298Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de>
1232Updates by Sam Ravnborg <sam@ravnborg.org> 1299Updates by Sam Ravnborg <sam@ravnborg.org>
1233Language QA by Jan Engelhardt <jengelh@gmx.de> 1300Language QA by Jan Engelhardt <jengelh@gmx.de>
1234 1301
1235=== 10 TODO 1302=== 11 TODO
1236 1303
1237- Describe how kbuild supports shipped files with _shipped. 1304- Describe how kbuild supports shipped files with _shipped.
1238- Generating offset header files. 1305- Generating offset header files.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 2895ce29dea5..6172e4360f60 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -153,60 +153,6 @@ and is between 256 and 4096 characters. It is defined in the file
153 1,0: use 1st APIC table 153 1,0: use 1st APIC table
154 default: 0 154 default: 0
155 155
156 acpi_sleep= [HW,ACPI] Sleep options
157 Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
158 old_ordering, s4_nonvs }
159 See Documentation/power/video.txt for information on
160 s3_bios and s3_mode.
161 s3_beep is for debugging; it makes the PC's speaker beep
162 as soon as the kernel's real-mode entry point is called.
163 s4_nohwsig prevents ACPI hardware signature from being
164 used during resume from hibernation.
165 old_ordering causes the ACPI 1.0 ordering of the _PTS
166 control method, with respect to putting devices into
167 low power states, to be enforced (the ACPI 2.0 ordering
168 of _PTS is used by default).
169 s4_nonvs prevents the kernel from saving/restoring the
170 ACPI NVS memory during hibernation.
171
172 acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
173 Format: { level | edge | high | low }
174
175 acpi_irq_balance [HW,ACPI]
176 ACPI will balance active IRQs
177 default in APIC mode
178
179 acpi_irq_nobalance [HW,ACPI]
180 ACPI will not move active IRQs (default)
181 default in PIC mode
182
183 acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for
184 use by PCI
185 Format: <irq>,<irq>...
186
187 acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA
188 Format: <irq>,<irq>...
189
190 acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT
191
192 acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS
193 Format: To spoof as Windows 98: ="Microsoft Windows"
194
195 acpi_osi= [HW,ACPI] Modify list of supported OS interface strings
196 acpi_osi="string1" # add string1 -- only one string
197 acpi_osi="!string2" # remove built-in string2
198 acpi_osi= # disable all strings
199
200 acpi_serialize [HW,ACPI] force serialization of AML methods
201
202 acpi_skip_timer_override [HW,ACPI]
203 Recognize and ignore IRQ0/pin2 Interrupt Override.
204 For broken nForce2 BIOS resulting in XT-PIC timer.
205 acpi_use_timer_override [HW,ACPI]
206 Use timer override. For some broken Nvidia NF5 boards
207 that require a timer override, but don't have
208 HPET
209
210 acpi_backlight= [HW,ACPI] 156 acpi_backlight= [HW,ACPI]
211 acpi_backlight=vendor 157 acpi_backlight=vendor
212 acpi_backlight=video 158 acpi_backlight=video
@@ -214,11 +160,6 @@ and is between 256 and 4096 characters. It is defined in the file
214 (e.g. thinkpad_acpi, sony_acpi, etc.) instead 160 (e.g. thinkpad_acpi, sony_acpi, etc.) instead
215 of the ACPI video.ko driver. 161 of the ACPI video.ko driver.
216 162
217 acpi_display_output= [HW,ACPI]
218 acpi_display_output=vendor
219 acpi_display_output=video
220 See above.
221
222 acpi.debug_layer= [HW,ACPI,ACPI_DEBUG] 163 acpi.debug_layer= [HW,ACPI,ACPI_DEBUG]
223 acpi.debug_level= [HW,ACPI,ACPI_DEBUG] 164 acpi.debug_level= [HW,ACPI,ACPI_DEBUG]
224 Format: <int> 165 Format: <int>
@@ -247,6 +188,41 @@ and is between 256 and 4096 characters. It is defined in the file
247 unusable. The "log_buf_len" parameter may be useful 188 unusable. The "log_buf_len" parameter may be useful
248 if you need to capture more output. 189 if you need to capture more output.
249 190
191 acpi_display_output= [HW,ACPI]
192 acpi_display_output=vendor
193 acpi_display_output=video
194 See above.
195
196 acpi_irq_balance [HW,ACPI]
197 ACPI will balance active IRQs
198 default in APIC mode
199
200 acpi_irq_nobalance [HW,ACPI]
201 ACPI will not move active IRQs (default)
202 default in PIC mode
203
204 acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA
205 Format: <irq>,<irq>...
206
207 acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for
208 use by PCI
209 Format: <irq>,<irq>...
210
211 acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT
212
213 acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS
214 Format: To spoof as Windows 98: ="Microsoft Windows"
215
216 acpi_osi= [HW,ACPI] Modify list of supported OS interface strings
217 acpi_osi="string1" # add string1 -- only one string
218 acpi_osi="!string2" # remove built-in string2
219 acpi_osi= # disable all strings
220
221 acpi_pm_good [X86-32,X86-64]
222 Override the pmtimer bug detection: force the kernel
223 to assume that this machine's pmtimer latches its value
224 and always returns good values.
225
250 acpi.power_nocheck= [HW,ACPI] 226 acpi.power_nocheck= [HW,ACPI]
251 Format: 1/0 enable/disable the check of power state. 227 Format: 1/0 enable/disable the check of power state.
252 On some bogus BIOS the _PSC object/_STA object of 228 On some bogus BIOS the _PSC object/_STA object of
@@ -255,11 +231,6 @@ and is between 256 and 4096 characters. It is defined in the file
255 power state again in power transition. 231 power state again in power transition.
256 1 : disable the power state check 232 1 : disable the power state check
257 233
258 acpi_pm_good [X86-32,X86-64]
259 Override the pmtimer bug detection: force the kernel
260 to assume that this machine's pmtimer latches its value
261 and always returns good values.
262
263 acpi_enforce_resources= [ACPI] 234 acpi_enforce_resources= [ACPI]
264 { strict | lax | no } 235 { strict | lax | no }
265 Check for resource conflicts between native drivers 236 Check for resource conflicts between native drivers
@@ -276,22 +247,6 @@ and is between 256 and 4096 characters. It is defined in the file
276 no: ACPI OperationRegions are not marked as reserved, 247 no: ACPI OperationRegions are not marked as reserved,
277 no further checks are performed. 248 no further checks are performed.
278 249
279 agp= [AGP]
280 { off | try_unsupported }
281 off: disable AGP support
282 try_unsupported: try to drive unsupported chipsets
283 (may crash computer or cause data corruption)
284
285 enable_timer_pin_1 [i386,x86-64]
286 Enable PIN 1 of APIC timer
287 Can be useful to work around chipset bugs
288 (in particular on some ATI chipsets).
289 The kernel tries to set a reasonable default.
290
291 disable_timer_pin_1 [i386,x86-64]
292 Disable PIN 1 of APIC timer
293 Can be useful to work around chipset bugs.
294
295 ad1848= [HW,OSS] 250 ad1848= [HW,OSS]
296 Format: <io>,<irq>,<dma>,<dma2>,<type> 251 Format: <io>,<irq>,<dma>,<dma2>,<type>
297 252
@@ -305,6 +260,12 @@ and is between 256 and 4096 characters. It is defined in the file
305 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> 260 Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq>
306 See also header of sound/oss/aedsp16.c. 261 See also header of sound/oss/aedsp16.c.
307 262
263 agp= [AGP]
264 { off | try_unsupported }
265 off: disable AGP support
266 try_unsupported: try to drive unsupported chipsets
267 (may crash computer or cause data corruption)
268
308 aha152x= [HW,SCSI] 269 aha152x= [HW,SCSI]
309 See Documentation/scsi/aha152x.txt. 270 See Documentation/scsi/aha152x.txt.
310 271
@@ -432,12 +393,6 @@ and is between 256 and 4096 characters. It is defined in the file
432 possible to determine what the correct size should be. 393 possible to determine what the correct size should be.
433 This option provides an override for these situations. 394 This option provides an override for these situations.
434 395
435 security= [SECURITY] Choose a security module to enable at boot.
436 If this boot parameter is not specified, only the first
437 security module asking for security registration will be
438 loaded. An invalid security module name will be treated
439 as if no module has been chosen.
440
441 capability.disable= 396 capability.disable=
442 [SECURITY] Disable capabilities. This would normally 397 [SECURITY] Disable capabilities. This would normally
443 be used only if an alternative security model is to be 398 be used only if an alternative security model is to be
@@ -509,24 +464,6 @@ and is between 256 and 4096 characters. It is defined in the file
509 Range: 0 - 8192 464 Range: 0 - 8192
510 Default: 64 465 Default: 64
511 466
512 dma_debug=off If the kernel is compiled with DMA_API_DEBUG support
513 this option disables the debugging code at boot.
514
515 dma_debug_entries=<number>
516 This option allows to tune the number of preallocated
517 entries for DMA-API debugging code. One entry is
518 required per DMA-API allocation. Use this if the
519 DMA-API debugging code disables itself because the
520 architectural default is too low.
521
522 hpet= [X86-32,HPET] option to control HPET usage
523 Format: { enable (default) | disable | force |
524 verbose }
525 disable: disable HPET and use PIT instead
526 force: allow force enabled of undocumented chips (ICH4,
527 VIA, nVidia)
528 verbose: show contents of HPET registers during setup
529
530 com20020= [HW,NET] ARCnet - COM20020 chipset 467 com20020= [HW,NET] ARCnet - COM20020 chipset
531 Format: 468 Format:
532 <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]] 469 <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]]
@@ -570,23 +507,6 @@ and is between 256 and 4096 characters. It is defined in the file
570 console=brl,ttyS0 507 console=brl,ttyS0
571 For now, only VisioBraille is supported. 508 For now, only VisioBraille is supported.
572 509
573 earlycon= [KNL] Output early console device and options.
574 uart[8250],io,<addr>[,options]
575 uart[8250],mmio,<addr>[,options]
576 Start an early, polled-mode console on the 8250/16550
577 UART at the specified I/O port or MMIO address.
578 The options are the same as for ttyS, above.
579
580 no_console_suspend
581 [HW] Never suspend the console
582 Disable suspending of consoles during suspend and
583 hibernate operations. Once disabled, debugging
584 messages can reach various consoles while the rest
585 of the system is being put to sleep (ie, while
586 debugging driver suspend/resume hooks). This may
587 not work reliably with all consoles, but is known
588 to work with serial and VGA consoles.
589
590 coredump_filter= 510 coredump_filter=
591 [KNL] Change the default value for 511 [KNL] Change the default value for
592 /proc/<pid>/coredump_filter. 512 /proc/<pid>/coredump_filter.
@@ -643,30 +563,13 @@ and is between 256 and 4096 characters. It is defined in the file
643 Format: <area>[,<node>] 563 Format: <area>[,<node>]
644 See also Documentation/networking/decnet.txt. 564 See also Documentation/networking/decnet.txt.
645 565
646 vt.default_blu= [VT] 566 default_hugepagesz=
647 Format: <blue0>,<blue1>,<blue2>,...,<blue15> 567 [same as hugepagesz=] The size of the default
648 Change the default blue palette of the console. 568 HugeTLB page size. This is the size represented by
649 This is a 16-member array composed of values 569 the legacy /proc/ hugepages APIs, used for SHM, and
650 ranging from 0-255. 570 default size when mounting hugetlbfs filesystems.
651 571 Defaults to the default architecture's huge page size
652 vt.default_grn= [VT] 572 if not specified.
653 Format: <green0>,<green1>,<green2>,...,<green15>
654 Change the default green palette of the console.
655 This is a 16-member array composed of values
656 ranging from 0-255.
657
658 vt.default_red= [VT]
659 Format: <red0>,<red1>,<red2>,...,<red15>
660 Change the default red palette of the console.
661 This is a 16-member array composed of values
662 ranging from 0-255.
663
664 vt.default_utf8=
665 [VT]
666 Format=<0|1>
667 Set system-wide default UTF-8 mode for all tty's.
668 Default is 1, i.e. UTF-8 mode is enabled for all
669 newly opened terminals.
670 573
671 dhash_entries= [KNL] 574 dhash_entries= [KNL]
672 Set number of hash buckets for dentry cache. 575 Set number of hash buckets for dentry cache.
@@ -679,27 +582,9 @@ and is between 256 and 4096 characters. It is defined in the file
679 Documentation/serial/digiepca.txt. 582 Documentation/serial/digiepca.txt.
680 583
681 disable_mtrr_cleanup [X86] 584 disable_mtrr_cleanup [X86]
682 enable_mtrr_cleanup [X86]
683 The kernel tries to adjust MTRR layout from continuous 585 The kernel tries to adjust MTRR layout from continuous
684 to discrete, to make X server driver able to add WB 586 to discrete, to make X server driver able to add WB
685 entry later. This parameter enables/disables that. 587 entry later. This parameter disables that.
686
687 mtrr_chunk_size=nn[KMG] [X86]
688 used for mtrr cleanup. It is the largest continuous chunk
689 that could hold holes aka. UC entries.
690
691 mtrr_gran_size=nn[KMG] [X86]
692 Used for mtrr cleanup. It is granularity of mtrr block.
693 Default is 1.
694 Large value could prevent small alignment from
695 using up MTRRs.
696
697 mtrr_spare_reg_nr=n [X86]
698 Format: <integer>
699 Range: 0,7 : spare reg number
700 Default : 1
701 Used for mtrr cleanup. It is spare mtrr entries number.
702 Set to 2 or more if your graphical card needs more.
703 588
704 disable_mtrr_trim [X86, Intel and AMD only] 589 disable_mtrr_trim [X86, Intel and AMD only]
705 By default the kernel will trim any uncacheable 590 By default the kernel will trim any uncacheable
@@ -707,12 +592,38 @@ and is between 256 and 4096 characters. It is defined in the file
707 MTRR settings. This parameter disables that behavior, 592 MTRR settings. This parameter disables that behavior,
708 possibly causing your machine to run very slowly. 593 possibly causing your machine to run very slowly.
709 594
595 disable_timer_pin_1 [i386,x86-64]
596 Disable PIN 1 of APIC timer
597 Can be useful to work around chipset bugs.
598
710 dmasound= [HW,OSS] Sound subsystem buffers 599 dmasound= [HW,OSS] Sound subsystem buffers
711 600
601 dma_debug=off If the kernel is compiled with DMA_API_DEBUG support,
602 this option disables the debugging code at boot.
603
604 dma_debug_entries=<number>
605 This option allows to tune the number of preallocated
606 entries for DMA-API debugging code. One entry is
607 required per DMA-API allocation. Use this if the
608 DMA-API debugging code disables itself because the
609 architectural default is too low.
610
712 dscc4.setup= [NET] 611 dscc4.setup= [NET]
713 612
714 dtc3181e= [HW,SCSI] 613 dtc3181e= [HW,SCSI]
715 614
615 dynamic_printk Enables pr_debug()/dev_dbg() calls if
616 CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled.
617 These can also be switched on/off via
618 <debugfs>/dynamic_printk/modules
619
620 earlycon= [KNL] Output early console device and options.
621 uart[8250],io,<addr>[,options]
622 uart[8250],mmio,<addr>[,options]
623 Start an early, polled-mode console on the 8250/16550
624 UART at the specified I/O port or MMIO address.
625 The options are the same as for ttyS, above.
626
716 earlyprintk= [X86-32,X86-64,SH,BLACKFIN] 627 earlyprintk= [X86-32,X86-64,SH,BLACKFIN]
717 earlyprintk=vga 628 earlyprintk=vga
718 earlyprintk=serial[,ttySn[,baudrate]] 629 earlyprintk=serial[,ttySn[,baudrate]]
@@ -754,6 +665,17 @@ and is between 256 and 4096 characters. It is defined in the file
754 pass this option to capture kernel. 665 pass this option to capture kernel.
755 See Documentation/kdump/kdump.txt for details. 666 See Documentation/kdump/kdump.txt for details.
756 667
668 enable_mtrr_cleanup [X86]
669 The kernel tries to adjust MTRR layout from continuous
670 to discrete, to make X server driver able to add WB
671 entry later. This parameter enables that.
672
673 enable_timer_pin_1 [i386,x86-64]
674 Enable PIN 1 of APIC timer
675 Can be useful to work around chipset bugs
676 (in particular on some ATI chipsets).
677 The kernel tries to set a reasonable default.
678
757 enforcing [SELINUX] Set initial enforcing status. 679 enforcing [SELINUX] Set initial enforcing status.
758 Format: {"0" | "1"} 680 Format: {"0" | "1"}
759 See security/selinux/Kconfig help text. 681 See security/selinux/Kconfig help text.
@@ -841,6 +763,16 @@ and is between 256 and 4096 characters. It is defined in the file
841 hisax= [HW,ISDN] 763 hisax= [HW,ISDN]
842 See Documentation/isdn/README.HiSax. 764 See Documentation/isdn/README.HiSax.
843 765
766 hlt [BUGS=ARM,SH]
767
768 hpet= [X86-32,HPET] option to control HPET usage
769 Format: { enable (default) | disable | force |
770 verbose }
771 disable: disable HPET and use PIT instead
772 force: allow force enabled of undocumented chips (ICH4,
773 VIA, nVidia)
774 verbose: show contents of HPET registers during setup
775
844 hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. 776 hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
845 hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages. 777 hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
846 On x86-64 and powerpc, this option can be specified 778 On x86-64 and powerpc, this option can be specified
@@ -850,15 +782,6 @@ and is between 256 and 4096 characters. It is defined in the file
850 (when the CPU supports the "pdpe1gb" cpuinfo flag) 782 (when the CPU supports the "pdpe1gb" cpuinfo flag)
851 Note that 1GB pages can only be allocated at boot time 783 Note that 1GB pages can only be allocated at boot time
852 using hugepages= and not freed afterwards. 784 using hugepages= and not freed afterwards.
853 default_hugepagesz=
854 [same as hugepagesz=] The size of the default
855 HugeTLB page size. This is the size represented by
856 the legacy /proc/ hugepages APIs, used for SHM, and
857 default size when mounting hugetlbfs filesystems.
858 Defaults to the default architecture's huge page size
859 if not specified.
860
861 hlt [BUGS=ARM,SH]
862 785
863 hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) 786 hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
864 terminal devices. Valid values: 0..8 787 terminal devices. Valid values: 0..8
@@ -919,6 +842,9 @@ and is between 256 and 4096 characters. It is defined in the file
919 idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed 842 idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed
920 See Documentation/ide/ide.txt. 843 See Documentation/ide/ide.txt.
921 844
845 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
846 Claim all unknown PCI IDE storage controllers.
847
922 idle= [X86] 848 idle= [X86]
923 Format: idle=poll, idle=mwait, idle=halt, idle=nomwait 849 Format: idle=poll, idle=mwait, idle=halt, idle=nomwait
924 Poll forces a polling idle loop that can slightly 850 Poll forces a polling idle loop that can slightly
@@ -934,9 +860,6 @@ and is between 256 and 4096 characters. It is defined in the file
934 In such case C2/C3 won't be used again. 860 In such case C2/C3 won't be used again.
935 idle=nomwait: Disable mwait for CPU C-states 861 idle=nomwait: Disable mwait for CPU C-states
936 862
937 ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem
938 Claim all unknown PCI IDE storage controllers.
939
940 ignore_loglevel [KNL] 863 ignore_loglevel [KNL]
941 Ignore loglevel setting - this will print /all/ 864 Ignore loglevel setting - this will print /all/
942 kernel messages to the console. Useful for debugging. 865 kernel messages to the console. Useful for debugging.
@@ -970,25 +893,6 @@ and is between 256 and 4096 characters. It is defined in the file
970 inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver 893 inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver
971 Format: <irq> 894 Format: <irq>
972 895
973 inttest= [IA64]
974
975 iomem= Disable strict checking of access to MMIO memory
976 strict regions from userspace.
977 relaxed
978
979 iommu= [x86]
980 off
981 force
982 noforce
983 biomerge
984 panic
985 nopanic
986 merge
987 nomerge
988 forcesac
989 soft
990
991
992 intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option 896 intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option
993 on 897 on
994 Enable intel iommu driver. 898 Enable intel iommu driver.
@@ -1012,6 +916,28 @@ and is between 256 and 4096 characters. It is defined in the file
1012 result in a hardware IOTLB flush operation as opposed 916 result in a hardware IOTLB flush operation as opposed
1013 to batching them for performance. 917 to batching them for performance.
1014 918
919 inttest= [IA64]
920
921 iomem= Disable strict checking of access to MMIO memory
922 strict regions from userspace.
923 relaxed
924
925 iommu= [x86]
926 off
927 force
928 noforce
929 biomerge
930 panic
931 nopanic
932 merge
933 nomerge
934 forcesac
935 soft
936
937 io7= [HW] IO7 for Marvel based alpha systems
938 See comment before marvel_specify_io7 in
939 arch/alpha/kernel/core_marvel.c.
940
1015 io_delay= [X86-32,X86-64] I/O delay method 941 io_delay= [X86-32,X86-64] I/O delay method
1016 0x80 942 0x80
1017 Standard port 0x80 based delay 943 Standard port 0x80 based delay
@@ -1022,10 +948,6 @@ and is between 256 and 4096 characters. It is defined in the file
1022 none 948 none
1023 No delay 949 No delay
1024 950
1025 io7= [HW] IO7 for Marvel based alpha systems
1026 See comment before marvel_specify_io7 in
1027 arch/alpha/kernel/core_marvel.c.
1028
1029 ip= [IP_PNP] 951 ip= [IP_PNP]
1030 See Documentation/filesystems/nfsroot.txt. 952 See Documentation/filesystems/nfsroot.txt.
1031 953
@@ -1036,12 +958,6 @@ and is between 256 and 4096 characters. It is defined in the file
1036 ips= [HW,SCSI] Adaptec / IBM ServeRAID controller 958 ips= [HW,SCSI] Adaptec / IBM ServeRAID controller
1037 See header of drivers/scsi/ips.c. 959 See header of drivers/scsi/ips.c.
1038 960
1039 ports= [IP_VS_FTP] IPVS ftp helper module
1040 Default is 21.
1041 Up to 8 (IP_VS_APP_MAX_PORTS) ports
1042 may be specified.
1043 Format: <port>,<port>....
1044
1045 irqfixup [HW] 961 irqfixup [HW]
1046 When an interrupt is not handled search all handlers 962 When an interrupt is not handled search all handlers
1047 for it. Intended to get systems with badly broken 963 for it. Intended to get systems with badly broken
@@ -1082,6 +998,8 @@ and is between 256 and 4096 characters. It is defined in the file
1082 js= [HW,JOY] Analog joystick 998 js= [HW,JOY] Analog joystick
1083 See Documentation/input/joystick.txt. 999 See Documentation/input/joystick.txt.
1084 1000
1001 keepinitrd [HW,ARM]
1002
1085 kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter 1003 kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
1086 specifies the amount of memory usable by the kernel 1004 specifies the amount of memory usable by the kernel
1087 for non-movable allocations. The requested amount is 1005 for non-movable allocations. The requested amount is
@@ -1107,21 +1025,6 @@ and is between 256 and 4096 characters. It is defined in the file
1107 higher than default (KMEMTRACE_N_SUBBUFS in code) if 1025 higher than default (KMEMTRACE_N_SUBBUFS in code) if
1108 you experience buffer overruns. 1026 you experience buffer overruns.
1109 1027
1110 movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
1111 is similar to kernelcore except it specifies the
1112 amount of memory used for migratable allocations.
1113 If both kernelcore and movablecore is specified,
1114 then kernelcore will be at *least* the specified
1115 value but may be more. If movablecore on its own
1116 is specified, the administrator must be careful
1117 that the amount of memory usable for all allocations
1118 is not too small.
1119
1120 keepinitrd [HW,ARM]
1121
1122 kstack=N [X86-32,X86-64] Print N words from the kernel stack
1123 in oops dumps.
1124
1125 kgdboc= [HW] kgdb over consoles. 1028 kgdboc= [HW] kgdb over consoles.
1126 Requires a tty driver that supports console polling. 1029 Requires a tty driver that supports console polling.
1127 (only serial supported for now) 1030 (only serial supported for now)
@@ -1131,6 +1034,9 @@ and is between 256 and 4096 characters. It is defined in the file
1131 Configure the RouterBoard 532 series on-chip 1034 Configure the RouterBoard 532 series on-chip
1132 Ethernet adapter MAC address. 1035 Ethernet adapter MAC address.
1133 1036
1037 kstack=N [X86-32,X86-64] Print N words from the kernel stack
1038 in oops dumps.
1039
1134 l2cr= [PPC] 1040 l2cr= [PPC]
1135 1041
1136 l3cr= [PPC] 1042 l3cr= [PPC]
@@ -1276,9 +1182,8 @@ and is between 256 and 4096 characters. It is defined in the file
1276 (machvec) in a generic kernel. 1182 (machvec) in a generic kernel.
1277 Example: machvec=hpzx1_swiotlb 1183 Example: machvec=hpzx1_swiotlb
1278 1184
1279 max_loop= [LOOP] Maximum number of loopback devices that can 1185 max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater
1280 be mounted 1186 than or equal to this physical address is ignored.
1281 Format: <1-256>
1282 1187
1283 maxcpus= [SMP] Maximum number of processors that an SMP kernel 1188 maxcpus= [SMP] Maximum number of processors that an SMP kernel
1284 should make use of. maxcpus=n : n >= 0 limits the 1189 should make use of. maxcpus=n : n >= 0 limits the
@@ -1286,8 +1191,9 @@ and is between 256 and 4096 characters. It is defined in the file
1286 it is equivalent to "nosmp", which also disables 1191 it is equivalent to "nosmp", which also disables
1287 the IO APIC. 1192 the IO APIC.
1288 1193
1289 max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater than 1194 max_loop= [LOOP] Maximum number of loopback devices that can
1290 or equal to this physical address is ignored. 1195 be mounted
1196 Format: <1-256>
1291 1197
1292 max_luns= [SCSI] Maximum number of LUNs to probe. 1198 max_luns= [SCSI] Maximum number of LUNs to probe.
1293 Should be between 1 and 2^32-1. 1199 Should be between 1 and 2^32-1.
@@ -1414,6 +1320,16 @@ and is between 256 and 4096 characters. It is defined in the file
1414 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices 1320 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices
1415 reporting absolute coordinates, such as tablets 1321 reporting absolute coordinates, such as tablets
1416 1322
1323 movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
1324 is similar to kernelcore except it specifies the
1325 amount of memory used for migratable allocations.
1326 If both kernelcore and movablecore are specified,
1327 then kernelcore will be at *least* the specified
1328 value but may be more. If movablecore on its own
1329 is specified, the administrator must be careful
1330 that the amount of memory usable for all allocations
1331 is not too small.
1332
1417 mpu401= [HW,OSS] 1333 mpu401= [HW,OSS]
1418 Format: <io>,<irq> 1334 Format: <io>,<irq>
1419 1335
@@ -1435,6 +1351,23 @@ and is between 256 and 4096 characters. It is defined in the file
1435 [HW] Make the MicroTouch USB driver use raw coordinates 1351 [HW] Make the MicroTouch USB driver use raw coordinates
1436 ('y', default) or cooked coordinates ('n') 1352 ('y', default) or cooked coordinates ('n')
1437 1353
1354 mtrr_chunk_size=nn[KMG] [X86]
1355 used for mtrr cleanup. It is the largest continuous chunk
1356 that could hold holes aka. UC entries.
1357
1358 mtrr_gran_size=nn[KMG] [X86]
1359 Used for mtrr cleanup. It is granularity of mtrr block.
1360 Default is 1.
1361 Large value could prevent small alignment from
1362 using up MTRRs.
1363
1364 mtrr_spare_reg_nr=n [X86]
1365 Format: <integer>
1366 Range: 0,7 : spare reg number
1367 Default : 1
1368 Used for mtrr cleanup. It is spare mtrr entries number.
1369 Set to 2 or more if your graphical card needs more.
1370
1438 n2= [NET] SDL Inc. RISCom/N2 synchronous serial card 1371 n2= [NET] SDL Inc. RISCom/N2 synchronous serial card
1439 1372
1440 NCR_D700= [HW,SCSI] 1373 NCR_D700= [HW,SCSI]
@@ -1495,11 +1428,13 @@ and is between 256 and 4096 characters. It is defined in the file
1495 0 - turn nmi_watchdog off 1428 0 - turn nmi_watchdog off
1496 1 - use the IO-APIC timer for the NMI watchdog 1429 1 - use the IO-APIC timer for the NMI watchdog
1497 2 - use the local APIC for the NMI watchdog using 1430 2 - use the local APIC for the NMI watchdog using
1498 a performance counter. Note: This will use one performance 1431 a performance counter. Note: This will use one
1499 counter and the local APIC's performance vector. 1432 performance counter and the local APIC's performance
1500 When panic is specified panic when an NMI watchdog timeout occurs. 1433 vector.
1501 This is useful when you use a panic=... timeout and need the box 1434 When panic is specified, panic when an NMI watchdog
1502 quickly up again. 1435 timeout occurs.
1436 This is useful when you use a panic=... timeout and
1437 need the box quickly up again.
1503 Instead of 1 and 2 it is possible to use the following 1438 Instead of 1 and 2 it is possible to use the following
1504 symbolic names: lapic and ioapic 1439 symbolic names: lapic and ioapic
1505 Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic 1440 Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
@@ -1508,6 +1443,16 @@ and is between 256 and 4096 characters. It is defined in the file
1508 emulation library even if a 387 maths coprocessor 1443 emulation library even if a 387 maths coprocessor
1509 is present. 1444 is present.
1510 1445
1446 no_console_suspend
1447 [HW] Never suspend the console
1448 Disable suspending of consoles during suspend and
1449 hibernate operations. Once disabled, debugging
1450 messages can reach various consoles while the rest
1451 of the system is being put to sleep (ie, while
1452 debugging driver suspend/resume hooks). This may
1453 not work reliably with all consoles, but is known
1454 to work with serial and VGA consoles.
1455
1511 noaliencache [MM, NUMA, SLAB] Disables the allocation of alien 1456 noaliencache [MM, NUMA, SLAB] Disables the allocation of alien
1512 caches in the slab allocator. Saves per-node memory, 1457 caches in the slab allocator. Saves per-node memory,
1513 but will impact performance. 1458 but will impact performance.
@@ -1522,6 +1467,8 @@ and is between 256 and 4096 characters. It is defined in the file
1522 1467
1523 nocache [ARM] 1468 nocache [ARM]
1524 1469
1470 noclflush [BUGS=X86] Don't use the CLFLUSH instruction
1471
1525 nodelayacct [KNL] Disable per-task delay accounting 1472 nodelayacct [KNL] Disable per-task delay accounting
1526 1473
1527 nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects. 1474 nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects.
@@ -1550,8 +1497,6 @@ and is between 256 and 4096 characters. It is defined in the file
1550 register save and restore. The kernel will only save 1497 register save and restore. The kernel will only save
1551 legacy floating-point registers on task switch. 1498 legacy floating-point registers on task switch.
1552 1499
1553 noclflush [BUGS=X86] Don't use the CLFLUSH instruction
1554
1555 nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or 1500 nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or
1556 wfi(ARM) instruction doesn't work correctly and not to 1501 wfi(ARM) instruction doesn't work correctly and not to
1557 use it. This is also useful when using JTAG debugger. 1502 use it. This is also useful when using JTAG debugger.
@@ -1596,12 +1541,6 @@ and is between 256 and 4096 characters. It is defined in the file
1596 1541
1597 nolapic_timer [X86-32,APIC] Do not use the local APIC timer. 1542 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
1598 1543
1599 nox2apic [X86-64,APIC] Do not enable x2APIC mode.
1600
1601 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
1602 default x2apic cluster mode on platforms
1603 supporting x2apic.
1604
1605 noltlbs [PPC] Do not use large page/tlb entries for kernel 1544 noltlbs [PPC] Do not use large page/tlb entries for kernel
1606 lowmem mapping on PPC40x. 1545 lowmem mapping on PPC40x.
1607 1546
@@ -1612,6 +1551,9 @@ and is between 256 and 4096 characters. It is defined in the file
1612 nomfgpt [X86-32] Disable Multi-Function General Purpose 1551 nomfgpt [X86-32] Disable Multi-Function General Purpose
1613 Timer usage (for AMD Geode machines). 1552 Timer usage (for AMD Geode machines).
1614 1553
1554 norandmaps Don't use address space randomization. Equivalent to
1555 echo 0 > /proc/sys/kernel/randomize_va_space
1556
1615 noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops 1557 noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops
1616 1558
1617 noreplace-smp [X86-32,SMP] Don't replace SMP instructions 1559 noreplace-smp [X86-32,SMP] Don't replace SMP instructions
@@ -1650,13 +1592,13 @@ and is between 256 and 4096 characters. It is defined in the file
1650 purges which is reported from either PAL_VM_SUMMARY or 1592 purges which is reported from either PAL_VM_SUMMARY or
1651 SAL PALO. 1593 SAL PALO.
1652 1594
1595 nr_uarts= [SERIAL] maximum number of UARTs to be registered.
1596
1653 numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. 1597 numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
1654 one of ['zone', 'node', 'default'] can be specified 1598 one of ['zone', 'node', 'default'] can be specified
1655 This can be set from sysctl after boot. 1599 This can be set from sysctl after boot.
1656 See Documentation/sysctl/vm.txt for details. 1600 See Documentation/sysctl/vm.txt for details.
1657 1601
1658 nr_uarts= [SERIAL] maximum number of UARTs to be registered.
1659
1660 ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. 1602 ohci1394_dma=early [HW] enable debugging via the ohci1394 driver.
1661 See Documentation/debugging-via-ohci1394.txt for more 1603 See Documentation/debugging-via-ohci1394.txt for more
1662 info. 1604 info.
@@ -1905,6 +1847,14 @@ and is between 256 and 4096 characters. It is defined in the file
1905 printk.time= Show timing data prefixed to each printk message line 1847 printk.time= Show timing data prefixed to each printk message line
1906 Format: <bool> (1/Y/y=enable, 0/N/n=disable) 1848 Format: <bool> (1/Y/y=enable, 0/N/n=disable)
1907 1849
1850 processor.max_cstate= [HW,ACPI]
1851 Limit processor to maximum C-state
1852 max_cstate=9 overrides any DMI blacklist limit.
1853
1854 processor.nocst [HW,ACPI]
1855 Ignore the _CST method to determine C-states,
1856 instead using the legacy FADT method
1857
1908 profile= [KNL] Enable kernel profiling via /proc/profile 1858 profile= [KNL] Enable kernel profiling via /proc/profile
1909 Format: [schedule,]<number> 1859 Format: [schedule,]<number>
1910 Param: "schedule" - profile schedule points. 1860 Param: "schedule" - profile schedule points.
@@ -1914,14 +1864,6 @@ and is between 256 and 4096 characters. It is defined in the file
1914 Requires CONFIG_SCHEDSTATS 1864 Requires CONFIG_SCHEDSTATS
1915 Param: "kvm" - profile VM exits. 1865 Param: "kvm" - profile VM exits.
1916 1866
1917 processor.max_cstate= [HW,ACPI]
1918 Limit processor to maximum C-state
1919 max_cstate=9 overrides any DMI blacklist limit.
1920
1921 processor.nocst [HW,ACPI]
1922 Ignore the _CST method to determine C-states,
1923 instead using the legacy FADT method
1924
1925 prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk 1867 prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk
1926 before loading. 1868 before loading.
1927 See Documentation/blockdev/ramdisk.txt. 1869 See Documentation/blockdev/ramdisk.txt.
@@ -2075,7 +2017,13 @@ and is between 256 and 4096 characters. It is defined in the file
2075 allowing boot to proceed. none ignores them, expecting 2017 allowing boot to proceed. none ignores them, expecting
2076 user space to do the scan. 2018 user space to do the scan.
2077 2019
2078 selinux [SELINUX] Disable or enable SELinux at boot time. 2020 security= [SECURITY] Choose a security module to enable at boot.
2021 If this boot parameter is not specified, only the first
2022 security module asking for security registration will be
2023 loaded. An invalid security module name will be treated
2024 as if no module has been chosen.
2025
2026 selinux= [SELINUX] Disable or enable SELinux at boot time.
2079 Format: { "0" | "1" } 2027 Format: { "0" | "1" }
2080 See security/selinux/Kconfig help text. 2028 See security/selinux/Kconfig help text.
2081 0 -- disable. 2029 0 -- disable.
@@ -2499,9 +2447,6 @@ and is between 256 and 4096 characters. It is defined in the file
2499 medium is write-protected). 2447 medium is write-protected).
2500 Example: quirks=0419:aaf5:rl,0421:0433:rc 2448 Example: quirks=0419:aaf5:rl,0421:0433:rc
2501 2449
2502 add_efi_memmap [EFI; x86-32,X86-64] Include EFI memory map in
2503 kernel's map of available physical RAM.
2504
2505 vdso= [X86-32,SH,x86-64] 2450 vdso= [X86-32,SH,x86-64]
2506 vdso=2: enable compat VDSO (default with COMPAT_VDSO) 2451 vdso=2: enable compat VDSO (default with COMPAT_VDSO)
2507 vdso=1: enable VDSO (default) 2452 vdso=1: enable VDSO (default)
@@ -2540,6 +2485,31 @@ and is between 256 and 4096 characters. It is defined in the file
2540 vmpoff= [KNL,S390] Perform z/VM CP command after power off. 2485 vmpoff= [KNL,S390] Perform z/VM CP command after power off.
2541 Format: <command> 2486 Format: <command>
2542 2487
2488 vt.default_blu= [VT]
2489 Format: <blue0>,<blue1>,<blue2>,...,<blue15>
2490 Change the default blue palette of the console.
2491 This is a 16-member array composed of values
2492 ranging from 0-255.
2493
2494 vt.default_grn= [VT]
2495 Format: <green0>,<green1>,<green2>,...,<green15>
2496 Change the default green palette of the console.
2497 This is a 16-member array composed of values
2498 ranging from 0-255.
2499
2500 vt.default_red= [VT]
2501 Format: <red0>,<red1>,<red2>,...,<red15>
2502 Change the default red palette of the console.
2503 This is a 16-member array composed of values
2504 ranging from 0-255.
2505
2506 vt.default_utf8=
2507 [VT]
2508 Format: <0|1>
2509 Set system-wide default UTF-8 mode for all tty's.
2510 Default is 1, i.e. UTF-8 mode is enabled for all
2511 newly opened terminals.
2512
2543 waveartist= [HW,OSS] 2513 waveartist= [HW,OSS]
2544 Format: <io>,<irq>,<dma>,<dma2> 2514 Format: <io>,<irq>,<dma>,<dma2>
2545 2515
@@ -2552,6 +2522,10 @@ and is between 256 and 4096 characters. It is defined in the file
2552 wdt= [WDT] Watchdog 2522 wdt= [WDT] Watchdog
2553 See Documentation/watchdog/wdt.txt. 2523 See Documentation/watchdog/wdt.txt.
2554 2524
2525 x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
2526 default x2apic cluster mode on platforms
2527 supporting x2apic.
2528
2555 xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. 2529 xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks.
2556 xd_geo= See header of drivers/block/xd.c. 2530 xd_geo= See header of drivers/block/xd.c.
2557 2531
@@ -2559,9 +2533,6 @@ and is between 256 and 4096 characters. It is defined in the file
2559 Format: 2533 Format:
2560 <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] 2534 <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
2561 2535
2562 norandmaps Don't use address space randomization. Equivalent to
2563 echo 0 > /proc/sys/kernel/randomize_va_space
2564
2565______________________________________________________________________ 2536______________________________________________________________________
2566 2537
2567TODO: 2538TODO:
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 48b3de90eb1e..1e7a769a10f9 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -212,7 +212,9 @@ hit, Kprobes calls kp->pre_handler. After the probed instruction
212is single-stepped, Kprobe calls kp->post_handler. If a fault 212is single-stepped, Kprobe calls kp->post_handler. If a fault
213occurs during execution of kp->pre_handler or kp->post_handler, 213occurs during execution of kp->pre_handler or kp->post_handler,
214or during single-stepping of the probed instruction, Kprobes calls 214or during single-stepping of the probed instruction, Kprobes calls
215kp->fault_handler. Any or all handlers can be NULL. 215kp->fault_handler. Any or all handlers can be NULL. If kp->flags
216is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled,
217so, its handlers aren't hit until calling enable_kprobe(kp).
216 218
217NOTE: 219NOTE:
2181. With the introduction of the "symbol_name" field to struct kprobe, 2201. With the introduction of the "symbol_name" field to struct kprobe,
@@ -363,6 +365,26 @@ probes) in the specified array, they clear the addr field of those
363incorrect probes. However, other probes in the array are 365incorrect probes. However, other probes in the array are
364unregistered correctly. 366unregistered correctly.
365 367
3684.7 disable_*probe
369
370#include <linux/kprobes.h>
371int disable_kprobe(struct kprobe *kp);
372int disable_kretprobe(struct kretprobe *rp);
373int disable_jprobe(struct jprobe *jp);
374
375Temporarily disables the specified *probe. You can enable it again by using
376enable_*probe(). You must specify the probe which has been registered.
377
3784.8 enable_*probe
379
380#include <linux/kprobes.h>
381int enable_kprobe(struct kprobe *kp);
382int enable_kretprobe(struct kretprobe *rp);
383int enable_jprobe(struct jprobe *jp);
384
385Enables *probe which has been disabled by disable_*probe(). You must specify
386the probe which has been registered.
387
3665. Kprobes Features and Limitations 3885. Kprobes Features and Limitations
367 389
368Kprobes allows multiple probes at the same address. Currently, 390Kprobes allows multiple probes at the same address. Currently,
@@ -500,10 +522,14 @@ the probe. If the probed function belongs to a module, the module name
500is also specified. Following columns show probe status. If the probe is on 522is also specified. Following columns show probe status. If the probe is on
501a virtual address that is no longer valid (module init sections, module 523a virtual address that is no longer valid (module init sections, module
502virtual addresses that correspond to modules that've been unloaded), 524virtual addresses that correspond to modules that've been unloaded),
503such probes are marked with [GONE]. 525such probes are marked with [GONE]. If the probe is temporarily disabled,
526such probes are marked with [DISABLED].
504 527
505/debug/kprobes/enabled: Turn kprobes ON/OFF 528/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
506 529
507Provides a knob to globally turn registered kprobes ON or OFF. By default, 530Provides a knob to globally and forcibly turn registered kprobes ON or OFF.
508all kprobes are enabled. By echoing "0" to this file, all registered probes 531By default, all kprobes are enabled. By echoing "0" to this file, all
509will be disarmed, till such time a "1" is echoed to this file. 532registered probes will be disarmed, till such time a "1" is echoed to this
533file. Note that this knob just disarms and arms all kprobes and doesn't
534change each probe's disabling state. This means that disabled kprobes (marked
535[DISABLED]) will not be enabled if you turn ON all kprobes by this knob.
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt
index 0ab0230cbcb0..d16b7a1c3793 100644
--- a/Documentation/powerpc/booting-without-of.txt
+++ b/Documentation/powerpc/booting-without-of.txt
@@ -43,12 +43,11 @@ Table of Contents
43 2) Representing devices without a current OF specification 43 2) Representing devices without a current OF specification
44 a) PHY nodes 44 a) PHY nodes
45 b) Interrupt controllers 45 b) Interrupt controllers
46 c) CFI or JEDEC memory-mapped NOR flash 46 c) 4xx/Axon EMAC ethernet nodes
47 d) 4xx/Axon EMAC ethernet nodes 47 d) Xilinx IP cores
48 e) Xilinx IP cores 48 e) USB EHCI controllers
49 f) USB EHCI controllers 49 f) MDIO on GPIOs
50 g) MDIO on GPIOs 50 g) SPI busses
51 h) SPI busses
52 51
53 VII - Marvell Discovery mv64[345]6x System Controller chips 52 VII - Marvell Discovery mv64[345]6x System Controller chips
54 1) The /system-controller node 53 1) The /system-controller node
@@ -999,7 +998,7 @@ compatibility.
999 translation of SOC addresses for memory mapped SOC registers. 998 translation of SOC addresses for memory mapped SOC registers.
1000 - bus-frequency: Contains the bus frequency for the SOC node. 999 - bus-frequency: Contains the bus frequency for the SOC node.
1001 Typically, the value of this field is filled in by the boot 1000 Typically, the value of this field is filled in by the boot
1002 loader. 1001 loader.
1003 1002
1004 1003
1005 Recommended properties: 1004 Recommended properties:
@@ -1287,71 +1286,7 @@ platforms are moved over to use the flattened-device-tree model.
1287 device_type = "open-pic"; 1286 device_type = "open-pic";
1288 }; 1287 };
1289 1288
1290 c) CFI or JEDEC memory-mapped NOR flash 1289 c) 4xx/Axon EMAC ethernet nodes
1291
1292 Flash chips (Memory Technology Devices) are often used for solid state
1293 file systems on embedded devices.
1294
1295 - compatible : should contain the specific model of flash chip(s)
1296 used, if known, followed by either "cfi-flash" or "jedec-flash"
1297 - reg : Address range of the flash chip
1298 - bank-width : Width (in bytes) of the flash bank. Equal to the
1299 device width times the number of interleaved chips.
1300 - device-width : (optional) Width of a single flash chip. If
1301 omitted, assumed to be equal to 'bank-width'.
1302 - #address-cells, #size-cells : Must be present if the flash has
1303 sub-nodes representing partitions (see below). In this case
1304 both #address-cells and #size-cells must be equal to 1.
1305
1306 For JEDEC compatible devices, the following additional properties
1307 are defined:
1308
1309 - vendor-id : Contains the flash chip's vendor id (1 byte).
1310 - device-id : Contains the flash chip's device id (1 byte).
1311
1312 In addition to the information on the flash bank itself, the
1313 device tree may optionally contain additional information
1314 describing partitions of the flash address space. This can be
1315 used on platforms which have strong conventions about which
1316 portions of the flash are used for what purposes, but which don't
1317 use an on-flash partition table such as RedBoot.
1318
1319 Each partition is represented as a sub-node of the flash device.
1320 Each node's name represents the name of the corresponding
1321 partition of the flash device.
1322
1323 Flash partitions
1324 - reg : The partition's offset and size within the flash bank.
1325 - label : (optional) The label / name for this flash partition.
1326 If omitted, the label is taken from the node name (excluding
1327 the unit address).
1328 - read-only : (optional) This parameter, if present, is a hint to
1329 Linux that this flash partition should only be mounted
1330 read-only. This is usually used for flash partitions
1331 containing early-boot firmware images or data which should not
1332 be clobbered.
1333
1334 Example:
1335
1336 flash@ff000000 {
1337 compatible = "amd,am29lv128ml", "cfi-flash";
1338 reg = <ff000000 01000000>;
1339 bank-width = <4>;
1340 device-width = <1>;
1341 #address-cells = <1>;
1342 #size-cells = <1>;
1343 fs@0 {
1344 label = "fs";
1345 reg = <0 f80000>;
1346 };
1347 firmware@f80000 {
1348 label ="firmware";
1349 reg = <f80000 80000>;
1350 read-only;
1351 };
1352 };
1353
1354 d) 4xx/Axon EMAC ethernet nodes
1355 1290
1356 The EMAC ethernet controller in IBM and AMCC 4xx chips, and also 1291 The EMAC ethernet controller in IBM and AMCC 4xx chips, and also
1357 the Axon bridge. To operate this needs to interact with a ths 1292 the Axon bridge. To operate this needs to interact with a ths
@@ -1499,7 +1434,7 @@ platforms are moved over to use the flattened-device-tree model.
1499 available. 1434 available.
1500 For Axon: 0x0000012a 1435 For Axon: 0x0000012a
1501 1436
1502 e) Xilinx IP cores 1437 d) Xilinx IP cores
1503 1438
1504 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use 1439 The Xilinx EDK toolchain ships with a set of IP cores (devices) for use
1505 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range 1440 in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range
@@ -1761,7 +1696,7 @@ platforms are moved over to use the flattened-device-tree model.
1761 listed above, nodes for these devices should include a phy-handle 1696 listed above, nodes for these devices should include a phy-handle
1762 property, and may include other common network device properties 1697 property, and may include other common network device properties
1763 like local-mac-address. 1698 like local-mac-address.
1764 1699
1765 iv) Xilinx Uartlite 1700 iv) Xilinx Uartlite
1766 1701
1767 Xilinx uartlite devices are simple fixed speed serial ports. 1702 Xilinx uartlite devices are simple fixed speed serial ports.
@@ -1793,7 +1728,7 @@ platforms are moved over to use the flattened-device-tree model.
1793 - reg-offset : A value of 3 is required 1728 - reg-offset : A value of 3 is required
1794 - reg-shift : A value of 2 is required 1729 - reg-shift : A value of 2 is required
1795 1730
1796 f) USB EHCI controllers 1731 e) USB EHCI controllers
1797 1732
1798 Required properties: 1733 Required properties:
1799 - compatible : should be "usb-ehci". 1734 - compatible : should be "usb-ehci".
@@ -1819,7 +1754,7 @@ platforms are moved over to use the flattened-device-tree model.
1819 big-endian; 1754 big-endian;
1820 }; 1755 };
1821 1756
1822 g) MDIO on GPIOs 1757 f) MDIO on GPIOs
1823 1758
1824 Currently defined compatibles: 1759 Currently defined compatibles:
1825 - virtual,gpio-mdio 1760 - virtual,gpio-mdio
@@ -1839,7 +1774,7 @@ platforms are moved over to use the flattened-device-tree model.
1839 &qe_pio_c 6>; 1774 &qe_pio_c 6>;
1840 }; 1775 };
1841 1776
1842 h) SPI (Serial Peripheral Interface) busses 1777 g) SPI (Serial Peripheral Interface) busses
1843 1778
1844 SPI busses can be described with a node for the SPI master device 1779 SPI busses can be described with a node for the SPI master device
1845 and a set of child nodes for each SPI slave on the bus. For this 1780 and a set of child nodes for each SPI slave on the bus. For this
diff --git a/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
index 84a04d5eb8e6..a48b2cadc7f0 100644
--- a/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt
@@ -5,9 +5,21 @@ Required properties:
5- reg : should specify localbus chip select and size used for the chip. 5- reg : should specify localbus chip select and size used for the chip.
6- fsl,upm-addr-offset : UPM pattern offset for the address latch. 6- fsl,upm-addr-offset : UPM pattern offset for the address latch.
7- fsl,upm-cmd-offset : UPM pattern offset for the command latch. 7- fsl,upm-cmd-offset : UPM pattern offset for the command latch.
8- gpios : may specify optional GPIO connected to the Ready-Not-Busy pin.
9 8
10Example: 9Optional properties:
10- fsl,upm-wait-flags : add chip-dependent short delays after running the
11 UPM pattern (0x1), after writing a data byte (0x2) or after
12 writing out a buffer (0x4).
13- fsl,upm-addr-line-cs-offsets : address offsets for multi-chip support.
14 The corresponding address lines are used to select the chip.
15- gpios : may specify optional GPIOs connected to the Ready-Not-Busy pins
16 (R/B#). For multi-chip devices, "n" GPIO definitions are required
17 according to the number of chips.
18- chip-delay : chip dependent delay for transfering data from array to
19 read registers (tR). Required if property "gpios" is not used
20 (R/B# pins not connected).
21
22Examples:
11 23
12upm@1,0 { 24upm@1,0 {
13 compatible = "fsl,upm-nand"; 25 compatible = "fsl,upm-nand";
@@ -26,3 +38,26 @@ upm@1,0 {
26 }; 38 };
27 }; 39 };
28}; 40};
41
42upm@3,0 {
43 #address-cells = <0>;
44 #size-cells = <0>;
45 compatible = "tqc,tqm8548-upm-nand", "fsl,upm-nand";
46 reg = <3 0x0 0x800>;
47 fsl,upm-addr-offset = <0x10>;
48 fsl,upm-cmd-offset = <0x08>;
49 /* Multi-chip NAND device */
50 fsl,upm-addr-line-cs-offsets = <0x0 0x200>;
51 fsl,upm-wait-flags = <0x5>;
52 chip-delay = <25>; // in micro-seconds
53
54 nand@0 {
55 #address-cells = <1>;
56 #size-cells = <1>;
57
58 partition@0 {
59 label = "fs";
60 reg = <0x00000000 0x10000000>;
61 };
62 };
63};
diff --git a/Documentation/powerpc/dts-bindings/gpio/led.txt b/Documentation/powerpc/dts-bindings/gpio/led.txt
index ff51f4c0fa9d..4fe14deedc0a 100644
--- a/Documentation/powerpc/dts-bindings/gpio/led.txt
+++ b/Documentation/powerpc/dts-bindings/gpio/led.txt
@@ -1,15 +1,43 @@
1LED connected to GPIO 1LEDs connected to GPIO lines
2 2
3Required properties: 3Required properties:
4- compatible : should be "gpio-led". 4- compatible : should be "gpio-leds".
5- label : (optional) the label for this LED. If omitted, the label is 5
6Each LED is represented as a sub-node of the gpio-leds device. Each
7node's name represents the name of the corresponding LED.
8
9LED sub-node properties:
10- gpios : Should specify the LED's GPIO, see "Specifying GPIO information
11 for devices" in Documentation/powerpc/booting-without-of.txt. Active
12 low LEDs should be indicated using flags in the GPIO specifier.
13- label : (optional) The label for this LED. If omitted, the label is
6 taken from the node name (excluding the unit address). 14 taken from the node name (excluding the unit address).
7- gpios : should specify LED GPIO. 15- linux,default-trigger : (optional) This parameter, if present, is a
16 string defining the trigger assigned to the LED. Current triggers are:
17 "backlight" - LED will act as a back-light, controlled by the framebuffer
18 system
19 "default-on" - LED will turn on
20 "heartbeat" - LED "double" flashes at a load average based rate
21 "ide-disk" - LED indicates disk activity
22 "timer" - LED flashes at a fixed, configurable rate
8 23
9Example: 24Examples:
10 25
11led@0 { 26leds {
12 compatible = "gpio-led"; 27 compatible = "gpio-leds";
13 label = "hdd"; 28 hdd {
14 gpios = <&mcu_pio 0 1>; 29 label = "IDE Activity";
30 gpios = <&mcu_pio 0 1>; /* Active low */
31 linux,default-trigger = "ide-disk";
32 };
15}; 33};
34
35run-control {
36 compatible = "gpio-leds";
37 red {
38 gpios = <&mpc8572 6 0>;
39 };
40 green {
41 gpios = <&mpc8572 7 0>;
42 };
43};
diff --git a/Documentation/powerpc/dts-bindings/mtd-physmap.txt b/Documentation/powerpc/dts-bindings/mtd-physmap.txt
new file mode 100644
index 000000000000..667c9bde8699
--- /dev/null
+++ b/Documentation/powerpc/dts-bindings/mtd-physmap.txt
@@ -0,0 +1,80 @@
1CFI or JEDEC memory-mapped NOR flash
2
3Flash chips (Memory Technology Devices) are often used for solid state
4file systems on embedded devices.
5
6 - compatible : should contain the specific model of flash chip(s)
7 used, if known, followed by either "cfi-flash" or "jedec-flash"
8 - reg : Address range(s) of the flash chip(s)
9 It's possible to (optionally) define multiple "reg" tuples so that
10 non-identical NOR chips can be described in one flash node.
11 - bank-width : Width (in bytes) of the flash bank. Equal to the
12 device width times the number of interleaved chips.
13 - device-width : (optional) Width of a single flash chip. If
14 omitted, assumed to be equal to 'bank-width'.
15 - #address-cells, #size-cells : Must be present if the flash has
16 sub-nodes representing partitions (see below). In this case
17 both #address-cells and #size-cells must be equal to 1.
18
19For JEDEC compatible devices, the following additional properties
20are defined:
21
22 - vendor-id : Contains the flash chip's vendor id (1 byte).
23 - device-id : Contains the flash chip's device id (1 byte).
24
25In addition to the information on the flash bank itself, the
26device tree may optionally contain additional information
27describing partitions of the flash address space. This can be
28used on platforms which have strong conventions about which
29portions of the flash are used for what purposes, but which don't
30use an on-flash partition table such as RedBoot.
31
32Each partition is represented as a sub-node of the flash device.
33Each node's name represents the name of the corresponding
34partition of the flash device.
35
36Flash partitions
37 - reg : The partition's offset and size within the flash bank.
38 - label : (optional) The label / name for this flash partition.
39 If omitted, the label is taken from the node name (excluding
40 the unit address).
41 - read-only : (optional) This parameter, if present, is a hint to
42 Linux that this flash partition should only be mounted
43 read-only. This is usually used for flash partitions
44 containing early-boot firmware images or data which should not
45 be clobbered.
46
47Example:
48
49 flash@ff000000 {
50 compatible = "amd,am29lv128ml", "cfi-flash";
51 reg = <ff000000 01000000>;
52 bank-width = <4>;
53 device-width = <1>;
54 #address-cells = <1>;
55 #size-cells = <1>;
56 fs@0 {
57 label = "fs";
58 reg = <0 f80000>;
59 };
60 firmware@f80000 {
61 label ="firmware";
62 reg = <f80000 80000>;
63 read-only;
64 };
65 };
66
67Here is an example with multiple "reg" tuples:
68
69 flash@f0000000,0 {
70 #address-cells = <1>;
71 #size-cells = <1>;
72 compatible = "intel,PC48F4400P0VB", "cfi-flash";
73 reg = <0 0x00000000 0x02000000
74 0 0x02000000 0x02000000>;
75 bank-width = <2>;
76 partition@0 {
77 label = "test-part1";
78 reg = <0 0x04000000>;
79 };
80 };
diff --git a/Documentation/scsi/aacraid.txt b/Documentation/scsi/aacraid.txt
index ddace3afc83b..30f643f611b2 100644
--- a/Documentation/scsi/aacraid.txt
+++ b/Documentation/scsi/aacraid.txt
@@ -60,17 +60,9 @@ Supported Cards/Chipsets
60 9005:0285:9005:02d5 Adaptec ASR-2405 (Voodoo40 Lite) 60 9005:0285:9005:02d5 Adaptec ASR-2405 (Voodoo40 Lite)
61 9005:0285:9005:02d6 Adaptec ASR-2445 (Voodoo44 Lite) 61 9005:0285:9005:02d6 Adaptec ASR-2445 (Voodoo44 Lite)
62 9005:0285:9005:02d7 Adaptec ASR-2805 (Voodoo80 Lite) 62 9005:0285:9005:02d7 Adaptec ASR-2805 (Voodoo80 Lite)
63 9005:0285:9005:02d8 Adaptec 5405G (Voodoo40 PM) 63 9005:0285:9005:02d8 Adaptec 5405Z (Voodoo40 BLBU)
64 9005:0285:9005:02d9 Adaptec 5445G (Voodoo44 PM) 64 9005:0285:9005:02d9 Adaptec 5445Z (Voodoo44 BLBU)
65 9005:0285:9005:02da Adaptec 5805G (Voodoo80 PM) 65 9005:0285:9005:02da Adaptec 5805Z (Voodoo80 BLBU)
66 9005:0285:9005:02db Adaptec 5085G (Voodoo08 PM)
67 9005:0285:9005:02dc Adaptec 51245G (Voodoo124 PM)
68 9005:0285:9005:02dd Adaptec 51645G (Voodoo164 PM)
69 9005:0285:9005:02de Adaptec 52445G (Voodoo244 PM)
70 9005:0285:9005:02df Adaptec ASR-2045G (Voodoo04 Lite PM)
71 9005:0285:9005:02e0 Adaptec ASR-2405G (Voodoo40 Lite PM)
72 9005:0285:9005:02e1 Adaptec ASR-2445G (Voodoo44 Lite PM)
73 9005:0285:9005:02e2 Adaptec ASR-2805G (Voodoo80 Lite PM)
74 1011:0046:9005:0364 Adaptec 5400S (Mustang) 66 1011:0046:9005:0364 Adaptec 5400S (Mustang)
75 1011:0046:9005:0365 Adaptec 5400S (Mustang) 67 1011:0046:9005:0365 Adaptec 5400S (Mustang)
76 9005:0287:9005:0800 Adaptec Themisto (Jupiter) 68 9005:0287:9005:0800 Adaptec Themisto (Jupiter)
@@ -140,6 +132,7 @@ Deanna Bonds (non-DASD support, PAE fibs and 64 bit,
140 where fibs that go to the hardware are consistently called hw_fibs and 132 where fibs that go to the hardware are consistently called hw_fibs and
141 not just fibs like the name of the driver tracking structure) 133 not just fibs like the name of the driver tracking structure)
142Mark Salyzyn <Mark_Salyzyn@adaptec.com> Fixed panic issues and added some new product ids for upcoming hbas. Performance tuning, card failover and bug mitigations. 134Mark Salyzyn <Mark_Salyzyn@adaptec.com> Fixed panic issues and added some new product ids for upcoming hbas. Performance tuning, card failover and bug mitigations.
135Achim Leubner <Achim_Leubner@adaptec.com>
143 136
144Original Driver 137Original Driver
145------------------------- 138-------------------------
diff --git a/Documentation/sound/alsa/soc/jack.txt b/Documentation/sound/alsa/soc/jack.txt
new file mode 100644
index 000000000000..fcf82a417293
--- /dev/null
+++ b/Documentation/sound/alsa/soc/jack.txt
@@ -0,0 +1,71 @@
1ASoC jack detection
2===================
3
4ALSA has a standard API for representing physical jacks to user space,
5the kernel side of which can be seen in include/sound/jack.h. ASoC
6provides a version of this API adding two additional features:
7
8 - It allows more than one jack detection method to work together on one
9 user visible jack. In embedded systems it is common for multiple
10 detection methods to be present on a single jack but handled by separate bits of
11 hardware.
12
13 - Integration with DAPM, allowing DAPM endpoints to be updated
14 automatically based on the detected jack status (eg, turning off the
15 headphone outputs if no headphones are present).
16
17This is done by splitting the jacks up into three things working
18together: the jack itself represented by a struct snd_soc_jack, sets of
19snd_soc_jack_pins representing DAPM endpoints to update and blocks of
20code providing jack reporting mechanisms.
21
22For example, a system may have a stereo headset jack with two reporting
23mechanisms, one for the headphone and one for the microphone. Some
24systems won't be able to use their speaker output while a headphone is
25connected and so will want to make sure to update both speaker and
26headphone when the headphone jack status changes.
27
28The jack - struct snd_soc_jack
29==============================
30
31This represents a physical jack on the system and is what is visible to
32user space. The jack itself is completely passive, it is set up by the
33machine driver and updated by jack detection methods.
34
35Jacks are created by the machine driver calling snd_soc_jack_new().
36
37snd_soc_jack_pin
38================
39
40These represent a DAPM pin to update depending on some of the status
41bits supported by the jack. Each snd_soc_jack has zero or more of these
42which are updated automatically. They are created by the machine driver
43and associated with the jack using snd_soc_jack_add_pins(). The status
44of the endpoint may be configured to be the opposite of the jack status if
45required (eg, enabling a built in microphone if a microphone is not
46connected via a jack).
47
48Jack detection methods
49======================
50
51Actual jack detection is done by code which is able to monitor some
52input to the system and update a jack by calling snd_soc_jack_report(),
53specifying a subset of bits to update. The jack detection code should
54be set up by the machine driver, taking configuration for the jack to
55update and the set of things to report when the jack is connected.
56
57Often this is done based on the status of a GPIO - a handler for this is
58provided by the snd_soc_jack_add_gpio() function. Other methods are
59also available, for example integrated into CODECs. One example of
60CODEC integrated jack detection can be seen in the WM8350 driver.
61
62Each jack may have multiple reporting mechanisms, though it will need at
63least one to be useful.
64
65Machine drivers
66===============
67
68These are all hooked together by the machine driver depending on the
69system hardware. The machine driver will set up the snd_soc_jack and
70the list of pins to update then set up one or more jack detection
71mechanisms to update that jack based on their current status.
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt
index 42f43fa59f24..34c76a55bc04 100644
--- a/Documentation/sparse.txt
+++ b/Documentation/sparse.txt
@@ -42,6 +42,14 @@ sure that bitwise types don't get mixed up (little-endian vs big-endian
42vs cpu-endian vs whatever), and there the constant "0" really _is_ 42vs cpu-endian vs whatever), and there the constant "0" really _is_
43special. 43special.
44 44
45__bitwise__ - to be used for relatively compact stuff (gfp_t, etc.) that
46is mostly warning-free and is supposed to stay that way. Warnings will
47be generated without __CHECK_ENDIAN__.
48
49__bitwise - noisy stuff; in particular, __le*/__be* are that. We really
50don't want to drown in noise unless we'd explicitly asked for it.
51
52
45Getting sparse 53Getting sparse
46~~~~~~~~~~~~~~ 54~~~~~~~~~~~~~~
47 55
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index a34d55b65441..df38ef046f8d 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -95,7 +95,7 @@ of struct cmsghdr structures with appended data.
95 95
96There is only one file in this directory. 96There is only one file in this directory.
97unix_dgram_qlen limits the max number of datagrams queued in Unix domain 97unix_dgram_qlen limits the max number of datagrams queued in Unix domain
98socket's buffer. It will not take effect unless PF_UNIX flag is spicified. 98socket's buffer. It will not take effect unless PF_UNIX flag is specified.
99 99
100 100
1013. /proc/sys/net/ipv4 - IPV4 settings 1013. /proc/sys/net/ipv4 - IPV4 settings
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 3197fc83bc51..97c4b3284329 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -39,6 +39,8 @@ Currently, these files are in /proc/sys/vm:
39- nr_hugepages 39- nr_hugepages
40- nr_overcommit_hugepages 40- nr_overcommit_hugepages
41- nr_pdflush_threads 41- nr_pdflush_threads
42- nr_pdflush_threads_min
43- nr_pdflush_threads_max
42- nr_trim_pages (only if CONFIG_MMU=n) 44- nr_trim_pages (only if CONFIG_MMU=n)
43- numa_zonelist_order 45- numa_zonelist_order
44- oom_dump_tasks 46- oom_dump_tasks
@@ -463,6 +465,32 @@ The default value is 0.
463 465
464============================================================== 466==============================================================
465 467
468nr_pdflush_threads_min
469
470This value controls the minimum number of pdflush threads.
471
472At boot time, the kernel will create and maintain 'nr_pdflush_threads_min'
473threads for the kernel's lifetime.
474
475The default value is 2. The minimum value you can specify is 1, and
476the maximum value is the current setting of 'nr_pdflush_threads_max'.
477
478See 'nr_pdflush_threads_max' below for more information.
479
480==============================================================
481
482nr_pdflush_threads_max
483
484This value controls the maximum number of pdflush threads that can be
485created. The pdflush algorithm will create a new pdflush thread (up to
486this maximum) if no pdflush threads have been available for >= 1 second.
487
488The default value is 8. The minimum value you can specify is the
489current value of 'nr_pdflush_threads_min' and the
490maximum is 1000.
491
492==============================================================
493
466overcommit_memory: 494overcommit_memory:
467 495
468This value contains a flag that enables memory overcommitment. 496This value contains a flag that enables memory overcommitment.
diff --git a/Documentation/tomoyo.txt b/Documentation/tomoyo.txt
new file mode 100644
index 000000000000..b3a232cae7f8
--- /dev/null
+++ b/Documentation/tomoyo.txt
@@ -0,0 +1,55 @@
1--- What is TOMOYO? ---
2
3TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel.
4
5LiveCD-based tutorials are available at
6http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/ubuntu8.04-live/
7http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/centos5-live/ .
8Though these tutorials use non-LSM version of TOMOYO, they are useful for you
9to know what TOMOYO is.
10
11--- How to enable TOMOYO? ---
12
13Build the kernel with CONFIG_SECURITY_TOMOYO=y and pass "security=tomoyo" on
14kernel's command line.
15
16Please see http://tomoyo.sourceforge.jp/en/2.2.x/ for details.
17
18--- Where is documentation? ---
19
20User <-> Kernel interface documentation is available at
21http://tomoyo.sourceforge.jp/en/2.2.x/policy-reference.html .
22
23Materials we prepared for seminars and symposiums are available at
24http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 .
25Below lists are chosen from three aspects.
26
27What is TOMOYO?
28 TOMOYO Linux Overview
29 http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf
30 TOMOYO Linux: pragmatic and manageable security for Linux
31 http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf
32 TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box
33 http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf
34
35What can TOMOYO do?
36 Deep inside TOMOYO Linux
37 http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf
38 The role of "pathname based access control" in security.
39 http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf
40
41History of TOMOYO?
42 Realities of Mainlining
43 http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf
44
45--- What is future plan? ---
46
47We believe that inode based security and name based security are complementary
48and both should be used together. But unfortunately, so far, we cannot enable
49multiple LSM modules at the same time. We feel sorry that you have to give up
50SELinux/SMACK/AppArmor etc. when you want to use TOMOYO.
51
52We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM
53version of TOMOYO, available at http://tomoyo.sourceforge.jp/en/1.6.x/ .
54LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning
55to port non-LSM version's functionalities to LSM versions.
diff --git a/Documentation/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e693813..fd9a3e693813 100644
--- a/Documentation/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
index a956d9b7f943..a956d9b7f943 100644
--- a/Documentation/vm/kmemtrace.txt
+++ b/Documentation/trace/kmemtrace.txt
diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/trace/mmiotrace.txt
index 5731c67abc55..5731c67abc55 100644
--- a/Documentation/tracers/mmiotrace.txt
+++ b/Documentation/trace/mmiotrace.txt
diff --git a/Documentation/tracepoints.txt b/Documentation/trace/tracepoints.txt
index c0e1ceed75a4..c0e1ceed75a4 100644
--- a/Documentation/tracepoints.txt
+++ b/Documentation/trace/tracepoints.txt
diff --git a/Documentation/video4linux/pxa_camera.txt b/Documentation/video4linux/pxa_camera.txt
new file mode 100644
index 000000000000..b1137f9a53eb
--- /dev/null
+++ b/Documentation/video4linux/pxa_camera.txt
@@ -0,0 +1,125 @@
1 PXA-Camera Host Driver
2 ======================
3
4Constraints
5-----------
6 a) Image size for YUV422P format
7 All YUV422P images are enforced to have width x height % 16 = 0.
8 This is due to DMA constraints, which transfers only planes of 8 byte
9 multiples.
10
11
12Global video workflow
13---------------------
14 a) QCI stopped
15 Initially, the QCI interface is stopped.
16 When a buffer is queued (pxa_videobuf_ops->buf_queue), the QCI starts.
17
18 b) QCI started
19 More buffers can be queued while the QCI is started without halting the
20 capture. The new buffers are "appended" at the tail of the DMA chain, and
21 smoothly captured one frame after the other.
22
23 Once a buffer is filled in the QCI interface, it is marked as "DONE" and
24 removed from the active buffers list. It can then be requeued or dequeued by
25 userland application.
26
27 Once the last buffer is filled in, the QCI interface stops.
28
29
30DMA usage
31---------
32 a) DMA flow
33 - first buffer queued for capture
34 Once a first buffer is queued for capture, the QCI is started, but data
35 transfer is not started. On "End Of Frame" interrupt, the irq handler
36 starts the DMA chain.
37 - capture of one videobuffer
38 The DMA chain starts transferring data into videobuffer RAM pages.
39 When all pages are transferred, the DMA irq is raised on "ENDINTR" status
40 - finishing one videobuffer
41 The DMA irq handler marks the videobuffer as "done", and removes it from
42 the active running queue
43 Meanwhile, the next videobuffer (if there is one), is transferred by DMA
44 - finishing the last videobuffer
45 On the DMA irq of the last videobuffer, the QCI is stopped.
46
47 b) DMA prepared buffer will have this structure
48
49 +------------+-----+---------------+-----------------+
50 | desc-sg[0] | ... | desc-sg[last] | finisher/linker |
51 +------------+-----+---------------+-----------------+
52
53 This structure is pointed to by dma->sg_cpu.
54 The descriptors are used as follows :
55 - desc-sg[i]: i-th descriptor, transferring the i-th sg
56 element to the video buffer scatter gather
57 - finisher: has ddadr=DDADR_STOP, dcmd=ENDIRQEN
58 - linker: has ddadr= desc-sg[0] of next video buffer, dcmd=0
59
60 For the next schema, let's assume d0=desc-sg[0] .. dN=desc-sg[N],
61 "f" stands for finisher and "l" for linker.
62 A typical running chain is :
63
64 Videobuffer 1 Videobuffer 2
65 +---------+----+---+ +----+----+----+---+
66 | d0 | .. | dN | l | | d0 | .. | dN | f |
67 +---------+----+-|-+ ^----+----+----+---+
68 | |
69 +----+
70
71 After the chaining is finished, the chain looks like :
72
73 Videobuffer 1 Videobuffer 2 Videobuffer 3
74 +---------+----+---+ +----+----+----+---+ +----+----+----+---+
75 | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f |
76 +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+
77 | | | |
78 +----+ +----+
79 new_link
80
81 c) DMA hot chaining timeslice issue
82
83 As DMA chaining is done while DMA _is_ running, the linking may be done
84 while the DMA jumps from one Videobuffer to another. On the schema, that
85 would be a problem if the following sequence is encountered :
86
87 - DMA chain is Videobuffer1 + Videobuffer2
88 - pxa_videobuf_queue() is called to queue Videobuffer3
89 - DMA controller finishes Videobuffer2, and DMA stops
90 =>
91 Videobuffer 1 Videobuffer 2
92 +---------+----+---+ +----+----+----+---+
93 | d0 | .. | dN | l | | d0 | .. | dN | f |
94 +---------+----+-|-+ ^----+----+----+-^-+
95 | | |
96 +----+ +-- DMA DDADR loads DDADR_STOP
97
98 - pxa_dma_add_tail_buf() is called, the Videobuffer2 "finisher" is
99 replaced by a "linker" to Videobuffer3 (creation of new_link)
100 - pxa_videobuf_queue() finishes
101 - the DMA irq handler is called, which terminates Videobuffer2
102 - Videobuffer3 capture is not scheduled on DMA chain (as it stopped !!!)
103
104 Videobuffer 1 Videobuffer 2 Videobuffer 3
105 +---------+----+---+ +----+----+----+---+ +----+----+----+---+
106 | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f |
107 +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+
108 | | | |
109 +----+ +----+
110 new_link
111 DMA DDADR still is DDADR_STOP
112
113 - pxa_camera_check_link_miss() is called
114 This checks if the DMA is finished and a buffer is still on the
115 pcdev->capture list. If that's the case, the capture will be restarted,
116 and Videobuffer3 is scheduled on DMA chain.
117 - the DMA irq handler finishes
118
119 Note: if DMA stops just after pxa_camera_check_link_miss() reads DDADR()
120 value, we have the guarantee that the DMA irq handler will be called back
121 when the DMA will finish the buffer, and pxa_camera_check_link_miss() will
122 be called again, to reschedule Videobuffer3.
123
124--
125Author: Robert Jarzmik <robert.jarzmik@free.fr>
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt
index a31177390e55..854808b67fae 100644
--- a/Documentation/video4linux/v4l2-framework.txt
+++ b/Documentation/video4linux/v4l2-framework.txt
@@ -90,7 +90,7 @@ up before calling v4l2_device_register then it will be untouched. If dev is
90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register. 90NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register.
91 91
92The first 'dev' argument is normally the struct device pointer of a pci_dev, 92The first 'dev' argument is normally the struct device pointer of a pci_dev,
93usb_device or platform_device. It is rare for dev to be NULL, but it happens 93usb_interface or platform_device. It is rare for dev to be NULL, but it happens
94with ISA devices or when one device creates multiple PCI devices, thus making 94with ISA devices or when one device creates multiple PCI devices, thus making
95it impossible to associate v4l2_dev with a particular parent. 95it impossible to associate v4l2_dev with a particular parent.
96 96
@@ -351,17 +351,6 @@ And this to go from an i2c_client to a v4l2_subdev struct:
351 351
352 struct v4l2_subdev *sd = i2c_get_clientdata(client); 352 struct v4l2_subdev *sd = i2c_get_clientdata(client);
353 353
354Finally you need to make a command function to make driver->command()
355call the right subdev_ops functions:
356
357static int subdev_command(struct i2c_client *client, unsigned cmd, void *arg)
358{
359 return v4l2_subdev_command(i2c_get_clientdata(client), cmd, arg);
360}
361
362If driver->command is never used then you can leave this out. Eventually the
363driver->command usage should be removed from v4l.
364
365Make sure to call v4l2_device_unregister_subdev(sd) when the remove() callback 354Make sure to call v4l2_device_unregister_subdev(sd) when the remove() callback
366is called. This will unregister the sub-device from the bridge driver. It is 355is called. This will unregister the sub-device from the bridge driver. It is
367safe to call this even if the sub-device was never registered. 356safe to call this even if the sub-device was never registered.
@@ -375,14 +364,12 @@ from the remove() callback ensures that this is always done correctly.
375 364
376The bridge driver also has some helper functions it can use: 365The bridge driver also has some helper functions it can use:
377 366
378struct v4l2_subdev *sd = v4l2_i2c_new_subdev(adapter, "module_foo", "chipid", 0x36); 367struct v4l2_subdev *sd = v4l2_i2c_new_subdev(v4l2_dev, adapter,
368 "module_foo", "chipid", 0x36);
379 369
380This loads the given module (can be NULL if no module needs to be loaded) and 370This loads the given module (can be NULL if no module needs to be loaded) and
381calls i2c_new_device() with the given i2c_adapter and chip/address arguments. 371calls i2c_new_device() with the given i2c_adapter and chip/address arguments.
382If all goes well, then it registers the subdev with the v4l2_device. It gets 372If all goes well, then it registers the subdev with the v4l2_device.
383the v4l2_device by calling i2c_get_adapdata(adapter), so you should make sure
384to call i2c_set_adapdata(adapter, v4l2_device) when you setup the i2c_adapter
385in your driver.
386 373
387You can also use v4l2_i2c_new_probed_subdev() which is very similar to 374You can also use v4l2_i2c_new_probed_subdev() which is very similar to
388v4l2_i2c_new_subdev(), except that it has an array of possible I2C addresses 375v4l2_i2c_new_subdev(), except that it has an array of possible I2C addresses
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 2131b00b63f6..2f77ced35df7 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -1,5 +1,7 @@
100-INDEX 100-INDEX
2 - this file. 2 - this file.
3active_mm.txt
4 - An explanation from Linus about tsk->active_mm vs tsk->mm.
3balance 5balance
4 - various information on memory balancing. 6 - various information on memory balancing.
5hugetlbpage.txt 7hugetlbpage.txt
diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt
new file mode 100644
index 000000000000..4ee1f643d897
--- /dev/null
+++ b/Documentation/vm/active_mm.txt
@@ -0,0 +1,83 @@
1List: linux-kernel
2Subject: Re: active_mm
3From: Linus Torvalds <torvalds () transmeta ! com>
4Date: 1999-07-30 21:36:24
5
6Cc'd to linux-kernel, because I don't write explanations all that often,
7and when I do I feel better about more people reading them.
8
9On Fri, 30 Jul 1999, David Mosberger wrote:
10>
11> Is there a brief description someplace on how "mm" vs. "active_mm" in
12> the task_struct are supposed to be used? (My apologies if this was
13> discussed on the mailing lists---I just returned from vacation and
14> wasn't able to follow linux-kernel for a while).
15
16Basically, the new setup is:
17
18 - we have "real address spaces" and "anonymous address spaces". The
19 difference is that an anonymous address space doesn't care about the
20 user-level page tables at all, so when we do a context switch into an
21 anonymous address space we just leave the previous address space
22 active.
23
24 The obvious use for a "anonymous address space" is any thread that
25 doesn't need any user mappings - all kernel threads basically fall into
26 this category, but even "real" threads can temporarily say that for
27 some amount of time they are not going to be interested in user space,
28 and that the scheduler might as well try to avoid wasting time on
29 switching the VM state around. Currently only the old-style bdflush
30 sync does that.
31
32 - "tsk->mm" points to the "real address space". For an anonymous process,
33 tsk->mm will be NULL, for the logical reason that an anonymous process
34 really doesn't _have_ a real address space at all.
35
36 - however, we obviously need to keep track of which address space we
37 "stole" for such an anonymous user. For that, we have "tsk->active_mm",
38 which shows what the currently active address space is.
39
40 The rule is that for a process with a real address space (ie tsk->mm is
41 non-NULL) the active_mm obviously always has to be the same as the real
42 one.
43
44 For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the
45 "borrowed" mm while the anonymous process is running. When the
46 anonymous process gets scheduled away, the borrowed address space is
47 returned and cleared.
48
49To support all that, the "struct mm_struct" now has two counters: a
50"mm_users" counter that is how many "real address space users" there are,
51and a "mm_count" counter that is the number of "lazy" users (ie anonymous
52users) plus one if there are any real users.
53
54Usually there is at least one real user, but it could be that the real
55user exited on another CPU while a lazy user was still active, so you do
56actually get cases where you have a address space that is _only_ used by
57lazy users. That is often a short-lived state, because once that thread
58gets scheduled away in favour of a real thread, the "zombie" mm gets
59released because "mm_users" becomes zero.
60
61Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
62more. "init_mm" should be considered just a "lazy context when no other
63context is available", and in fact it is mainly used just at bootup when
64no real VM has yet been created. So code that used to check
65
66 if (current->mm == &init_mm)
67
68should generally just do
69
70 if (!current->mm)
71
72instead (which makes more sense anyway - the test is basically one of "do
73we have a user context", and is generally done by the page fault handler
74and things like that).
75
76Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
77because it slightly changes the interfaces to accommodate the alpha (who
78would have thought it, but the alpha actually ends up having one of the
79ugliest context switch codes - unlike the other architectures where the MM
80and register state is separate, the alpha PALcode joins the two, and you
81need to switch both together).
82
83(From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 0706a7282a8c..2d70d0d95108 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -1,588 +1,691 @@
1 1 ==============================
2This document describes the Linux memory management "Unevictable LRU" 2 UNEVICTABLE LRU INFRASTRUCTURE
3infrastructure and the use of this infrastructure to manage several types 3 ==============================
4of "unevictable" pages. The document attempts to provide the overall 4
5rationale behind this mechanism and the rationale for some of the design 5========
6decisions that drove the implementation. The latter design rationale is 6CONTENTS
7discussed in the context of an implementation description. Admittedly, one 7========
8can obtain the implementation details--the "what does it do?"--by reading the 8
9code. One hopes that the descriptions below add value by provide the answer 9 (*) The Unevictable LRU
10to "why does it do that?". 10
11 11 - The unevictable page list.
12Unevictable LRU Infrastructure: 12 - Memory control group interaction.
13 13 - Marking address spaces unevictable.
14The Unevictable LRU adds an additional LRU list to track unevictable pages 14 - Detecting Unevictable Pages.
15and to hide these pages from vmscan. This mechanism is based on a patch by 15 - vmscan's handling of unevictable pages.
16Larry Woodman of Red Hat to address several scalability problems with page 16
17 (*) mlock()'d pages.
18
19 - History.
20 - Basic management.
21 - mlock()/mlockall() system call handling.
22 - Filtering special vmas.
23 - munlock()/munlockall() system call handling.
24 - Migrating mlocked pages.
25 - mmap(MAP_LOCKED) system call handling.
26 - munmap()/exit()/exec() system call handling.
27 - try_to_unmap().
28 - try_to_munlock() reverse map scan.
29 - Page reclaim in shrink_*_list().
30
31
32============
33INTRODUCTION
34============
35
36This document describes the Linux memory manager's "Unevictable LRU"
37infrastructure and the use of this to manage several types of "unevictable"
38pages.
39
40The document attempts to provide the overall rationale behind this mechanism
41and the rationale for some of the design decisions that drove the
42implementation. The latter design rationale is discussed in the context of an
43implementation description. Admittedly, one can obtain the implementation
44details - the "what does it do?" - by reading the code. One hopes that the
45descriptions below add value by providing the answer to "why does it do that?".
46
47
48===================
49THE UNEVICTABLE LRU
50===================
51
52The Unevictable LRU facility adds an additional LRU list to track unevictable
53pages and to hide these pages from vmscan. This mechanism is based on a patch
54by Larry Woodman of Red Hat to address several scalability problems with page
17reclaim in Linux. The problems have been observed at customer sites on large 55reclaim in Linux. The problems have been observed at customer sites on large
18memory x86_64 systems. For example, a non-numal x86_64 platform with 128GB 56memory x86_64 systems.
19of main memory will have over 32 million 4k pages in a single zone. When a 57
20large fraction of these pages are not evictable for any reason [see below], 58To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
21vmscan will spend a lot of time scanning the LRU lists looking for the small 59main memory will have over 32 million 4k pages in a single zone. When a large
22fraction of pages that are evictable. This can result in a situation where 60fraction of these pages are not evictable for any reason [see below], vmscan
23all cpus are spending 100% of their time in vmscan for hours or days on end, 61will spend a lot of time scanning the LRU lists looking for the small fraction
24with the system completely unresponsive. 62of pages that are evictable. This can result in a situation where all CPUs are
25 63spending 100% of their time in vmscan for hours or days on end, with the system
26The Unevictable LRU infrastructure addresses the following classes of 64completely unresponsive.
27unevictable pages: 65
28 66The unevictable list addresses the following classes of unevictable pages:
29+ page owned by ramfs 67
30+ page mapped into SHM_LOCKed shared memory regions 68 (*) Those owned by ramfs.
31+ page mapped into VM_LOCKED [mlock()ed] vmas 69
32 70 (*) Those mapped into SHM_LOCK'd shared memory regions.
33The infrastructure might be able to handle other conditions that make pages 71
72 (*) Those mapped into VM_LOCKED [mlock()ed] VMAs.
73
74The infrastructure may also be able to handle other conditions that make pages
34unevictable, either by definition or by circumstance, in the future. 75unevictable, either by definition or by circumstance, in the future.
35 76
36 77
37The Unevictable LRU List 78THE UNEVICTABLE PAGE LIST
79-------------------------
38 80
39The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list 81The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list
40called the "unevictable" list and an associated page flag, PG_unevictable, to 82called the "unevictable" list and an associated page flag, PG_unevictable, to
41indicate that the page is being managed on the unevictable list. The 83indicate that the page is being managed on the unevictable list.
42PG_unevictable flag is analogous to, and mutually exclusive with, the PG_active 84
43flag in that it indicates on which LRU list a page resides when PG_lru is set. 85The PG_unevictable flag is analogous to, and mutually exclusive with, the
44The unevictable LRU list is source configurable based on the UNEVICTABLE_LRU 86PG_active flag in that it indicates on which LRU list a page resides when
45Kconfig option. 87PG_lru is set. The unevictable list is compile-time configurable based on the
88UNEVICTABLE_LRU Kconfig option.
46 89
47The Unevictable LRU infrastructure maintains unevictable pages on an additional 90The Unevictable LRU infrastructure maintains unevictable pages on an additional
48LRU list for a few reasons: 91LRU list for a few reasons:
49 92
501) We get to "treat unevictable pages just like we treat other pages in the 93 (1) We get to "treat unevictable pages just like we treat other pages in the
51 system, which means we get to use the same code to manipulate them, the 94 system - which means we get to use the same code to manipulate them, the
52 same code to isolate them (for migrate, etc.), the same code to keep track 95 same code to isolate them (for migrate, etc.), the same code to keep track
53 of the statistics, etc..." [Rik van Riel] 96 of the statistics, etc..." [Rik van Riel]
97
98 (2) We want to be able to migrate unevictable pages between nodes for memory
99 defragmentation, workload management and memory hotplug. The linux kernel
100 can only migrate pages that it can successfully isolate from the LRU
101 lists. If we were to maintain pages elsewhere than on an LRU-like list,
102 where they can be found by isolate_lru_page(), we would prevent their
103 migration, unless we reworked migration code to find the unevictable pages
104 itself.
54 105
552) We want to be able to migrate unevictable pages between nodes--for memory
56 defragmentation, workload management and memory hotplug. The linux kernel
57 can only migrate pages that it can successfully isolate from the lru lists.
58 If we were to maintain pages elsewise than on an lru-like list, where they
59 can be found by isolate_lru_page(), we would prevent their migration, unless
60 we reworked migration code to find the unevictable pages.
61 106
107The unevictable list does not differentiate between file-backed and anonymous,
108swap-backed pages. This differentiation is only important while the pages are,
109in fact, evictable.
62 110
63The unevictable LRU list does not differentiate between file backed and swap 111The unevictable list benefits from the "arrayification" of the per-zone LRU
64backed [anon] pages. This differentiation is only important while the pages 112lists and statistics originally proposed and posted by Christoph Lameter.
65are, in fact, evictable.
66 113
67The unevictable LRU list benefits from the "arrayification" of the per-zone 114The unevictable list does not use the LRU pagevec mechanism. Rather,
68LRU lists and statistics originally proposed and posted by Christoph Lameter. 115unevictable pages are placed directly on the page's zone's unevictable list
116under the zone lru_lock. This allows us to prevent the stranding of pages on
117the unevictable list when one task has the page isolated from the LRU and other
118tasks are changing the "evictability" state of the page.
69 119
70The unevictable list does not use the lru pagevec mechanism. Rather,
71unevictable pages are placed directly on the page's zone's unevictable
72list under the zone lru_lock. The reason for this is to prevent stranding
73of pages on the unevictable list when one task has the page isolated from the
74lru and other tasks are changing the "evictability" state of the page.
75 120
121MEMORY CONTROL GROUP INTERACTION
122--------------------------------
76 123
77Unevictable LRU and Memory Controller Interaction 124The unevictable LRU facility interacts with the memory control group [aka
125memory controller; see Documentation/cgroups/memory.txt] by extending the
126lru_list enum.
127
128The memory controller data structure automatically gets a per-zone unevictable
129list as a result of the "arrayification" of the per-zone LRU lists (one per
130lru_list enum element). The memory controller tracks the movement of pages to
131and from the unevictable list.
78 132
79The memory controller data structure automatically gets a per zone unevictable
80lru list as a result of the "arrayification" of the per-zone LRU lists. The
81memory controller tracks the movement of pages to and from the unevictable list.
82When a memory control group comes under memory pressure, the controller will 133When a memory control group comes under memory pressure, the controller will
83not attempt to reclaim pages on the unevictable list. This has a couple of 134not attempt to reclaim pages on the unevictable list. This has a couple of
84effects. Because the pages are "hidden" from reclaim on the unevictable list, 135effects:
85the reclaim process can be more efficient, dealing only with pages that have 136
86a chance of being reclaimed. On the other hand, if too many of the pages 137 (1) Because the pages are "hidden" from reclaim on the unevictable list, the
87charged to the control group are unevictable, the evictable portion of the 138 reclaim process can be more efficient, dealing only with pages that have a
88working set of the tasks in the control group may not fit into the available 139 chance of being reclaimed.
89memory. This can cause the control group to thrash or to oom-kill tasks. 140
90 141 (2) On the other hand, if too many of the pages charged to the control group
91 142 are unevictable, the evictable portion of the working set of the tasks in
92Unevictable LRU: Detecting Unevictable Pages 143 the control group may not fit into the available memory. This can cause
93 144 the control group to thrash or to OOM-kill tasks.
94The function page_evictable(page, vma) in vmscan.c determines whether a 145
95page is evictable or not. For ramfs pages and pages in SHM_LOCKed regions, 146
96page_evictable() tests a new address space flag, AS_UNEVICTABLE, in the page's 147MARKING ADDRESS SPACES UNEVICTABLE
97address space using a wrapper function. Wrapper functions are used to set, 148----------------------------------
98clear and test the flag to reduce the requirement for #ifdef's throughout the 149
99source code. AS_UNEVICTABLE is set on ramfs inode/mapping when it is created. 150For facilities such as ramfs none of the pages attached to the address space
100This flag remains for the life of the inode. 151may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE
101 152address space flag is provided, and this can be manipulated by a filesystem
102For shared memory regions, AS_UNEVICTABLE is set when an application 153using a number of wrapper functions:
103successfully SHM_LOCKs the region and is removed when the region is 154
104SHM_UNLOCKed. Note that shmctl(SHM_LOCK, ...) does not populate the page 155 (*) void mapping_set_unevictable(struct address_space *mapping);
105tables for the region as does, for example, mlock(). So, we make no special 156
106effort to push any pages in the SHM_LOCKed region to the unevictable list. 157 Mark the address space as being completely unevictable.
107Vmscan will do this when/if it encounters the pages during reclaim. On 158
108SHM_UNLOCK, shmctl() scans the pages in the region and "rescues" them from the 159 (*) void mapping_clear_unevictable(struct address_space *mapping);
109unevictable list if no other condition keeps them unevictable. If a SHM_LOCKed 160
110region is destroyed, the pages are also "rescued" from the unevictable list in 161 Mark the address space as being evictable.
111the process of freeing them. 162
112 163 (*) int mapping_unevictable(struct address_space *mapping);
113page_evictable() detects mlock()ed pages by testing an additional page flag, 164
114PG_mlocked via the PageMlocked() wrapper. If the page is NOT mlocked, and a 165 Query the address space, and return true if it is completely
115non-NULL vma is supplied, page_evictable() will check whether the vma is 166 unevictable.
167
168These are currently used in two places in the kernel:
169
170 (1) By ramfs to mark the address spaces of its inodes when they are created,
171 and this mark remains for the life of the inode.
172
173 (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
174
175 Note that SHM_LOCK is not required to page in the locked pages if they're
176 swapped out; the application must touch the pages manually if it wants to
177 ensure they're in memory.
178
179
180DETECTING UNEVICTABLE PAGES
181---------------------------
182
183The function page_evictable() in vmscan.c determines whether a page is
184evictable or not using the query function outlined above [see section "Marking
185address spaces unevictable"] to check the AS_UNEVICTABLE flag.
186
187For address spaces that are so marked after being populated (as SHM regions
188might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate
189the page tables for the region as does, for example, mlock(), nor need it make
190any special effort to push any pages in the SHM_LOCK'd area to the unevictable
191list. Instead, vmscan will do this if and when it encounters the pages during
192a reclamation scan.
193
194On an unlock action (such as SHM_UNLOCK), the unlocker (eg: shmctl()) must scan
195the pages in the region and "rescue" them from the unevictable list if no other
196condition is keeping them unevictable. If an unevictable region is destroyed,
197the pages are also "rescued" from the unevictable list in the process of
198freeing them.
199
200page_evictable() also checks for mlocked pages by testing an additional page
201flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked,
202and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is
116VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and 203VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and
117update the appropriate statistics if the vma is VM_LOCKED. This method allows 204update the appropriate statistics if the vma is VM_LOCKED. This method allows
118efficient "culling" of pages in the fault path that are being faulted in to 205efficient "culling" of pages in the fault path that are being faulted in to
119VM_LOCKED vmas. 206VM_LOCKED VMAs.
120 207
121 208
122Unevictable Pages and Vmscan [shrink_*_list()] 209VMSCAN'S HANDLING OF UNEVICTABLE PAGES
210--------------------------------------
123 211
124If unevictable pages are culled in the fault path, or moved to the unevictable 212If unevictable pages are culled in the fault path, or moved to the unevictable
125list at mlock() or mmap() time, vmscan will never encounter the pages until 213list at mlock() or mmap() time, vmscan will not encounter the pages until they
126they have become evictable again, for example, via munlock() and have been 214have become evictable again (via munlock() for example) and have been "rescued"
127"rescued" from the unevictable list. However, there may be situations where we 215from the unevictable list. However, there may be situations where we decide,
128decide, for the sake of expediency, to leave an unevictable page on one of the 216for the sake of expediency, to leave an unevictable page on one of the regular
129regular active/inactive LRU lists for vmscan to deal with. Vmscan checks for 217active/inactive LRU lists for vmscan to deal with. vmscan checks for such
130such pages in all of the shrink_{active|inactive|page}_list() functions and 218pages in all of the shrink_{active|inactive|page}_list() functions and will
131will "cull" such pages that it encounters--that is, it diverts those pages to 219"cull" such pages that it encounters: that is, it diverts those pages to the
132the unevictable list for the zone being scanned. 220unevictable list for the zone being scanned.
133 221
134There may be situations where a page is mapped into a VM_LOCKED vma, but the 222There may be situations where a page is mapped into a VM_LOCKED VMA, but the
135page is not marked as PageMlocked. Such pages will make it all the way to 223page is not marked as PG_mlocked. Such pages will make it all the way to
136shrink_page_list() where they will be detected when vmscan walks the reverse 224shrink_page_list() where they will be detected when vmscan walks the reverse
137map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, shrink_page_list() 225map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK,
138will cull the page at that point. 226shrink_page_list() will cull the page at that point.
139 227
140To "cull" an unevictable page, vmscan simply puts the page back on the lru 228To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
141list using putback_lru_page()--the inverse operation to isolate_lru_page()-- 229using putback_lru_page() - the inverse operation to isolate_lru_page() - after
142after dropping the page lock. Because the condition which makes the page 230dropping the page lock. Because the condition which makes the page unevictable
143unevictable may change once the page is unlocked, putback_lru_page() will 231may change once the page is unlocked, putback_lru_page() will recheck the
144recheck the unevictable state of a page that it places on the unevictable lru 232unevictable state of a page that it places on the unevictable list. If the
145list. If the page has become evictable, putback_lru_page() removes it from 233page has become evictable, putback_lru_page() removes it from the list and
146the list and retries, including the page_unevictable() test. Because such a 234retries, including the page_unevictable() test. Because such a race is a rare
147race is a rare event and movement of pages onto the unevictable list should be 235event and movement of pages onto the unevictable list should be rare, these
148rare, these extra evictability checks should not occur in the majority of calls 236extra evictability checks should not occur in the majority of calls to
149to putback_lru_page(). 237putback_lru_page().
150 238
151 239
152Mlocked Page: Prior Work 240=============
241MLOCKED PAGES
242=============
153 243
154The "Unevictable Mlocked Pages" infrastructure is based on work originally 244The unevictable page list is also useful for mlock(), in addition to ramfs and
245SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in
246NOMMU situations, all mappings are effectively mlocked.
247
248
249HISTORY
250-------
251
252The "Unevictable mlocked Pages" infrastructure is based on work originally
155posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". 253posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
156Nick posted his patch as an alternative to a patch posted by Christoph 254Nick posted his patch as an alternative to a patch posted by Christoph Lameter
157Lameter to achieve the same objective--hiding mlocked pages from vmscan. 255to achieve the same objective: hiding mlocked pages from vmscan.
158In Nick's patch, he used one of the struct page lru list link fields as a count 256
159of VM_LOCKED vmas that map the page. This use of the link field for a count 257In Nick's patch, he used one of the struct page LRU list link fields as a count
160prevented the management of the pages on an LRU list. Thus, mlocked pages were 258of VM_LOCKED VMAs that map the page. This use of the link field for a count
161not migratable as isolate_lru_page() could not find them and the lru list link 259prevented the management of the pages on an LRU list, and thus mlocked pages
162field was not available to the migration subsystem. Nick resolved this by 260were not migratable as isolate_lru_page() could not find them, and the LRU list
163putting mlocked pages back on the lru list before attempting to isolate them, 261link field was not available to the migration subsystem.
164thus abandoning the count of VM_LOCKED vmas. When Nick's patch was integrated 262
165with the Unevictable LRU work, the count was replaced by walking the reverse 263Nick resolved this by putting mlocked pages back on the lru list before
166map to determine whether any VM_LOCKED vmas mapped the page. More on this 264attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When
167below. 265Nick's patch was integrated with the Unevictable LRU work, the count was
168 266replaced by walking the reverse map to determine whether any VM_LOCKED VMAs
169 267mapped the page. More on this below.
170Mlocked Pages: Basic Management 268
171 269
172Mlocked pages--pages mapped into a VM_LOCKED vma--represent one class of 270BASIC MANAGEMENT
173unevictable pages. When such a page has been "noticed" by the memory 271----------------
174management subsystem, the page is marked with the PG_mlocked [PageMlocked()] 272
175flag. A PageMlocked() page will be placed on the unevictable LRU list when 273mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
176it is added to the LRU. Pages can be "noticed" by memory management in 274pages. When such a page has been "noticed" by the memory management subsystem,
177several places: 275the page is marked with the PG_mlocked flag. This can be manipulated using the
178 276PageMlocked() functions.
1791) in the mlock()/mlockall() system call handlers. 277
1802) in the mmap() system call handler when mmap()ing a region with the 278A PG_mlocked page will be placed on the unevictable list when it is added to
181 MAP_LOCKED flag, or mmap()ing a region in a task that has called 279the LRU. Such pages can be "noticed" by memory management in several places:
182 mlockall() with the MCL_FUTURE flag. Both of these conditions result 280
183 in the VM_LOCKED flag being set for the vma. 281 (1) in the mlock()/mlockall() system call handlers;
1843) in the fault path, if mlocked pages are "culled" in the fault path, 282
185 and when a VM_LOCKED stack segment is expanded. 283 (2) in the mmap() system call handler when mmapping a region with the
1864) as mentioned above, in vmscan:shrink_page_list() when attempting to 284 MAP_LOCKED flag;
187 reclaim a page in a VM_LOCKED vma via try_to_unmap(). 285
188 286 (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE
189Mlocked pages become unlocked and rescued from the unevictable list when: 287 flag
190 288
1911) mapped in a range unlocked via the munlock()/munlockall() system calls. 289 (4) in the fault path, if mlocked pages are "culled" in the fault path,
1922) munmapped() out of the last VM_LOCKED vma that maps the page, including 290 and when a VM_LOCKED stack segment is expanded; or
193 unmapping at task exit. 291
1943) when the page is truncated from the last VM_LOCKED vma of an mmap()ed file. 292 (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
1954) before a page is COWed in a VM_LOCKED vma. 293 reclaim a page in a VM_LOCKED VMA via try_to_unmap()
196 294
197 295all of which result in the VM_LOCKED flag being set for the VMA if it doesn't
198Mlocked Pages: mlock()/mlockall() System Call Handling 296already have it set.
297
298mlocked pages become unlocked and rescued from the unevictable list when:
299
300 (1) mapped in a range unlocked via the munlock()/munlockall() system calls;
301
302 (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including
303 unmapping at task exit;
304
305 (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file;
306 or
307
308 (4) before a page is COW'd in a VM_LOCKED VMA.
309
310
311mlock()/mlockall() SYSTEM CALL HANDLING
312---------------------------------------
199 313
200Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup() 314Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup()
201for each vma in the range specified by the call. In the case of mlockall(), 315for each VMA in the range specified by the call. In the case of mlockall(),
202this is the entire active address space of the task. Note that mlock_fixup() 316this is the entire active address space of the task. Note that mlock_fixup()
203is used for both mlock()ing and munlock()ing a range of memory. A call to 317is used for both mlocking and munlocking a range of memory. A call to mlock()
204mlock() an already VM_LOCKED vma, or to munlock() a vma that is not VM_LOCKED 318an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is
205is treated as a no-op--mlock_fixup() simply returns. 319treated as a no-op, and mlock_fixup() simply returns.
206 320
207If the vma passes some filtering described in "Mlocked Pages: Filtering Vmas" 321If the VMA passes some filtering as described in "Filtering Special Vmas"
208below, mlock_fixup() will attempt to merge the vma with its neighbors or split 322below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
209off a subset of the vma if the range does not cover the entire vma. Once the 323off a subset of the VMA if the range does not cover the entire VMA. Once the
210vma has been merged or split or neither, mlock_fixup() will call 324VMA has been merged or split or neither, mlock_fixup() will call
211__mlock_vma_pages_range() to fault in the pages via get_user_pages() and 325__mlock_vma_pages_range() to fault in the pages via get_user_pages() and to
212to mark the pages as mlocked via mlock_vma_page(). 326mark the pages as mlocked via mlock_vma_page().
213 327
214Note that the vma being mlocked might be mapped with PROT_NONE. In this case, 328Note that the VMA being mlocked might be mapped with PROT_NONE. In this case,
215get_user_pages() will be unable to fault in the pages. That's OK. If pages 329get_user_pages() will be unable to fault in the pages. That's okay. If pages
216do end up getting faulted into this VM_LOCKED vma, we'll handle them in the 330do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the
217fault path or in vmscan. 331fault path or in vmscan.
218 332
219Also note that a page returned by get_user_pages() could be truncated or 333Also note that a page returned by get_user_pages() could be truncated or
220migrated out from under us, while we're trying to mlock it. To detect 334migrated out from under us, while we're trying to mlock it. To detect this,
221this, __mlock_vma_pages_range() tests the page_mapping after acquiring 335__mlock_vma_pages_range() checks page_mapping() after acquiring the page lock.
222the page lock. If the page is still associated with its mapping, we'll 336If the page is still associated with its mapping, we'll go ahead and call
223go ahead and call mlock_vma_page(). If the mapping is gone, we just 337mlock_vma_page(). If the mapping is gone, we just unlock the page and move on.
224unlock the page and move on. Worse case, this results in page mapped 338In the worst case, this will result in a page mapped in a VM_LOCKED VMA
225in a VM_LOCKED vma remaining on a normal LRU list without being 339remaining on a normal LRU list without being PageMlocked(). Again, vmscan will
226PageMlocked(). Again, vmscan will detect and cull such pages. 340detect and cull such pages.
227 341
228mlock_vma_page(), called with the page locked [N.B., not "mlocked"], will 342mlock_vma_page() will call TestSetPageMlocked() for each page returned by
229TestSetPageMlocked() for each page returned by get_user_pages(). We use 343get_user_pages(). We use TestSetPageMlocked() because the page might already
230TestSetPageMlocked() because the page might already be mlocked by another 344be mlocked by another task/VMA and we don't want to do extra work. We
231task/vma and we don't want to do extra work. We especially do not want to 345especially do not want to count an mlocked page more than once in the
232count an mlocked page more than once in the statistics. If the page was 346statistics. If the page was already mlocked, mlock_vma_page() need do nothing
233already mlocked, mlock_vma_page() is done. 347more.
234 348
235If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the 349If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the
236page from the LRU, as it is likely on the appropriate active or inactive list 350page from the LRU, as it is likely on the appropriate active or inactive list
237at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will 351at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put
238putback the page--putback_lru_page()--which will notice that the page is now 352back the page - by calling putback_lru_page() - which will notice that the page
239mlocked and divert the page to the zone's unevictable LRU list. If 353is now mlocked and divert the page to the zone's unevictable list. If
240mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle 354mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle
241it later if/when it attempts to reclaim the page. 355it later if and when it attempts to reclaim the page.
242 356
243 357
244Mlocked Pages: Filtering Special Vmas 358FILTERING SPECIAL VMAS
359----------------------
245 360
246mlock_fixup() filters several classes of "special" vmas: 361mlock_fixup() filters several classes of "special" VMAs:
247 362
2481) vmas with VM_IO|VM_PFNMAP set are skipped entirely. The pages behind 3631) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind
249 these mappings are inherently pinned, so we don't need to mark them as 364 these mappings are inherently pinned, so we don't need to mark them as
250 mlocked. In any case, most of the pages have no struct page in which to 365 mlocked. In any case, most of the pages have no struct page in which to so
251 so mark the page. Because of this, get_user_pages() will fail for these 366 mark the page. Because of this, get_user_pages() will fail for these VMAs,
252 vmas, so there is no sense in attempting to visit them. 367 so there is no sense in attempting to visit them.
253 368
2542) vmas mapping hugetlbfs pages are already effectively pinned into memory. 3692) VMAs mapping hugetlbfs pages are already effectively pinned into memory. We
255 We don't need nor want to mlock() these pages. However, to preserve the 370 neither need nor want to mlock() these pages. However, to preserve the
256 prior behavior of mlock()--before the unevictable/mlock changes-- 371 prior behavior of mlock() - before the unevictable/mlock changes -
257 mlock_fixup() will call make_pages_present() in the hugetlbfs vma range 372 mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to
258 to allocate the huge pages and populate the ptes. 373 allocate the huge pages and populate the ptes.
259 374
2603) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of 3753) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of
261 kernel pages, such as the vdso page, relay channel pages, etc. These pages 376 kernel pages, such as the VDSO page, relay channel pages, etc. These pages
262 are inherently unevictable and are not managed on the LRU lists. 377 are inherently unevictable and are not managed on the LRU lists.
263 mlock_fixup() treats these vmas the same as hugetlbfs vmas. It calls 378 mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls
264 make_pages_present() to populate the ptes. 379 make_pages_present() to populate the ptes.
265 380
266Note that for all of these special vmas, mlock_fixup() does not set the 381Note that for all of these special VMAs, mlock_fixup() does not set the
267VM_LOCKED flag. Therefore, we won't have to deal with them later during 382VM_LOCKED flag. Therefore, we won't have to deal with them later during
268munlock() or munmap()--for example, at task exit. Neither does mlock_fixup() 383munlock(), munmap() or task exit. Neither does mlock_fixup() account these
269account these vmas against the task's "locked_vm". 384VMAs against the task's "locked_vm".
270 385
271Mlocked Pages: Downgrading the Mmap Semaphore. 386
272 387munlock()/munlockall() SYSTEM CALL HANDLING
273mlock_fixup() must be called with the mmap semaphore held for write, because 388-------------------------------------------
274it may have to merge or split vmas. However, mlocking a large region of 389
275memory can take a long time--especially if vmscan must reclaim pages to 390The munlock() and munlockall() system calls are handled by the same functions -
276satisfy the regions requirements. Faulting in a large region with the mmap 391do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs
277semaphore held for write can hold off other faults on the address space, in 392lock operation indicated by an argument. So, these system calls are also
278the case of a multi-threaded task. It can also hold off scans of the task's 393handled by mlock_fixup(). Again, if called for an already munlocked VMA,
279address space via /proc. While testing under heavy load, it was observed that 394mlock_fixup() simply returns. Because of the VMA filtering discussed above,
280the ps(1) command could be held off for many minutes while a large segment was 395VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be
281mlock()ed down.
282
283To address this issue, and to make the system more responsive during mlock()ing
284of large segments, mlock_fixup() downgrades the mmap semaphore to read mode
285during the call to __mlock_vma_pages_range(). This works fine. However, the
286callers of mlock_fixup() expect the semaphore to be returned in write mode.
287So, mlock_fixup() "upgrades" the semaphore to write mode. Linux does not
288support an atomic upgrade_sem() call, so mlock_fixup() must drop the semaphore
289and reacquire it in write mode. In a multi-threaded task, it is possible for
290the task memory map to change while the semaphore is dropped. Therefore,
291mlock_fixup() looks up the vma at the range start address after reacquiring
292the semaphore in write mode and verifies that it still covers the original
293range. If not, mlock_fixup() returns an error [-EAGAIN]. All callers of
294mlock_fixup() have been changed to deal with this new error condition.
295
296Note: when munlocking a region, all of the pages should already be resident--
297unless we have racing threads mlocking() and munlocking() regions. So,
298unlocking should not have to wait for page allocations nor faults of any kind.
299Therefore mlock_fixup() does not downgrade the semaphore for munlock().
300
301
302Mlocked Pages: munlock()/munlockall() System Call Handling
303
304The munlock() and munlockall() system calls are handled by the same functions--
305do_mlock[all]()--as the mlock() and mlockall() system calls with the unlock
306vs lock operation indicated by an argument. So, these system calls are also
307handled by mlock_fixup(). Again, if called for an already munlock()ed vma,
308mlock_fixup() simply returns. Because of the vma filtering discussed above,
309VM_LOCKED will not be set in any "special" vmas. So, these vmas will be
310ignored for munlock. 396ignored for munlock.
311 397
312If the vma is VM_LOCKED, mlock_fixup() again attempts to merge or split off 398If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
313the specified range. The range is then munlocked via the function 399specified range. The range is then munlocked via the function
314__mlock_vma_pages_range()--the same function used to mlock a vma range-- 400__mlock_vma_pages_range() - the same function used to mlock a VMA range -
315passing a flag to indicate that munlock() is being performed. 401passing a flag to indicate that munlock() is being performed.
316 402
317Because the vma access protections could have been changed to PROT_NONE after 403Because the VMA access protections could have been changed to PROT_NONE after
318faulting in and mlocking pages, get_user_pages() was unreliable for visiting 404faulting in and mlocking pages, get_user_pages() was unreliable for visiting
319these pages for munlocking. Because we don't want to leave pages mlocked(), 405these pages for munlocking. Because we don't want to leave pages mlocked,
320get_user_pages() was enhanced to accept a flag to ignore the permissions when 406get_user_pages() was enhanced to accept a flag to ignore the permissions when
321fetching the pages--all of which should be resident as a result of previous 407fetching the pages - all of which should be resident as a result of previous
322mlock()ing. 408mlocking.
323 409
324For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling 410For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling
325munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked 411munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked
326flag using TestClearPageMlocked(). As with mlock_vma_page(), munlock_vma_page() 412flag using TestClearPageMlocked(). As with mlock_vma_page(),
327uses the Test*PageMlocked() function to handle the case where the page might 413munlock_vma_page() uses the Test*PageMlocked() function to handle the case where
328have already been unlocked by another task. If the page was mlocked, 414the page might have already been unlocked by another task. If the page was
329munlock_vma_page() updates the zone statistics for the number of mlocked 415mlocked, munlock_vma_page() updates the zone statistics for the number of
330pages. Note, however, that at this point we haven't checked whether the page 416mlocked pages. Note, however, that at this point we haven't checked whether
331is mapped by other VM_LOCKED vmas. 417the page is mapped by other VM_LOCKED VMAs.
332 418
333We can't call try_to_munlock(), the function that walks the reverse map to check 419We can't call try_to_munlock(), the function that walks the reverse map to
334for other VM_LOCKED vmas, without first isolating the page from the LRU. 420check for other VM_LOCKED VMAs, without first isolating the page from the LRU.
335try_to_munlock() is a variant of try_to_unmap() and thus requires that the page 421try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
336not be on an lru list. [More on these below.] However, the call to 422not be on an LRU list [more on these below]. However, the call to
337isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). 423isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). So,
338So, we go ahead and clear PG_mlocked up front, as this might be the only chance 424we go ahead and clear PG_mlocked up front, as this might be the only chance we
339we have. If we can successfully isolate the page, we go ahead and 425have. If we can successfully isolate the page, we go ahead and
340try_to_munlock(), which will restore the PG_mlocked flag and update the zone 426try_to_munlock(), which will restore the PG_mlocked flag and update the zone
341page statistics if it finds another vma holding the page mlocked. If we fail 427page statistics if it finds another VMA holding the page mlocked. If we fail
342to isolate the page, we'll have left a potentially mlocked page on the LRU. 428to isolate the page, we'll have left a potentially mlocked page on the LRU.
343This is fine, because we'll catch it later when/if vmscan tries to reclaim the 429This is fine, because we'll catch it later if and when vmscan tries to reclaim
344page. This should be relatively rare. 430the page. This should be relatively rare.
345 431
346Mlocked Pages: Migrating Them... 432
347 433MIGRATING MLOCKED PAGES
348A page that is being migrated has been isolated from the lru lists and is 434-----------------------
349held locked across unmapping of the page, updating the page's mapping 435
350[address_space] entry and copying the contents and state, until the 436A page that is being migrated has been isolated from the LRU lists and is held
351page table entry has been replaced with an entry that refers to the new 437locked across unmapping of the page, updating the page's address space entry
352page. Linux supports migration of mlocked pages and other unevictable 438and copying the contents and state, until the page table entry has been
353pages. This involves simply moving the PageMlocked and PageUnevictable states 439replaced with an entry that refers to the new page. Linux supports migration
354from the old page to the new page. 440of mlocked pages and other unevictable pages. This involves simply moving the
355 441PG_mlocked and PG_unevictable states from the old page to the new page.
356Note that page migration can race with mlocking or munlocking of the same 442
357page. This has been discussed from the mlock/munlock perspective in the 443Note that page migration can race with mlocking or munlocking of the same page.
358respective sections above. Both processes [migration, m[un]locking], hold 444This has been discussed from the mlock/munlock perspective in the respective
359the page locked. This provides the first level of synchronization. Page 445sections above. Both processes (migration and m[un]locking) hold the page
360migration zeros out the page_mapping of the old page before unlocking it, 446locked. This provides the first level of synchronization. Page migration
361so m[un]lock can skip these pages by testing the page mapping under page 447zeros out the page_mapping of the old page before unlocking it, so m[un]lock
362lock. 448can skip these pages by testing the page mapping under page lock.
363 449
364When completing page migration, we place the new and old pages back onto the 450To complete page migration, we place the new and old pages back onto the LRU
365lru after dropping the page lock. The "unneeded" page--old page on success, 451after dropping the page lock. The "unneeded" page - old page on success, new
366new page on failure--will be freed when the reference count held by the 452page on failure - will be freed when the reference count held by the migration
367migration process is released. To ensure that we don't strand pages on the 453process is released. To ensure that we don't strand pages on the unevictable
368unevictable list because of a race between munlock and migration, page 454list because of a race between munlock and migration, page migration uses the
369migration uses the putback_lru_page() function to add migrated pages back to 455putback_lru_page() function to add migrated pages back to the LRU.
370the lru. 456
371 457
372 458mmap(MAP_LOCKED) SYSTEM CALL HANDLING
373Mlocked Pages: mmap(MAP_LOCKED) System Call Handling 459-------------------------------------
374 460
375In addition to the mlock()/mlockall() system calls, an application can request 461In addition to the mlock()/mlockall() system calls, an application can request
376that a region of memory be mlocked using the MAP_LOCKED flag with the mmap() 462that a region of memory be mlocked by supplying the MAP_LOCKED flag to the mmap()
377call. Furthermore, any mmap() call or brk() call that expands the heap by a 463call. Furthermore, any mmap() call or brk() call that expands the heap by a
378task that has previously called mlockall() with the MCL_FUTURE flag will result 464task that has previously called mlockall() with the MCL_FUTURE flag will result
379in the newly mapped memory being mlocked. Before the unevictable/mlock changes, 465in the newly mapped memory being mlocked. Before the unevictable/mlock
380the kernel simply called make_pages_present() to allocate pages and populate 466changes, the kernel simply called make_pages_present() to allocate pages and
381the page table. 467populate the page table.
382 468
383To mlock a range of memory under the unevictable/mlock infrastructure, the 469To mlock a range of memory under the unevictable/mlock infrastructure, the
384mmap() handler and task address space expansion functions call 470mmap() handler and task address space expansion functions call
385mlock_vma_pages_range() specifying the vma and the address range to mlock. 471mlock_vma_pages_range() specifying the VMA and the address range to mlock.
386mlock_vma_pages_range() filters vmas like mlock_fixup(), as described above in 472mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in
387"Mlocked Pages: Filtering Vmas". It will clear the VM_LOCKED flag, which will 473"Filtering Special VMAs". It will clear the VM_LOCKED flag, which will have
388have already been set by the caller, in filtered vmas. Thus these vmas need 474already been set by the caller, in filtered VMAs. Thus these VMAs need not be
389not be visited for munlock when the region is unmapped. 475visited for munlock when the region is unmapped.
390 476
391For "normal" vmas, mlock_vma_pages_range() calls __mlock_vma_pages_range() to 477For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to
392fault/allocate the pages and mlock them. Again, like mlock_fixup(), 478fault/allocate the pages and mlock them. Again, like mlock_fixup(),
393mlock_vma_pages_range() downgrades the mmap semaphore to read mode before 479mlock_vma_pages_range() downgrades the mmap semaphore to read mode before
394attempting to fault/allocate and mlock the pages; and "upgrades" the semaphore 480attempting to fault/allocate and mlock the pages and "upgrades" the semaphore
395back to write mode before returning. 481back to write mode before returning.
396 482
397The callers of mlock_vma_pages_range() will have already added the memory 483The callers of mlock_vma_pages_range() will have already added the memory range
398range to be mlocked to the task's "locked_vm". To account for filtered vmas, 484to be mlocked to the task's "locked_vm". To account for filtered VMAs,
399mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the 485mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the
400callers then subtract a non-negative return value from the task's locked_vm. 486callers then subtract a non-negative return value from the task's locked_vm. A
401A negative return value represents an error--for example, from get_user_pages() 487negative return value represents an error - for example, from get_user_pages()
402attempting to fault in a vma with PROT_NONE access. In this case, we leave 488attempting to fault in a VMA with PROT_NONE access. In this case, we leave the
403the memory range accounted as locked_vm, as the protections could be changed 489memory range accounted as locked_vm, as the protections could be changed later
404later and pages allocated into that region. 490and pages allocated into that region.
405 491
406 492
407Mlocked Pages: munmap()/exit()/exec() System Call Handling 493munmap()/exit()/exec() SYSTEM CALL HANDLING
494-------------------------------------------
408 495
409When unmapping an mlocked region of memory, whether by an explicit call to 496When unmapping an mlocked region of memory, whether by an explicit call to
410munmap() or via an internal unmap from exit() or exec() processing, we must 497munmap() or via an internal unmap from exit() or exec() processing, we must
411munlock the pages if we're removing the last VM_LOCKED vma that maps the pages. 498munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
412Before the unevictable/mlock changes, mlocking did not mark the pages in any 499Before the unevictable/mlock changes, mlocking did not mark the pages in any
413way, so unmapping them required no processing. 500way, so unmapping them required no processing.
414 501
415To munlock a range of memory under the unevictable/mlock infrastructure, the 502To munlock a range of memory under the unevictable/mlock infrastructure, the
416munmap() hander and task address space tear down function call 503munmap() handler and task address space tear down function call
417munlock_vma_pages_all(). The name reflects the observation that one always 504munlock_vma_pages_all(). The name reflects the observation that one always
418specifies the entire vma range when munlock()ing during unmap of a region. 505specifies the entire VMA range when munlock()ing during unmap of a region.
419Because of the vma filtering when mlocking() regions, only "normal" vmas that 506Because of the VMA filtering when mlocking() regions, only "normal" VMAs that
420actually contain mlocked pages will be passed to munlock_vma_pages_all(). 507actually contain mlocked pages will be passed to munlock_vma_pages_all().
421 508
422munlock_vma_pages_all() clears the VM_LOCKED vma flag and, like mlock_fixup() 509munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup()
423for the munlock case, calls __munlock_vma_pages_range() to walk the page table 510for the munlock case, calls __munlock_vma_pages_range() to walk the page table
424for the vma's memory range and munlock_vma_page() each resident page mapped by 511for the VMA's memory range and munlock_vma_page() each resident page mapped by
425the vma. This effectively munlocks the page, only if this is the last 512the VMA. This effectively munlocks the page, only if this is the last
426VM_LOCKED vma that maps the page. 513VM_LOCKED VMA that maps the page.
427
428 514
429Mlocked Page: try_to_unmap()
430 515
431[Note: the code changes represented by this section are really quite small 516try_to_unmap()
432compared to the text to describe what happening and why, and to discuss the 517--------------
433implications.]
434 518
435Pages can, of course, be mapped into multiple vmas. Some of these vmas may 519Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may
436have VM_LOCKED flag set. It is possible for a page mapped into one or more 520have VM_LOCKED flag set. It is possible for a page mapped into one or more
437VM_LOCKED vmas not to have the PG_mlocked flag set and therefore reside on one 521VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one
438of the active or inactive LRU lists. This could happen if, for example, a 522of the active or inactive LRU lists. This could happen if, for example, a task
439task in the process of munlock()ing the page could not isolate the page from 523in the process of munlocking the page could not isolate the page from the LRU.
440the LRU. As a result, vmscan/shrink_page_list() might encounter such a page 524As a result, vmscan/shrink_page_list() might encounter such a page as described
441as described in "Unevictable Pages and Vmscan [shrink_*_list()]". To 525in section "vmscan's handling of unevictable pages". To handle this situation,
442handle this situation, try_to_unmap() has been enhanced to check for VM_LOCKED 526try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse
443vmas while it is walking a page's reverse map. 527map.
444 528
445try_to_unmap() is always called, by either vmscan for reclaim or for page 529try_to_unmap() is always called, by either vmscan for reclaim or for page
446migration, with the argument page locked and isolated from the LRU. BUG_ON() 530migration, with the argument page locked and isolated from the LRU. Separate
447assertions enforce this requirement. Separate functions handle anonymous and 531functions handle anonymous and mapped file pages, as these types of pages have
448mapped file pages, as these types of pages have different reverse map 532different reverse map mechanisms.
449mechanisms. 533
450 534 (*) try_to_unmap_anon()
451 try_to_unmap_anon() 535
452 536 To unmap anonymous pages, each VMA in the list anchored in the anon_vma
453To unmap anonymous pages, each vma in the list anchored in the anon_vma must be 537 must be visited - at least until a VM_LOCKED VMA is encountered. If the
454visited--at least until a VM_LOCKED vma is encountered. If the page is being 538 page is being unmapped for migration, VM_LOCKED VMAs do not stop the
455unmapped for migration, VM_LOCKED vmas do not stop the process because mlocked 539 process because mlocked pages are migratable. However, for reclaim, if
456pages are migratable. However, for reclaim, if the page is mapped into a 540 the page is mapped into a VM_LOCKED VMA, the scan stops.
457VM_LOCKED vma, the scan stops. try_to_unmap() attempts to acquire the mmap 541
458semaphore of the mm_struct to which the vma belongs in read mode. If this is 542 try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of
459successful, try_to_unmap() will mlock the page via mlock_vma_page()--we 543 the mm_struct to which the VMA belongs. If this is successful, it will
460wouldn't have gotten to try_to_unmap() if the page were already mlocked--and 544 mlock the page via mlock_vma_page() - we wouldn't have gotten to
461will return SWAP_MLOCK, indicating that the page is unevictable. If the 545 try_to_unmap_anon() if the page were already mlocked - and will return
462mmap semaphore cannot be acquired, we are not sure whether the page is really 546 SWAP_MLOCK, indicating that the page is unevictable.
463unevictable or not. In this case, try_to_unmap() will return SWAP_AGAIN. 547
464 548 If the mmap semaphore cannot be acquired, we are not sure whether the page
465 try_to_unmap_file() -- linear mappings 549 is really unevictable or not. In this case, try_to_unmap_anon() will
466 550 return SWAP_AGAIN.
467Unmapping of a mapped file page works the same, except that the scan visits 551
468all vmas that maps the page's index/page offset in the page's mapping's 552 (*) try_to_unmap_file() - linear mappings
469reverse map priority search tree. It must also visit each vma in the page's 553
470mapping's non-linear list, if the list is non-empty. As for anonymous pages, 554 Unmapping of a mapped file page works the same as for anonymous mappings,
471on encountering a VM_LOCKED vma for a mapped file page, try_to_unmap() will 555 except that the scan visits all VMAs that map the page's index/page offset
472attempt to acquire the associated mm_struct's mmap semaphore to mlock the page, 556 in the page's mapping's reverse map priority search tree. It also visits
473returning SWAP_MLOCK if this is successful, and SWAP_AGAIN, if not. 557 each VMA in the page's mapping's non-linear list, if the list is
474 558 non-empty.
475 try_to_unmap_file() -- non-linear mappings 559
476 560 As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file
477If a page's mapping contains a non-empty non-linear mapping vma list, then 561 page, try_to_unmap_file() will attempt to acquire the associated
478try_to_un{map|lock}() must also visit each vma in that list to determine 562 mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this
479whether the page is mapped in a VM_LOCKED vma. Again, the scan must visit 563 is successful, and SWAP_AGAIN, if not.
480all vmas in the non-linear list to ensure that the page is not/should not be 564
481mlocked. If a VM_LOCKED vma is found in the list, the scan could terminate. 565 (*) try_to_unmap_file() - non-linear mappings
482However, there is no easy way to determine whether the page is actually mapped 566
483in a given vma--either for unmapping or testing whether the VM_LOCKED vma 567 If a page's mapping contains a non-empty non-linear mapping VMA list, then
484actually pins the page. 568 try_to_un{map|lock}() must also visit each VMA in that list to determine
485 569 whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit
486So, try_to_unmap_file() handles non-linear mappings by scanning a certain 570 all VMAs in the non-linear list to ensure that the page is not/should not
487number of pages--a "cluster"--in each non-linear vma associated with the page's 571 be mlocked.
488mapping, for each file mapped page that vmscan tries to unmap. If this happens 572
489to unmap the page we're trying to unmap, try_to_unmap() will notice this on 573 If a VM_LOCKED VMA is found in the list, the scan could terminate.
490return--(page_mapcount(page) == 0)--and return SWAP_SUCCESS. Otherwise, it 574 However, there is no easy way to determine whether the page is actually
491will return SWAP_AGAIN, causing vmscan to recirculate this page. We take 575 mapped in a given VMA - either for unmapping or testing whether the
492advantage of the cluster scan in try_to_unmap_cluster() as follows: 576 VM_LOCKED VMA actually pins the page.
493 577
494For each non-linear vma, try_to_unmap_cluster() attempts to acquire the mmap 578 try_to_unmap_file() handles non-linear mappings by scanning a certain
495semaphore of the associated mm_struct for read without blocking. If this 579 number of pages - a "cluster" - in each non-linear VMA associated with the
496attempt is successful and the vma is VM_LOCKED, try_to_unmap_cluster() will 580 page's mapping, for each file mapped page that vmscan tries to unmap. If
497retain the mmap semaphore for the scan; otherwise it drops it here. Then, 581 this happens to unmap the page we're trying to unmap, try_to_unmap() will
498for each page in the cluster, if we're holding the mmap semaphore for a locked 582 notice this on return (page_mapcount(page) will be 0) and return
499vma, try_to_unmap_cluster() calls mlock_vma_page() to mlock the page. This 583 SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to
500call is a no-op if the page is already locked, but will mlock any pages in 584 recirculate this page. We take advantage of the cluster scan in
501the non-linear mapping that happen to be unlocked. If one of the pages so 585 try_to_unmap_cluster() as follows:
502mlocked is the page passed in to try_to_unmap(), try_to_unmap_cluster() will 586
503return SWAP_MLOCK, rather than the default SWAP_AGAIN. This will allow vmscan 587 For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the
504to cull the page, rather than recirculating it on the inactive list. Again, 588 mmap semaphore of the associated mm_struct for read without blocking.
505if try_to_unmap_cluster() cannot acquire the vma's mmap sem, it returns 589
506SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED vma, but 590 If this attempt is successful and the VMA is VM_LOCKED,
507couldn't be mlocked. 591 try_to_unmap_cluster() will retain the mmap semaphore for the scan;
508 592 otherwise it drops it here.
509 593
510Mlocked pages: try_to_munlock() Reverse Map Scan 594 Then, for each page in the cluster, if we're holding the mmap semaphore
511 595 for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to
512TODO/FIXME: a better name might be page_mlocked()--analogous to the 596 mlock the page. This call is a no-op if the page is already locked,
513page_referenced() reverse map walker. 597 but will mlock any pages in the non-linear mapping that happen to be
514 598 unlocked.
515When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() 599
516System Call Handling" above--tries to munlock a page, it needs to 600 If one of the pages so mlocked is the page passed in to try_to_unmap(),
517determine whether or not the page is mapped by any VM_LOCKED vma, without 601 try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default
518actually attempting to unmap all ptes from the page. For this purpose, the 602 SWAP_AGAIN. This will allow vmscan to cull the page, rather than
519unevictable/mlock infrastructure introduced a variant of try_to_unmap() called 603 recirculating it on the inactive list.
520try_to_munlock(). 604
605 Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it
606 returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED
607 VMA, but couldn't be mlocked.
608
609
610try_to_munlock() REVERSE MAP SCAN
611---------------------------------
612
613 [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
614 page_referenced() reverse map walker.
615
616When munlock_vma_page() [see section "munlock()/munlockall() System Call
617Handling" above] tries to munlock a page, it needs to determine whether or not
618the page is mapped by any VM_LOCKED VMA without actually attempting to unmap
619all PTEs from the page. For this purpose, the unevictable/mlock infrastructure
620introduced a variant of try_to_unmap() called try_to_munlock().
521 621
522try_to_munlock() calls the same functions as try_to_unmap() for anonymous and 622try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
523mapped file pages with an additional argument specifying unlock versus unmap 623mapped file pages with an additional argument specifying unlock versus unmap
524processing. Again, these functions walk the respective reverse maps looking 624processing. Again, these functions walk the respective reverse maps looking
525for VM_LOCKED vmas. When such a vma is found for anonymous pages and file 625for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file
526pages mapped in linear VMAs, as in the try_to_unmap() case, the functions 626pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
527attempt to acquire the associated mmap semaphore, mlock the page via 627attempt to acquire the associated mmap semaphore, mlock the page via
528mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the 628mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the
529pre-clearing of the page's PG_mlocked done by munlock_vma_page. 629pre-clearing of the page's PG_mlocked done by munlock_vma_page.
530 630
531If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap 631If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap
532semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() 632semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to
533to recycle the page on the inactive list and hope that it has better luck 633recycle the page on the inactive list and hope that it has better luck with the
534with the page next time. 634page next time.
535 635
536For file pages mapped into non-linear vmas, the try_to_munlock() logic works 636For file pages mapped into non-linear VMAs, the try_to_munlock() logic works
537slightly differently. On encountering a VM_LOCKED non-linear vma that might 637slightly differently. On encountering a VM_LOCKED non-linear VMA that might
538map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking 638map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the
539the page. munlock_vma_page() will just leave the page unlocked and let 639page. munlock_vma_page() will just leave the page unlocked and let vmscan deal
540vmscan deal with it--the usual fallback position. 640with it - the usual fallback position.
541 641
542Note that try_to_munlock()'s reverse map walk must visit every vma in a pages' 642Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
543reverse map to determine that a page is NOT mapped into any VM_LOCKED vma. 643reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
544However, the scan can terminate when it encounters a VM_LOCKED vma and can 644However, the scan can terminate when it encounters a VM_LOCKED VMA and can
545successfully acquire the vma's mmap semaphore for read and mlock the page. 645successfully acquire the VMA's mmap semaphore for read and mlock the page.
546Although try_to_munlock() can be called many [very many!] times when 646Although try_to_munlock() might be called a great many times when munlocking a
547munlock()ing a large region or tearing down a large address space that has been 647large region or tearing down a large address space that has been mlocked via
548mlocked via mlockall(), overall this is a fairly rare event. 648mlockall(), overall this is a fairly rare event.
549 649
550Mlocked Page: Page Reclaim in shrink_*_list() 650
551 651PAGE RECLAIM IN shrink_*_list()
552shrink_active_list() culls any obviously unevictable pages--i.e., 652-------------------------------
553!page_evictable(page, NULL)--diverting these to the unevictable lru 653
554list. However, shrink_active_list() only sees unevictable pages that 654shrink_active_list() culls any obviously unevictable pages - i.e.
555made it onto the active/inactive lru lists. Note that these pages do not 655!page_evictable(page, NULL) - diverting these to the unevictable list.
556have PageUnevictable set--otherwise, they would be on the unevictable list and 656However, shrink_active_list() only sees unevictable pages that made it onto the
557shrink_active_list would never see them. 657active/inactive lru lists. Note that these pages do not have PageUnevictable
658set - otherwise they would be on the unevictable list and shrink_active_list
659would never see them.
558 660
559Some examples of these unevictable pages on the LRU lists are: 661Some examples of these unevictable pages on the LRU lists are:
560 662
5611) ramfs pages that have been placed on the lru lists when first allocated. 663 (1) ramfs pages that have been placed on the LRU lists when first allocated.
664
665 (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to
666 allocate or fault in the pages in the shared memory region. This happens
667 when an application accesses the page the first time after SHM_LOCK'ing
668 the segment.
562 669
5632) SHM_LOCKed shared memory pages. shmctl(SHM_LOCK) does not attempt to 670 (3) mlocked pages that could not be isolated from the LRU and moved to the
564 allocate or fault in the pages in the shared memory region. This happens 671 unevictable list in mlock_vma_page().
565 when an application accesses the page the first time after SHM_LOCKing
566 the segment.
567 672
5683) Mlocked pages that could not be isolated from the lru and moved to the 673 (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't
569 unevictable list in mlock_vma_page(). 674 acquire the VMA's mmap semaphore to test the flags and set PageMlocked.
675 munlock_vma_page() was forced to let the page back on to the normal LRU
676 list for vmscan to handle.
570 677
5713) Pages mapped into multiple VM_LOCKED vmas, but try_to_munlock() couldn't 678shrink_inactive_list() also diverts any unevictable pages that it finds on the
572 acquire the vma's mmap semaphore to test the flags and set PageMlocked. 679inactive lists to the appropriate zone's unevictable list.
573 munlock_vma_page() was forced to let the page back on to the normal
574 LRU list for vmscan to handle.
575 680
576shrink_inactive_list() also culls any unevictable pages that it finds on 681shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
577the inactive lists, again diverting them to the appropriate zone's unevictable 682after shrink_active_list() had moved them to the inactive list, or pages mapped
578lru list. shrink_inactive_list() should only see SHM_LOCKed pages that became 683into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
579SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or 684recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter,
580pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from 685but will pass them on to shrink_page_list().
581the lru to recheck via try_to_munlock(). shrink_inactive_list() won't notice
582the latter, but will pass them on to shrink_page_list().
583 686
584shrink_page_list() again culls obviously unevictable pages that it could 687shrink_page_list() again culls obviously unevictable pages that it could
585encounter for similar reasons to shrink_inactive_list(). Pages mapped into 688encounter for similar reasons to shrink_inactive_list(). Pages mapped into
586VM_LOCKED vmas but without PG_mlocked set will make it all the way to 689VM_LOCKED VMAs but without PG_mlocked set will make it all the way to
587try_to_unmap(). shrink_page_list() will divert them to the unevictable list 690try_to_unmap(). shrink_page_list() will divert them to the unevictable list
588when try_to_unmap() returns SWAP_MLOCK, as discussed above. 691when try_to_unmap() returns SWAP_MLOCK, as discussed above.