aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/testing/debugfs-pktcdvd6
-rw-r--r--Documentation/ABI/testing/sysfs-firmware-acpi8
-rw-r--r--Documentation/DocBook/Makefile11
-rw-r--r--Documentation/block/biodoc.txt19
-rw-r--r--Documentation/cgroups/cpuacct.txt18
-rw-r--r--Documentation/cgroups/memory.txt55
-rw-r--r--Documentation/cgroups/resource_counter.txt27
-rw-r--r--Documentation/driver-model/platform.txt59
-rw-r--r--Documentation/feature-removal-schedule.txt9
-rw-r--r--Documentation/filesystems/pohmelfs/design_notes.txt5
-rw-r--r--Documentation/filesystems/pohmelfs/info.txt21
-rw-r--r--Documentation/filesystems/vfs.txt3
-rw-r--r--Documentation/infiniband/ipoib.txt45
-rw-r--r--Documentation/input/rotary-encoder.txt101
-rw-r--r--Documentation/kbuild/makefiles.txt93
-rw-r--r--Documentation/kernel-parameters.txt78
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt4
-rw-r--r--Documentation/lguest/.gitignore1
-rw-r--r--Documentation/lguest/lguest.txt11
-rw-r--r--Documentation/networking/bonding.txt2
-rw-r--r--Documentation/powerpc/dts-bindings/fsl/i2c.txt46
-rw-r--r--Documentation/sound/alsa/HD-Audio.txt4
-rw-r--r--Documentation/sparse.txt8
-rw-r--r--Documentation/spi/spi-summary6
-rw-r--r--Documentation/sysctl/net.txt2
-rw-r--r--Documentation/tomoyo.txt55
-rw-r--r--Documentation/trace/ftrace.txt (renamed from Documentation/ftrace.txt)0
-rw-r--r--Documentation/trace/kmemtrace.txt (renamed from Documentation/vm/kmemtrace.txt)0
-rw-r--r--Documentation/trace/mmiotrace.txt (renamed from Documentation/tracers/mmiotrace.txt)0
-rw-r--r--Documentation/trace/tracepoints.txt (renamed from Documentation/tracepoints.txt)0
-rw-r--r--Documentation/vm/00-INDEX2
-rw-r--r--Documentation/vm/active_mm.txt83
-rw-r--r--Documentation/vm/unevictable-lru.txt1041
33 files changed, 1242 insertions, 581 deletions
diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd
index bf9c16b64c34..cf11736acb76 100644
--- a/Documentation/ABI/testing/debugfs-pktcdvd
+++ b/Documentation/ABI/testing/debugfs-pktcdvd
@@ -1,4 +1,4 @@
1What: /debug/pktcdvd/pktcdvd[0-7] 1What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7]
2Date: Oct. 2006 2Date: Oct. 2006
3KernelVersion: 2.6.20 3KernelVersion: 2.6.20
4Contact: Thomas Maier <balagi@justmail.de> 4Contact: Thomas Maier <balagi@justmail.de>
@@ -10,10 +10,10 @@ debugfs interface
10The pktcdvd module (packet writing driver) creates 10The pktcdvd module (packet writing driver) creates
11these files in debugfs: 11these files in debugfs:
12 12
13/debug/pktcdvd/pktcdvd[0-7]/ 13/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/
14 info (0444) Lots of driver statistics and infos. 14 info (0444) Lots of driver statistics and infos.
15 15
16Example: 16Example:
17------- 17-------
18 18
19cat /debug/pktcdvd/pktcdvd0/info 19cat /sys/kernel/debug/pktcdvd/pktcdvd0/info
diff --git a/Documentation/ABI/testing/sysfs-firmware-acpi b/Documentation/ABI/testing/sysfs-firmware-acpi
index e8ffc70ffe12..4f9ba3c2fca7 100644
--- a/Documentation/ABI/testing/sysfs-firmware-acpi
+++ b/Documentation/ABI/testing/sysfs-firmware-acpi
@@ -69,9 +69,13 @@ Description:
69 gpe1F: 0 invalid 69 gpe1F: 0 invalid
70 gpe_all: 1192 70 gpe_all: 1192
71 sci: 1194 71 sci: 1194
72 sci_not: 0
72 73
73 sci - The total number of times the ACPI SCI 74 sci - The number of times the ACPI SCI
74 has claimed an interrupt. 75 has been called and claimed an interrupt.
76
77 sci_not - The number of times the ACPI SCI
78 has been called and NOT claimed an interrupt.
75 79
76 gpe_all - count of SCI caused by GPEs. 80 gpe_all - count of SCI caused by GPEs.
77 81
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index a3a83d38f96f..8918a32c6b3a 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -31,7 +31,7 @@ PS_METHOD = $(prefer-db2x)
31 31
32### 32###
33# The targets that may be used. 33# The targets that may be used.
34PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs 34PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs cleandocs
35 35
36BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) 36BOOKS := $(addprefix $(obj)/,$(DOCBOOKS))
37xmldocs: $(BOOKS) 37xmldocs: $(BOOKS)
@@ -213,11 +213,12 @@ silent_gen_xml = :
213dochelp: 213dochelp:
214 @echo ' Linux kernel internal documentation in different formats:' 214 @echo ' Linux kernel internal documentation in different formats:'
215 @echo ' htmldocs - HTML' 215 @echo ' htmldocs - HTML'
216 @echo ' installmandocs - install man pages generated by mandocs'
217 @echo ' mandocs - man pages'
218 @echo ' pdfdocs - PDF' 216 @echo ' pdfdocs - PDF'
219 @echo ' psdocs - Postscript' 217 @echo ' psdocs - Postscript'
220 @echo ' xmldocs - XML DocBook' 218 @echo ' xmldocs - XML DocBook'
219 @echo ' mandocs - man pages'
220 @echo ' installmandocs - install man pages generated by mandocs'
221 @echo ' cleandocs - clean all generated DocBook files'
221 222
222### 223###
223# Temporary files left by various tools 224# Temporary files left by various tools
@@ -235,6 +236,10 @@ clean-files := $(DOCBOOKS) \
235 236
236clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man 237clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man
237 238
239cleandocs:
240 $(Q)rm -f $(call objectify, $(clean-files))
241 $(Q)rm -rf $(call objectify, $(clean-dirs))
242
238# Declare the contents of the .PHONY variable as phony. We keep that 243# Declare the contents of the .PHONY variable as phony. We keep that
239# information in a variable so we can use it in if_changed and friends. 244# information in a variable so we can use it in if_changed and friends.
240 245
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index ecad6ee75705..6fab97ea7e6b 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -1040,23 +1040,21 @@ Front merges are handled by the binary trees in AS and deadline schedulers.
1040iii. Plugging the queue to batch requests in anticipation of opportunities for 1040iii. Plugging the queue to batch requests in anticipation of opportunities for
1041 merge/sort optimizations 1041 merge/sort optimizations
1042 1042
1043This is just the same as in 2.4 so far, though per-device unplugging
1044support is anticipated for 2.5. Also with a priority-based i/o scheduler,
1045such decisions could be based on request priorities.
1046
1047Plugging is an approach that the current i/o scheduling algorithm resorts to so 1043Plugging is an approach that the current i/o scheduling algorithm resorts to so
1048that it collects up enough requests in the queue to be able to take 1044that it collects up enough requests in the queue to be able to take
1049advantage of the sorting/merging logic in the elevator. If the 1045advantage of the sorting/merging logic in the elevator. If the
1050queue is empty when a request comes in, then it plugs the request queue 1046queue is empty when a request comes in, then it plugs the request queue
1051(sort of like plugging the bottom of a vessel to get fluid to build up) 1047(sort of like plugging the bath tub of a vessel to get fluid to build up)
1052till it fills up with a few more requests, before starting to service 1048till it fills up with a few more requests, before starting to service
1053the requests. This provides an opportunity to merge/sort the requests before 1049the requests. This provides an opportunity to merge/sort the requests before
1054passing them down to the device. There are various conditions when the queue is 1050passing them down to the device. There are various conditions when the queue is
1055unplugged (to open up the flow again), either through a scheduled task or 1051unplugged (to open up the flow again), either through a scheduled task or
1056could be on demand. For example wait_on_buffer sets the unplugging going 1052could be on demand. For example wait_on_buffer sets the unplugging going
1057(by running tq_disk) so the read gets satisfied soon. So in the read case, 1053through sync_buffer() running blk_run_address_space(mapping). Or the caller
1058the queue gets explicitly unplugged as part of waiting for completion, 1054can do it explicitly through blk_unplug(bdev). So in the read case,
1059in fact all queues get unplugged as a side-effect. 1055the queue gets explicitly unplugged as part of waiting for completion on that
1056buffer. For page driven IO, the address space ->sync_page() takes care of
1057doing the blk_run_address_space().
1060 1058
1061Aside: 1059Aside:
1062 This is kind of controversial territory, as it's not clear if plugging is 1060 This is kind of controversial territory, as it's not clear if plugging is
@@ -1067,11 +1065,6 @@ Aside:
1067 multi-page bios being queued in one shot, we may not need to wait to merge 1065 multi-page bios being queued in one shot, we may not need to wait to merge
1068 a big request from the broken up pieces coming by. 1066 a big request from the broken up pieces coming by.
1069 1067
1070 Per-queue granularity unplugging (still a Todo) may help reduce some of the
1071 concerns with just a single tq_disk flush approach. Something like
1072 blk_kick_queue() to unplug a specific queue (right away ?)
1073 or optionally, all queues, is in the plan.
1074
10754.4 I/O contexts 10684.4 I/O contexts
1076I/O contexts provide a dynamically allocated per process data area. They may 1069I/O contexts provide a dynamically allocated per process data area. They may
1077be used in I/O schedulers, and in the block layer (could be used for IO statis, 1070be used in I/O schedulers, and in the block layer (could be used for IO statis,
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt
index bb775fbe43d7..8b930946c52a 100644
--- a/Documentation/cgroups/cpuacct.txt
+++ b/Documentation/cgroups/cpuacct.txt
@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell
30process (bash) into it. CPU time consumed by this bash and its children 30process (bash) into it. CPU time consumed by this bash and its children
31can be obtained from g1/cpuacct.usage and the same is accumulated in 31can be obtained from g1/cpuacct.usage and the same is accumulated in
32/cgroups/cpuacct.usage also. 32/cgroups/cpuacct.usage also.
33
34cpuacct.stat file lists a few statistics which further divide the
35CPU time obtained by the cgroup into user and system times. Currently
36the following statistics are supported:
37
38user: Time spent by tasks of the cgroup in user mode.
39system: Time spent by tasks of the cgroup in kernel mode.
40
41user and system are in USER_HZ unit.
42
43cpuacct controller uses percpu_counter interface to collect user and
44system times. This has two side effects:
45
46- It is theoretically possible to see wrong values for user and system times.
47 This is because percpu_counter_read() on 32bit systems isn't safe
48 against concurrent writes.
49- It is possible to see slightly outdated values for user and system times
50 due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index a98a7fe7aabb..1a608877b14e 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -6,15 +6,14 @@ used here with the memory controller that is used in hardware.
6 6
7Salient features 7Salient features
8 8
9a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages 9a. Enable control of Anonymous, Page Cache (mapped and unmapped) and
10 Swap Cache memory pages.
10b. The infrastructure allows easy addition of other types of memory to control 11b. The infrastructure allows easy addition of other types of memory to control
11c. Provides *zero overhead* for non memory controller users 12c. Provides *zero overhead* for non memory controller users
12d. Provides a double LRU: global memory pressure causes reclaim from the 13d. Provides a double LRU: global memory pressure causes reclaim from the
13 global LRU; a cgroup on hitting a limit, reclaims from the per 14 global LRU; a cgroup on hitting a limit, reclaims from the per
14 cgroup LRU 15 cgroup LRU
15 16
16NOTE: Swap Cache (unmapped) is not accounted now.
17
18Benefits and Purpose of the memory controller 17Benefits and Purpose of the memory controller
19 18
20The memory controller isolates the memory behaviour of a group of tasks 19The memory controller isolates the memory behaviour of a group of tasks
@@ -290,34 +289,44 @@ will be charged as a new owner of it.
290 moved to the parent. If you want to avoid that, force_empty will be useful. 289 moved to the parent. If you want to avoid that, force_empty will be useful.
291 290
2925.2 stat file 2915.2 stat file
293 memory.stat file includes following statistics (now) 292
294 cache - # of pages from page-cache and shmem. 293memory.stat file includes following statistics
295 rss - # of pages from anonymous memory. 294
296 pgpgin - # of event of charging 295cache - # of bytes of page cache memory.
297 pgpgout - # of event of uncharging 296rss - # of bytes of anonymous and swap cache memory.
298 active_anon - # of pages on active lru of anon, shmem. 297pgpgin - # of pages paged in (equivalent to # of charging events).
299 inactive_anon - # of pages on active lru of anon, shmem 298pgpgout - # of pages paged out (equivalent to # of uncharging events).
300 active_file - # of pages on active lru of file-cache 299active_anon - # of bytes of anonymous and swap cache memory on active
301 inactive_file - # of pages on inactive lru of file cache 300 lru list.
302 unevictable - # of pages cannot be reclaimed.(mlocked etc) 301inactive_anon - # of bytes of anonymous memory and swap cache memory on
303 302 inactive lru list.
304 Below is depend on CONFIG_DEBUG_VM. 303active_file - # of bytes of file-backed memory on active lru list.
305 inactive_ratio - VM internal parameter. (see mm/page_alloc.c) 304inactive_file - # of bytes of file-backed memory on inactive lru list.
306 recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) 305unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc).
307 recent_rotated_file - VM internal parameter. (see mm/vmscan.c) 306
308 recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) 307The following additional stats are dependent on CONFIG_DEBUG_VM.
309 recent_scanned_file - VM internal parameter. (see mm/vmscan.c) 308
310 309inactive_ratio - VM internal parameter. (see mm/page_alloc.c)
311 Memo: 310recent_rotated_anon - VM internal parameter. (see mm/vmscan.c)
311recent_rotated_file - VM internal parameter. (see mm/vmscan.c)
312recent_scanned_anon - VM internal parameter. (see mm/vmscan.c)
313recent_scanned_file - VM internal parameter. (see mm/vmscan.c)
314
315Memo:
312 recent_rotated means recent frequency of lru rotation. 316 recent_rotated means recent frequency of lru rotation.
313 recent_scanned means recent # of scans to lru. 317 recent_scanned means recent # of scans to lru.
314 showing for better debug please see the code for meanings. 318 showing for better debug please see the code for meanings.
315 319
320Note:
321 Only anonymous and swap cache memory is listed as part of 'rss' stat.
322 This should not be confused with the true 'resident set size' or the
323 amount of physical memory used by the cgroup. Per-cgroup rss
324 accounting is not done yet.
316 325
3175.3 swappiness 3265.3 swappiness
318 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. 327 Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
319 328
320 Following cgroup's swappiness can't be changed. 329 Following cgroups' swappiness can't be changed.
321 - root cgroup (uses /proc/sys/vm/swappiness). 330 - root cgroup (uses /proc/sys/vm/swappiness).
322 - a cgroup which uses hierarchy and it has child cgroup. 331 - a cgroup which uses hierarchy and it has child cgroup.
323 - a cgroup which uses hierarchy and not the root of hierarchy. 332 - a cgroup which uses hierarchy and not the root of hierarchy.
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index f196ac1d7d25..95b24d766eab 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -47,13 +47,18 @@ to work with it.
47 47
482. Basic accounting routines 482. Basic accounting routines
49 49
50 a. void res_counter_init(struct res_counter *rc) 50 a. void res_counter_init(struct res_counter *rc,
51 struct res_counter *rc_parent)
51 52
52 Initializes the resource counter. As usual, should be the first 53 Initializes the resource counter. As usual, should be the first
53 routine called for a new counter. 54 routine called for a new counter.
54 55
55 b. int res_counter_charge[_locked] 56 The struct res_counter *parent can be used to define a hierarchical
56 (struct res_counter *rc, unsigned long val) 57 child -> parent relationship directly in the res_counter structure,
58 NULL can be used to define no relationship.
59
60 c. int res_counter_charge(struct res_counter *rc, unsigned long val,
61 struct res_counter **limit_fail_at)
57 62
58 When a resource is about to be allocated it has to be accounted 63 When a resource is about to be allocated it has to be accounted
59 with the appropriate resource counter (controller should determine 64 with the appropriate resource counter (controller should determine
@@ -67,15 +72,25 @@ to work with it.
67 * if the charging is performed first, then it should be uncharged 72 * if the charging is performed first, then it should be uncharged
68 on error path (if the one is called). 73 on error path (if the one is called).
69 74
70 c. void res_counter_uncharge[_locked] 75 If the charging fails and a hierarchical dependency exists, the
76 limit_fail_at parameter is set to the particular res_counter element
77 where the charging failed.
78
79 d. int res_counter_charge_locked
80 (struct res_counter *rc, unsigned long val)
81
82 The same as res_counter_charge(), but it must not acquire/release the
83 res_counter->lock internally (it must be called with res_counter->lock
84 held).
85
86 e. void res_counter_uncharge[_locked]
71 (struct res_counter *rc, unsigned long val) 87 (struct res_counter *rc, unsigned long val)
72 88
73 When a resource is released (freed) it should be de-accounted 89 When a resource is released (freed) it should be de-accounted
74 from the resource counter it was accounted to. This is called 90 from the resource counter it was accounted to. This is called
75 "uncharging". 91 "uncharging".
76 92
77 The _locked routines imply that the res_counter->lock is taken. 93 The _locked routines imply that the res_counter->lock is taken.
78
79 94
80 2.1 Other accounting routines 95 2.1 Other accounting routines
81 96
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt
index 83009fdcbbc8..2e2c2ea90ceb 100644
--- a/Documentation/driver-model/platform.txt
+++ b/Documentation/driver-model/platform.txt
@@ -169,3 +169,62 @@ three different ways to find such a match:
169 be probed later if another device registers. (Which is OK, since 169 be probed later if another device registers. (Which is OK, since
170 this interface is only for use with non-hotpluggable devices.) 170 this interface is only for use with non-hotpluggable devices.)
171 171
172
173Early Platform Devices and Drivers
174~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
175The early platform interfaces provide platform data to platform device
176drivers early on during the system boot. The code is built on top of the
177early_param() command line parsing and can be executed very early on.
178
179Example: "earlyprintk" class early serial console in 6 steps
180
1811. Registering early platform device data
182~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
183The architecture code registers platform device data using the function
184early_platform_add_devices(). In the case of early serial console this
185should be hardware configuration for the serial port. Devices registered
186at this point will later on be matched against early platform drivers.
187
1882. Parsing kernel command line
189~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
190The architecture code calls parse_early_param() to parse the kernel
191command line. This will execute all matching early_param() callbacks.
192User specified early platform devices will be registered at this point.
193For the early serial console case the user can specify port on the
194kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
195the class string, "serial" is the name of the platform driver and
1960 is the platform device id. If the id is -1 then the dot and the
197id can be omitted.
198
1993. Installing early platform drivers belonging to a certain class
200~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201The architecture code may optionally force registration of all early
202platform drivers belonging to a certain class using the function
203early_platform_driver_register_all(). User specified devices from
204step 2 have priority over these. This step is omitted by the serial
205driver example since the early serial driver code should be disabled
206unless the user has specified port on the kernel command line.
207
2084. Early platform driver registration
209~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
210Compiled-in platform drivers making use of early_platform_init() are
211automatically registered during step 2 or 3. The serial driver example
212should use early_platform_init("earlyprintk", &platform_driver).
213
2145. Probing of early platform drivers belonging to a certain class
215~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
216The architecture code calls early_platform_driver_probe() to match
217registered early platform devices associated with a certain class with
218registered early platform drivers. Matched devices will get probed().
219This step can be executed at any point during the early boot. As soon
220as possible may be good for the serial port case.
221
2226. Inside the early platform driver probe()
223~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
224The driver code needs to take special care during early boot, especially
225when it comes to memory allocation and interrupt registration. The code
226in the probe() function can use is_early_platform_device() to check if
227it is called at early platform device or at the regular platform device
228time. The early serial driver performs register_console() at this point.
229
230For further information, see <linux/platform_device.h>.
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 7e2af10e8264..de491a3e2313 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -428,3 +428,12 @@ Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
428 After a reasonable transition period, we will remove the legacy 428 After a reasonable transition period, we will remove the legacy
429 fakephp interface. 429 fakephp interface.
430Who: Alex Chiang <achiang@hp.com> 430Who: Alex Chiang <achiang@hp.com>
431
432---------------------------
433
434What: i2c-voodoo3 driver
435When: October 2009
436Why: Superseded by tdfxfb. I2C/DDC support used to live in a separate
437 driver but this caused driver conflicts.
438Who: Jean Delvare <khali@linux-fr.org>
439 Krzysztof Helt <krzysztof.h1@wp.pl>
diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt
index 6d6db60d567d..dcf833587162 100644
--- a/Documentation/filesystems/pohmelfs/design_notes.txt
+++ b/Documentation/filesystems/pohmelfs/design_notes.txt
@@ -56,9 +56,10 @@ workloads and can fully utilize the bandwidth to the servers when doing bulk
56data transfers. 56data transfers.
57 57
58POHMELFS clients operate with a working set of servers and are capable of balancing read-only 58POHMELFS clients operate with a working set of servers and are capable of balancing read-only
59operations (like lookups or directory listings) between them. 59operations (like lookups or directory listings) between them according to IO priorities.
60Administrators can add or remove servers from the set at run-time via special commands (described 60Administrators can add or remove servers from the set at run-time via special commands (described
61in Documentation/pohmelfs/info.txt file). Writes are replicated to all servers. 61in Documentation/pohmelfs/info.txt file). Writes are replicated to all servers, which are connected
62with write permission turned on. IO priority and permissions can be changed in run-time.
62 63
63POHMELFS is capable of full data channel encryption and/or strong crypto hashing. 64POHMELFS is capable of full data channel encryption and/or strong crypto hashing.
64One can select any kernel supported cipher, encryption mode, hash type and operation mode 65One can select any kernel supported cipher, encryption mode, hash type and operation mode
diff --git a/Documentation/filesystems/pohmelfs/info.txt b/Documentation/filesystems/pohmelfs/info.txt
index 4e3d50157083..db2e41393626 100644
--- a/Documentation/filesystems/pohmelfs/info.txt
+++ b/Documentation/filesystems/pohmelfs/info.txt
@@ -1,6 +1,8 @@
1POHMELFS usage information. 1POHMELFS usage information.
2 2
3Mount options: 3Mount options.
4All but index, number of crypto threads and maximum IO size can be changed via remount.
5
4idx=%u 6idx=%u
5 Each mountpoint is associated with a special index via this option. 7 Each mountpoint is associated with a special index via this option.
6 Administrator can add or remove servers from the given index, so all mounts, 8 Administrator can add or remove servers from the given index, so all mounts,
@@ -52,16 +54,27 @@ mcache_timeout=%u
52 54
53Usage examples. 55Usage examples.
54 56
55Add (or remove if it already exists) server server1.net:1025 into the working set with index $idx 57Add server server1.net:1025 into the working set with index $idx
56with appropriate hash algorithm and key file and cipher algorithm, mode and key file: 58with appropriate hash algorithm and key file and cipher algorithm, mode and key file:
57$cfg -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key 59$cfg A add -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key
58 60
59Mount filesystem with given index $idx to /mnt mountpoint. 61Mount filesystem with given index $idx to /mnt mountpoint.
60Client will connect to all servers specified in the working set via previous command: 62Client will connect to all servers specified in the working set via previous command:
61mount -t pohmel -o idx=$idx q /mnt 63mount -t pohmel -o idx=$idx q /mnt
62 64
63One can add or remove servers from working set after mounting too. 65Change permissions to read-only (-I 1 option, '-I 2' - write-only, 3 - rw):
66$cfg A modify -a server1.net -p 1025 -i $idx -I 1
67
68Change IO priority to 123 (node with the highest priority gets read requests).
69$cfg A modify -a server1.net -p 1025 -i $idx -P 123
64 70
71One can check current status of all connections in the mountstats file:
72# cat /proc/$PID/mountstats
73...
74device none mounted on /mnt with fstype pohmel
75idx addr(:port) socket_type protocol active priority permissions
760 server1.net:1026 1 6 1 250 1
770 server2.net:1025 1 6 1 123 3
65 78
66Server installation. 79Server installation.
67 80
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index deeeed0faa8f..f49eecf2e573 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -277,8 +277,7 @@ or bottom half).
277 unfreeze_fs: called when VFS is unlocking a filesystem and making it writable 277 unfreeze_fs: called when VFS is unlocking a filesystem and making it writable
278 again. 278 again.
279 279
280 statfs: called when the VFS needs to get filesystem statistics. This 280 statfs: called when the VFS needs to get filesystem statistics.
281 is called with the kernel lock held
282 281
283 remount_fs: called when the filesystem is remounted. This is called 282 remount_fs: called when the filesystem is remounted. This is called
284 with the kernel lock held 283 with the kernel lock held
diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt
index 864ff3283780..6d40f00b358c 100644
--- a/Documentation/infiniband/ipoib.txt
+++ b/Documentation/infiniband/ipoib.txt
@@ -24,6 +24,49 @@ Partitions and P_Keys
24 The P_Key for any interface is given by the "pkey" file, and the 24 The P_Key for any interface is given by the "pkey" file, and the
25 main interface for a subinterface is in "parent." 25 main interface for a subinterface is in "parent."
26 26
27Datagram vs Connected modes
28
29 The IPoIB driver supports two modes of operation: datagram and
30 connected. The mode is set and read through an interface's
31 /sys/class/net/<intf name>/mode file.
32
33 In datagram mode, the IB UD (Unreliable Datagram) transport is used
34 and so the interface MTU is equal to the IB L2 MTU minus the
35 IPoIB encapsulation header (4 bytes). For example, in a typical IB
36 fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes.
37
38 In connected mode, the IB RC (Reliable Connected) transport is used.
39 Connected mode takes advantage of the connected nature of the
40 IB transport and allows an MTU up to the maximal IP packet size of
41 64K, which reduces the number of IP packets needed for handling
42 large UDP datagrams, TCP segments, etc and increases the performance
43 for large messages.
44
45 In connected mode, the interface's UD QP is still used for multicast
46 and communication with peers that don't support connected mode. In
47 this case, RX emulation of ICMP PMTU packets is used to cause the
48 networking stack to use the smaller UD MTU for these neighbours.
49
50Stateless offloads
51
52 If the IB HW supports IPoIB stateless offloads, IPoIB advertises
53 TCP/IP checksum and/or Large Send (LSO) offloading capability to the
54 network stack.
55
56 Large Receive (LRO) offloading is also implemented and may be turned
57 on/off using ethtool calls. Currently LRO is supported only for
58 checksum offload capable devices.
59
60 Stateless offloads are supported only in datagram mode.
61
62Interrupt moderation
63
64 If the underlying IB device supports CQ event moderation, one can
65 use ethtool to set interrupt mitigation parameters and thus reduce
66 the overhead incurred by handling interrupts. The main code path of
67 IPoIB doesn't use events for TX completion signaling so only RX
68 moderation is supported.
69
27Debugging Information 70Debugging Information
28 71
29 By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set 72 By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
@@ -55,3 +98,5 @@ References
55 http://ietf.org/rfc/rfc4391.txt 98 http://ietf.org/rfc/rfc4391.txt
56 IP over InfiniBand (IPoIB) Architecture (RFC 4392) 99 IP over InfiniBand (IPoIB) Architecture (RFC 4392)
57 http://ietf.org/rfc/rfc4392.txt 100 http://ietf.org/rfc/rfc4392.txt
101 IP over InfiniBand: Connected Mode (RFC 4755)
102 http://ietf.org/rfc/rfc4755.txt
diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt
new file mode 100644
index 000000000000..435102a26d96
--- /dev/null
+++ b/Documentation/input/rotary-encoder.txt
@@ -0,0 +1,101 @@
1rotary-encoder - a generic driver for GPIO connected devices
2Daniel Mack <daniel@caiaq.de>, Feb 2009
3
40. Function
5-----------
6
7Rotary encoders are devices which are connected to the CPU or other
8peripherals with two wires. The outputs are phase-shifted by 90 degrees
9and by triggering on falling and rising edges, the turn direction can
10be determined.
11
12The phase diagram of these two outputs looks like this:
13
14 _____ _____ _____
15 | | | | | |
16 Channel A ____| |_____| |_____| |____
17
18 : : : : : : : : : : : :
19 __ _____ _____ _____
20 | | | | | | |
21 Channel B |_____| |_____| |_____| |__
22
23 : : : : : : : : : : : :
24 Event a b c d a b c d a b c d
25
26 |<-------->|
27 one step
28
29
30For more information, please see
31 http://en.wikipedia.org/wiki/Rotary_encoder
32
33
341. Events / state machine
35-------------------------
36
37a) Rising edge on channel A, channel B in low state
38 This state is used to recognize a clockwise turn
39
40b) Rising edge on channel B, channel A in high state
41 When entering this state, the encoder is put into 'armed' state,
42 meaning that it has seen half the way of a one-step transition.
43
44c) Falling edge on channel A, channel B in high state
45 This state is used to recognize a counter-clockwise turn
46
47d) Falling edge on channel B, channel A in low state
48 Parking position. If the encoder enters this state, a full transition
49 should have happened, unless it flipped back on half the way. The
50 'armed' state tells us about that.
51
522. Platform requirements
53------------------------
54
55As there is no hardware dependent call in this driver, the platform it is
56used with must support gpiolib. Another requirement is that IRQs must be
57able to fire on both edges.
58
59
603. Board integration
61--------------------
62
63To use this driver in your system, register a platform_device with the
64name 'rotary-encoder' and associate the IRQs and some specific platform
65data with it.
66
67struct rotary_encoder_platform_data is declared in
68include/linux/rotary_encoder.h and needs to be filled with the number of
69steps the encoder has and can carry information about externally inverted
70signals (because of an inverting buffer or other reasons).
71
72Because GPIO to IRQ mapping is platform specific, this information must
73be given separately to the driver. See the example below.
74
75---------<snip>---------
76
77/* board support file example */
78
79#include <linux/input.h>
80#include <linux/rotary_encoder.h>
81
82#define GPIO_ROTARY_A 1
83#define GPIO_ROTARY_B 2
84
85static struct rotary_encoder_platform_data my_rotary_encoder_info = {
86 .steps = 24,
87 .axis = ABS_X,
88 .gpio_a = GPIO_ROTARY_A,
89 .gpio_b = GPIO_ROTARY_B,
90 .inverted_a = 0,
91 .inverted_b = 0,
92};
93
94static struct platform_device rotary_encoder_device = {
95 .name = "rotary-encoder",
96 .id = 0,
97 .dev = {
98 .platform_data = &my_rotary_encoder_info,
99 }
100};
101
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index 51104f9194a5..d76cfd8712e1 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -40,10 +40,16 @@ This document describes the Linux kernel Makefiles.
40 --- 6.7 Custom kbuild commands 40 --- 6.7 Custom kbuild commands
41 --- 6.8 Preprocessing linker scripts 41 --- 6.8 Preprocessing linker scripts
42 42
43 === 7 Kbuild Variables 43 === 7 Kbuild syntax for exported headers
44 === 8 Makefile language 44 --- 7.1 header-y
45 === 9 Credits 45 --- 7.2 objhdr-y
46 === 10 TODO 46 --- 7.3 destination-y
47 --- 7.4 unifdef-y (deprecated)
48
49 === 8 Kbuild Variables
50 === 9 Makefile language
51 === 10 Credits
52 === 11 TODO
47 53
48=== 1 Overview 54=== 1 Overview
49 55
@@ -310,6 +316,16 @@ more details, with real examples.
310 #arch/m68k/fpsp040/Makefile 316 #arch/m68k/fpsp040/Makefile
311 ldflags-y := -x 317 ldflags-y := -x
312 318
319 subdir-ccflags-y, subdir-asflags-y
320 The two flags listed above are similar to ccflags-y and asflags-y.
321 The difference is that the subdir- variants have effect for the kbuild
322 file where they are present and all subdirectories.
323 Options specified using subdir-* are added to the commandline before
324 the options specified using the non-subdir variants.
325
326 Example:
327 subdir-ccflags-y := -Werror
328
313 CFLAGS_$@, AFLAGS_$@ 329 CFLAGS_$@, AFLAGS_$@
314 330
315 CFLAGS_$@ and AFLAGS_$@ only apply to commands in current 331 CFLAGS_$@ and AFLAGS_$@ only apply to commands in current
@@ -1143,8 +1159,69 @@ When kbuild executes, the following steps are followed (roughly):
1143 The kbuild infrastructure for *lds file are used in several 1159 The kbuild infrastructure for *lds file are used in several
1144 architecture-specific files. 1160 architecture-specific files.
1145 1161
1162=== 7 Kbuild syntax for exported headers
1163
1164The kernel includes a set of headers that is exported to userspace.
1165Many headers can be exported as-is but other headers require a
1166minimal pre-processing before they are ready for user-space.
1167The pre-processing does:
1168- drop kernel specific annotations
1169- drop include of compiler.h
1170- drop all sections that are kernel internal (guarded by ifdef __KERNEL__)
1171
1172Each relevant directory contains a file named "Kbuild" which specifies the
1173headers to be exported.
1174See subsequent chapter for the syntax of the Kbuild file.
1175
1176 --- 7.1 header-y
1177
1178 header-y specifies header files to be exported.
1179
1180 Example:
1181 #include/linux/Kbuild
1182 header-y += usb/
1183 header-y += aio_abi.h
1184
1185 The convention is to list one file per line and
1186 preferably in alphabetic order.
1187
1188 header-y also specifies which subdirectories to visit.
1189 A subdirectory is identified by a trailing '/' which
1190 can be seen in the example above for the usb subdirectory.
1191
1192 Subdirectories are visited before their parent directories.
1193
1194 --- 7.2 objhdr-y
1195
1196 objhdr-y specifies generated files to be exported.
1197 Generated files are special as they need to be looked
1198 up in another directory when doing 'make O=...' builds.
1199
1200 Example:
1201 #include/linux/Kbuild
1202 objhdr-y += version.h
1203
1204 --- 7.3 destination-y
1205
1206 When an architecture has a set of exported headers that needs to be
1207 exported to a different directory, destination-y is used.
1208 destination-y specifies the destination directory for all exported
1209 headers in the file where it is present.
1210
1211 Example:
1212 #arch/xtensa/platforms/s6105/include/platform/Kbuild
1213 destination-y := include/linux
1214
1215 In the example above all exported headers in the Kbuild file
1216 will be located in the directory "include/linux" when exported.
1217
1218
1219 --- 7.4 unifdef-y (deprecated)
1220
1221 unifdef-y is deprecated. A direct replacement is header-y.
1222
1146 1223
1147=== 7 Kbuild Variables 1224=== 8 Kbuild Variables
1148 1225
1149The top Makefile exports the following variables: 1226The top Makefile exports the following variables:
1150 1227
@@ -1206,7 +1283,7 @@ The top Makefile exports the following variables:
1206 INSTALL_MOD_STRIP will used as the option(s) to the strip command. 1283 INSTALL_MOD_STRIP will used as the option(s) to the strip command.
1207 1284
1208 1285
1209=== 8 Makefile language 1286=== 9 Makefile language
1210 1287
1211The kernel Makefiles are designed to be run with GNU Make. The Makefiles 1288The kernel Makefiles are designed to be run with GNU Make. The Makefiles
1212use only the documented features of GNU Make, but they do use many 1289use only the documented features of GNU Make, but they do use many
@@ -1225,14 +1302,14 @@ time the left-hand side is used.
1225There are some cases where "=" is appropriate. Usually, though, ":=" 1302There are some cases where "=" is appropriate. Usually, though, ":="
1226is the right choice. 1303is the right choice.
1227 1304
1228=== 9 Credits 1305=== 10 Credits
1229 1306
1230Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net> 1307Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net>
1231Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de> 1308Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de>
1232Updates by Sam Ravnborg <sam@ravnborg.org> 1309Updates by Sam Ravnborg <sam@ravnborg.org>
1233Language QA by Jan Engelhardt <jengelh@gmx.de> 1310Language QA by Jan Engelhardt <jengelh@gmx.de>
1234 1311
1235=== 10 TODO 1312=== 11 TODO
1236 1313
1237- Describe how kbuild supports shipped files with _shipped. 1314- Describe how kbuild supports shipped files with _shipped.
1238- Generating offset header files. 1315- Generating offset header files.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6172e4360f60..600cdd72900c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -134,7 +134,7 @@ and is between 256 and 4096 characters. It is defined in the file
134./include/asm/setup.h as COMMAND_LINE_SIZE. 134./include/asm/setup.h as COMMAND_LINE_SIZE.
135 135
136 136
137 acpi= [HW,ACPI,X86-64,i386] 137 acpi= [HW,ACPI,X86]
138 Advanced Configuration and Power Interface 138 Advanced Configuration and Power Interface
139 Format: { force | off | ht | strict | noirq | rsdt } 139 Format: { force | off | ht | strict | noirq | rsdt }
140 force -- enable ACPI if default was off 140 force -- enable ACPI if default was off
@@ -218,7 +218,7 @@ and is between 256 and 4096 characters. It is defined in the file
218 acpi_osi="!string2" # remove built-in string2 218 acpi_osi="!string2" # remove built-in string2
219 acpi_osi= # disable all strings 219 acpi_osi= # disable all strings
220 220
221 acpi_pm_good [X86-32,X86-64] 221 acpi_pm_good [X86]
222 Override the pmtimer bug detection: force the kernel 222 Override the pmtimer bug detection: force the kernel
223 to assume that this machine's pmtimer latches its value 223 to assume that this machine's pmtimer latches its value
224 and always returns good values. 224 and always returns good values.
@@ -231,6 +231,35 @@ and is between 256 and 4096 characters. It is defined in the file
231 power state again in power transition. 231 power state again in power transition.
232 1 : disable the power state check 232 1 : disable the power state check
233 233
234 acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode
235 Format: { level | edge | high | low }
236
237 acpi_serialize [HW,ACPI] force serialization of AML methods
238
239 acpi_skip_timer_override [HW,ACPI]
240 Recognize and ignore IRQ0/pin2 Interrupt Override.
241 For broken nForce2 BIOS resulting in XT-PIC timer.
242
243 acpi_sleep= [HW,ACPI] Sleep options
244 Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
245 old_ordering, s4_nonvs }
246 See Documentation/power/video.txt for information on
247 s3_bios and s3_mode.
248 s3_beep is for debugging; it makes the PC's speaker beep
249 as soon as the kernel's real-mode entry point is called.
250 s4_nohwsig prevents ACPI hardware signature from being
251 used during resume from hibernation.
252 old_ordering causes the ACPI 1.0 ordering of the _PTS
253 control method, with respect to putting devices into
254 low power states, to be enforced (the ACPI 2.0 ordering
255 of _PTS is used by default).
256 s4_nonvs prevents the kernel from saving/restoring the
257 ACPI NVS memory during hibernation.
258
259 acpi_use_timer_override [HW,ACPI]
260 Use timer override. For some broken Nvidia NF5 boards
261 that require a timer override, but don't have HPET
262
234 acpi_enforce_resources= [ACPI] 263 acpi_enforce_resources= [ACPI]
235 { strict | lax | no } 264 { strict | lax | no }
236 Check for resource conflicts between native drivers 265 Check for resource conflicts between native drivers
@@ -250,6 +279,9 @@ and is between 256 and 4096 characters. It is defined in the file
250 ad1848= [HW,OSS] 279 ad1848= [HW,OSS]
251 Format: <io>,<irq>,<dma>,<dma2>,<type> 280 Format: <io>,<irq>,<dma>,<dma2>,<type>
252 281
282 add_efi_memmap [EFI; X86] Include EFI memory map in
283 kernel's map of available physical RAM.
284
253 advansys= [HW,SCSI] 285 advansys= [HW,SCSI]
254 See header of drivers/scsi/advansys.c. 286 See header of drivers/scsi/advansys.c.
255 287
@@ -459,7 +491,7 @@ and is between 256 and 4096 characters. It is defined in the file
459 Also note the kernel might malfunction if you disable 491 Also note the kernel might malfunction if you disable
460 some critical bits. 492 some critical bits.
461 493
462 code_bytes [IA32/X86_64] How many bytes of object code to print 494 code_bytes [X86] How many bytes of object code to print
463 in an oops report. 495 in an oops report.
464 Range: 0 - 8192 496 Range: 0 - 8192
465 Default: 64 497 Default: 64
@@ -592,7 +624,7 @@ and is between 256 and 4096 characters. It is defined in the file
592 MTRR settings. This parameter disables that behavior, 624 MTRR settings. This parameter disables that behavior,
593 possibly causing your machine to run very slowly. 625 possibly causing your machine to run very slowly.
594 626
595 disable_timer_pin_1 [i386,x86-64] 627 disable_timer_pin_1 [X86]
596 Disable PIN 1 of APIC timer 628 Disable PIN 1 of APIC timer
597 Can be useful to work around chipset bugs. 629 Can be useful to work around chipset bugs.
598 630
@@ -624,7 +656,7 @@ and is between 256 and 4096 characters. It is defined in the file
624 UART at the specified I/O port or MMIO address. 656 UART at the specified I/O port or MMIO address.
625 The options are the same as for ttyS, above. 657 The options are the same as for ttyS, above.
626 658
627 earlyprintk= [X86-32,X86-64,SH,BLACKFIN] 659 earlyprintk= [X86,SH,BLACKFIN]
628 earlyprintk=vga 660 earlyprintk=vga
629 earlyprintk=serial[,ttySn[,baudrate]] 661 earlyprintk=serial[,ttySn[,baudrate]]
630 earlyprintk=dbgp 662 earlyprintk=dbgp
@@ -659,7 +691,7 @@ and is between 256 and 4096 characters. It is defined in the file
659 See Documentation/block/as-iosched.txt and 691 See Documentation/block/as-iosched.txt and
660 Documentation/block/deadline-iosched.txt for details. 692 Documentation/block/deadline-iosched.txt for details.
661 693
662 elfcorehdr= [IA64,PPC,SH,X86-32,X86_64] 694 elfcorehdr= [IA64,PPC,SH,X86]
663 Specifies physical address of start of kernel core 695 Specifies physical address of start of kernel core
664 image elf header. Generally kexec loader will 696 image elf header. Generally kexec loader will
665 pass this option to capture kernel. 697 pass this option to capture kernel.
@@ -938,7 +970,7 @@ and is between 256 and 4096 characters. It is defined in the file
938 See comment before marvel_specify_io7 in 970 See comment before marvel_specify_io7 in
939 arch/alpha/kernel/core_marvel.c. 971 arch/alpha/kernel/core_marvel.c.
940 972
941 io_delay= [X86-32,X86-64] I/O delay method 973 io_delay= [X86] I/O delay method
942 0x80 974 0x80
943 Standard port 0x80 based delay 975 Standard port 0x80 based delay
944 0xed 976 0xed
@@ -1000,7 +1032,7 @@ and is between 256 and 4096 characters. It is defined in the file
1000 1032
1001 keepinitrd [HW,ARM] 1033 keepinitrd [HW,ARM]
1002 1034
1003 kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter 1035 kernelcore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
1004 specifies the amount of memory usable by the kernel 1036 specifies the amount of memory usable by the kernel
1005 for non-movable allocations. The requested amount is 1037 for non-movable allocations. The requested amount is
1006 spread evenly throughout all nodes in the system. The 1038 spread evenly throughout all nodes in the system. The
@@ -1034,7 +1066,7 @@ and is between 256 and 4096 characters. It is defined in the file
1034 Configure the RouterBoard 532 series on-chip 1066 Configure the RouterBoard 532 series on-chip
1035 Ethernet adapter MAC address. 1067 Ethernet adapter MAC address.
1036 1068
1037 kstack=N [X86-32,X86-64] Print N words from the kernel stack 1069 kstack=N [X86] Print N words from the kernel stack
1038 in oops dumps. 1070 in oops dumps.
1039 1071
1040 l2cr= [PPC] 1072 l2cr= [PPC]
@@ -1044,7 +1076,7 @@ and is between 256 and 4096 characters. It is defined in the file
1044 lapic [X86-32,APIC] Enable the local APIC even if BIOS 1076 lapic [X86-32,APIC] Enable the local APIC even if BIOS
1045 disabled it. 1077 disabled it.
1046 1078
1047 lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer 1079 lapic_timer_c2_ok [X86,APIC] trust the local apic timer
1048 in C2 power state. 1080 in C2 power state.
1049 1081
1050 libata.dma= [LIBATA] DMA control 1082 libata.dma= [LIBATA] DMA control
@@ -1229,7 +1261,7 @@ and is between 256 and 4096 characters. It is defined in the file
1229 [KNL,SH] Allow user to override the default size for 1261 [KNL,SH] Allow user to override the default size for
1230 per-device physically contiguous DMA buffers. 1262 per-device physically contiguous DMA buffers.
1231 1263
1232 memmap=exactmap [KNL,X86-32,X86_64] Enable setting of an exact 1264 memmap=exactmap [KNL,X86] Enable setting of an exact
1233 E820 memory map, as specified by the user. 1265 E820 memory map, as specified by the user.
1234 Such memmap=exactmap lines can be constructed based on 1266 Such memmap=exactmap lines can be constructed based on
1235 BIOS output or other requirements. See the memmap=nn@ss 1267 BIOS output or other requirements. See the memmap=nn@ss
@@ -1320,7 +1352,7 @@ and is between 256 and 4096 characters. It is defined in the file
1320 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices 1352 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices
1321 reporting absolute coordinates, such as tablets 1353 reporting absolute coordinates, such as tablets
1322 1354
1323 movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter 1355 movablecore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
1324 is similar to kernelcore except it specifies the 1356 is similar to kernelcore except it specifies the
1325 amount of memory used for migratable allocations. 1357 amount of memory used for migratable allocations.
1326 If both kernelcore and movablecore is specified, 1358 If both kernelcore and movablecore is specified,
@@ -1422,7 +1454,7 @@ and is between 256 and 4096 characters. It is defined in the file
1422 when a NMI is triggered. 1454 when a NMI is triggered.
1423 Format: [state][,regs][,debounce][,die] 1455 Format: [state][,regs][,debounce][,die]
1424 1456
1425 nmi_watchdog= [KNL,BUGS=X86-32,X86-64] Debugging features for SMP kernels 1457 nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels
1426 Format: [panic,][num] 1458 Format: [panic,][num]
1427 Valid num: 0,1,2 1459 Valid num: 0,1,2
1428 0 - turn nmi_watchdog off 1460 0 - turn nmi_watchdog off
@@ -1475,11 +1507,11 @@ and is between 256 and 4096 characters. It is defined in the file
1475 1507
1476 nodsp [SH] Disable hardware DSP at boot time. 1508 nodsp [SH] Disable hardware DSP at boot time.
1477 1509
1478 noefi [X86-32,X86-64] Disable EFI runtime services support. 1510 noefi [X86] Disable EFI runtime services support.
1479 1511
1480 noexec [IA-64] 1512 noexec [IA-64]
1481 1513
1482 noexec [X86-32,X86-64] 1514 noexec [X86]
1483 On X86-32 available only on PAE configured kernels. 1515 On X86-32 available only on PAE configured kernels.
1484 noexec=on: enable non-executable mappings (default) 1516 noexec=on: enable non-executable mappings (default)
1485 noexec=off: disable non-executable mappings 1517 noexec=off: disable non-executable mappings
@@ -1525,7 +1557,7 @@ and is between 256 and 4096 characters. It is defined in the file
1525 noirqdebug [X86-32] Disables the code which attempts to detect and 1557 noirqdebug [X86-32] Disables the code which attempts to detect and
1526 disable unhandled interrupt sources. 1558 disable unhandled interrupt sources.
1527 1559
1528 no_timer_check [X86-32,X86_64,APIC] Disables the code which tests for 1560 no_timer_check [X86,APIC] Disables the code which tests for
1529 broken timer IRQ sources. 1561 broken timer IRQ sources.
1530 1562
1531 noisapnp [ISAPNP] Disables ISA PnP code. 1563 noisapnp [ISAPNP] Disables ISA PnP code.
@@ -1689,7 +1721,7 @@ and is between 256 and 4096 characters. It is defined in the file
1689 disable the use of PCIE advanced error reporting. 1721 disable the use of PCIE advanced error reporting.
1690 nodomains [PCI] Disable support for multiple PCI 1722 nodomains [PCI] Disable support for multiple PCI
1691 root domains (aka PCI segments, in ACPI-speak). 1723 root domains (aka PCI segments, in ACPI-speak).
1692 nommconf [X86-32,X86_64] Disable use of MMCONFIG for PCI 1724 nommconf [X86] Disable use of MMCONFIG for PCI
1693 Configuration 1725 Configuration
1694 nomsi [MSI] If the PCI_MSI kernel config parameter is 1726 nomsi [MSI] If the PCI_MSI kernel config parameter is
1695 enabled, this kernel boot option can be used to 1727 enabled, this kernel boot option can be used to
@@ -1838,6 +1870,12 @@ and is between 256 and 4096 characters. It is defined in the file
1838 autoconfiguration. 1870 autoconfiguration.
1839 Ranges are in pairs (memory base and size). 1871 Ranges are in pairs (memory base and size).
1840 1872
1873 ports= [IP_VS_FTP] IPVS ftp helper module
1874 Default is 21.
1875 Up to 8 (IP_VS_APP_MAX_PORTS) ports
1876 may be specified.
1877 Format: <port>,<port>....
1878
1841 print-fatal-signals= 1879 print-fatal-signals=
1842 [KNL] debug: print fatal signals 1880 [KNL] debug: print fatal signals
1843 print-fatal-signals=1: print segfault info to 1881 print-fatal-signals=1: print segfault info to
@@ -2380,7 +2418,7 @@ and is between 256 and 4096 characters. It is defined in the file
2380 reported either. 2418 reported either.
2381 2419
2382 unknown_nmi_panic 2420 unknown_nmi_panic
2383 [X86-32,X86-64] 2421 [X86]
2384 Set unknown_nmi_panic=1 early on boot. 2422 Set unknown_nmi_panic=1 early on boot.
2385 2423
2386 usbcore.autosuspend= 2424 usbcore.autosuspend=
@@ -2447,12 +2485,12 @@ and is between 256 and 4096 characters. It is defined in the file
2447 medium is write-protected). 2485 medium is write-protected).
2448 Example: quirks=0419:aaf5:rl,0421:0433:rc 2486 Example: quirks=0419:aaf5:rl,0421:0433:rc
2449 2487
2450 vdso= [X86-32,SH,x86-64] 2488 vdso= [X86,SH]
2451 vdso=2: enable compat VDSO (default with COMPAT_VDSO) 2489 vdso=2: enable compat VDSO (default with COMPAT_VDSO)
2452 vdso=1: enable VDSO (default) 2490 vdso=1: enable VDSO (default)
2453 vdso=0: disable VDSO mapping 2491 vdso=0: disable VDSO mapping
2454 2492
2455 vdso32= [X86-32,X86-64] 2493 vdso32= [X86]
2456 vdso32=2: enable compat VDSO (default with COMPAT_VDSO) 2494 vdso32=2: enable compat VDSO (default with COMPAT_VDSO)
2457 vdso32=1: enable 32-bit VDSO (default) 2495 vdso32=1: enable 32-bit VDSO (default)
2458 vdso32=0: disable 32-bit VDSO mapping 2496 vdso32=0: disable 32-bit VDSO mapping
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
index 3d7650768bb5..e7e9a69069e1 100644
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ b/Documentation/laptops/thinkpad-acpi.txt
@@ -1,7 +1,7 @@
1 ThinkPad ACPI Extras Driver 1 ThinkPad ACPI Extras Driver
2 2
3 Version 0.22 3 Version 0.23
4 November 23rd, 2008 4 April 10th, 2009
5 5
6 Borislav Deianov <borislav@users.sf.net> 6 Borislav Deianov <borislav@users.sf.net>
7 Henrique de Moraes Holschuh <hmh@hmh.eng.br> 7 Henrique de Moraes Holschuh <hmh@hmh.eng.br>
diff --git a/Documentation/lguest/.gitignore b/Documentation/lguest/.gitignore
new file mode 100644
index 000000000000..115587fd5f65
--- /dev/null
+++ b/Documentation/lguest/.gitignore
@@ -0,0 +1 @@
lguest
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 29510dc51510..28c747362f95 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -3,11 +3,11 @@
3 /, /` - or, A Young Coder's Illustrated Hypervisor 3 /, /` - or, A Young Coder's Illustrated Hypervisor
4 \\"--\\ http://lguest.ozlabs.org 4 \\"--\\ http://lguest.ozlabs.org
5 5
6Lguest is designed to be a minimal hypervisor for the Linux kernel, for 6Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
7Linux developers and users to experiment with virtualization with the 7for Linux developers and users to experiment with virtualization with the
8minimum of complexity. Nonetheless, it should have sufficient 8minimum of complexity. Nonetheless, it should have sufficient features to
9features to make it useful for specific tasks, and, of course, you are 9make it useful for specific tasks, and, of course, you are encouraged to fork
10encouraged to fork and enhance it (see drivers/lguest/README). 10and enhance it (see drivers/lguest/README).
11 11
12Features: 12Features:
13 13
@@ -37,6 +37,7 @@ Running Lguest:
37 "Paravirtualized guest support" = Y 37 "Paravirtualized guest support" = Y
38 "Lguest guest support" = Y 38 "Lguest guest support" = Y
39 "High Memory Support" = off/4GB 39 "High Memory Support" = off/4GB
40 "PAE (Physical Address Extension) Support" = N
40 "Alignment value to which kernel should be aligned" = 0x100000 41 "Alignment value to which kernel should be aligned" = 0x100000
41 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and 42 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
42 CONFIG_PHYSICAL_ALIGN=0x100000) 43 CONFIG_PHYSICAL_ALIGN=0x100000)
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 5ede7473b425..08762750f121 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -1242,7 +1242,7 @@ monitoring is enabled, and vice-versa.
1242To add ARP targets: 1242To add ARP targets:
1243# echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target 1243# echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target
1244# echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target 1244# echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target
1245 NOTE: up to 10 target addresses may be specified. 1245 NOTE: up to 16 target addresses may be specified.
1246 1246
1247To remove an ARP target: 1247To remove an ARP target:
1248# echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target 1248# echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target
diff --git a/Documentation/powerpc/dts-bindings/fsl/i2c.txt b/Documentation/powerpc/dts-bindings/fsl/i2c.txt
index d0ab33e21fe6..b6d2e21474f9 100644
--- a/Documentation/powerpc/dts-bindings/fsl/i2c.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/i2c.txt
@@ -7,8 +7,10 @@ Required properties :
7 7
8Recommended properties : 8Recommended properties :
9 9
10 - compatible : Should be "fsl-i2c" for parts compatible with 10 - compatible : compatibility list with 2 entries, the first should
11 Freescale I2C specifications. 11 be "fsl,CHIP-i2c" where CHIP is the name of a compatible processor,
12 e.g. mpc8313, mpc8543, mpc8544, mpc5200 or mpc5200b. The second one
13 should be "fsl-i2c".
12 - interrupts : <a b> where a is the interrupt number and b is a 14 - interrupts : <a b> where a is the interrupt number and b is a
13 field that represents an encoding of the sense and level 15 field that represents an encoding of the sense and level
14 information for the interrupt. This should be encoded based on 16 information for the interrupt. This should be encoded based on
@@ -16,17 +18,31 @@ Recommended properties :
16 controller you have. 18 controller you have.
17 - interrupt-parent : the phandle for the interrupt controller that 19 - interrupt-parent : the phandle for the interrupt controller that
18 services interrupts for this device. 20 services interrupts for this device.
19 - dfsrr : boolean; if defined, indicates that this I2C device has 21 - fsl,preserve-clocking : boolean; if defined, the clock settings
20 a digital filter sampling rate register 22 from the bootloader are preserved (not touched).
21 - fsl5200-clocking : boolean; if defined, indicated that this device 23 - clock-frequency : desired I2C bus clock frequency in Hz.
22 uses the FSL 5200 clocking mechanism. 24
23 25Examples :
24Example : 26
25 i2c@3000 { 27 i2c@3d00 {
26 interrupt-parent = <40000>; 28 #address-cells = <1>;
27 interrupts = <1b 3>; 29 #size-cells = <0>;
28 reg = <3000 18>; 30 compatible = "fsl,mpc5200b-i2c","fsl,mpc5200-i2c","fsl-i2c";
29 device_type = "i2c"; 31 cell-index = <0>;
30 compatible = "fsl-i2c"; 32 reg = <0x3d00 0x40>;
31 dfsrr; 33 interrupts = <2 15 0>;
34 interrupt-parent = <&mpc5200_pic>;
35 fsl,preserve-clocking;
32 }; 36 };
37
38 i2c@3100 {
39 #address-cells = <1>;
40 #size-cells = <0>;
41 cell-index = <1>;
42 compatible = "fsl,mpc8544-i2c", "fsl-i2c";
43 reg = <0x3100 0x100>;
44 interrupts = <43 2>;
45 interrupt-parent = <&mpic>;
46 clock-frequency = <400000>;
47 };
48
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt
index c5948f2f9a25..88b7433d2f11 100644
--- a/Documentation/sound/alsa/HD-Audio.txt
+++ b/Documentation/sound/alsa/HD-Audio.txt
@@ -169,7 +169,7 @@ PCI SSID look-up.
169What `model` option values are available depends on the codec chip. 169What `model` option values are available depends on the codec chip.
170Check your codec chip from the codec proc file (see "Codec Proc-File" 170Check your codec chip from the codec proc file (see "Codec Proc-File"
171section below). It will show the vendor/product name of your codec 171section below). It will show the vendor/product name of your codec
172chip. Then, see Documentation/sound/alsa/HD-Audio-Modelstxt file, 172chip. Then, see Documentation/sound/alsa/HD-Audio-Models.txt file,
173the section of HD-audio driver. You can find a list of codecs 173the section of HD-audio driver. You can find a list of codecs
174and `model` options belonging to each codec. For example, for Realtek 174and `model` options belonging to each codec. For example, for Realtek
175ALC262 codec chip, pass `model=ultra` for devices that are compatible 175ALC262 codec chip, pass `model=ultra` for devices that are compatible
@@ -177,7 +177,7 @@ with Samsung Q1 Ultra.
177 177
178Thus, the first thing you can do for any brand-new, unsupported and 178Thus, the first thing you can do for any brand-new, unsupported and
179non-working HD-audio hardware is to check HD-audio codec and several 179non-working HD-audio hardware is to check HD-audio codec and several
180different `model` option values. If you have a luck, some of them 180different `model` option values. If you have any luck, some of them
181might suit with your device well. 181might suit with your device well.
182 182
183Some codecs such as ALC880 have a special model option `model=test`. 183Some codecs such as ALC880 have a special model option `model=test`.
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt
index 42f43fa59f24..34c76a55bc04 100644
--- a/Documentation/sparse.txt
+++ b/Documentation/sparse.txt
@@ -42,6 +42,14 @@ sure that bitwise types don't get mixed up (little-endian vs big-endian
42vs cpu-endian vs whatever), and there the constant "0" really _is_ 42vs cpu-endian vs whatever), and there the constant "0" really _is_
43special. 43special.
44 44
45__bitwise__ - to be used for relatively compact stuff (gfp_t, etc.) that
46is mostly warning-free and is supposed to stay that way. Warnings will
47be generated without __CHECK_ENDIAN__.
48
49__bitwise - noisy stuff; in particular, __le*/__be* are that. We really
50don't want to drown in noise unless we'd explicitly asked for it.
51
52
45Getting sparse 53Getting sparse
46~~~~~~~~~~~~~~ 54~~~~~~~~~~~~~~
47 55
diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary
index 0f5122eb282b..4a02d2508bc8 100644
--- a/Documentation/spi/spi-summary
+++ b/Documentation/spi/spi-summary
@@ -511,10 +511,16 @@ SPI MASTER METHODS
511 This sets up the device clock rate, SPI mode, and word sizes. 511 This sets up the device clock rate, SPI mode, and word sizes.
512 Drivers may change the defaults provided by board_info, and then 512 Drivers may change the defaults provided by board_info, and then
513 call spi_setup(spi) to invoke this routine. It may sleep. 513 call spi_setup(spi) to invoke this routine. It may sleep.
514
514 Unless each SPI slave has its own configuration registers, don't 515 Unless each SPI slave has its own configuration registers, don't
515 change them right away ... otherwise drivers could corrupt I/O 516 change them right away ... otherwise drivers could corrupt I/O
516 that's in progress for other SPI devices. 517 that's in progress for other SPI devices.
517 518
519 ** BUG ALERT: for some reason the first version of
520 ** many spi_master drivers seems to get this wrong.
521 ** When you code setup(), ASSUME that the controller
522 ** is actively processing transfers for another device.
523
518 master->transfer(struct spi_device *spi, struct spi_message *message) 524 master->transfer(struct spi_device *spi, struct spi_message *message)
519 This must not sleep. Its responsibility is to arrange that the 525 This must not sleep. Its responsibility is to arrange that the
520 transfer happens and its complete() callback is issued. The two 526 transfer happens and its complete() callback is issued. The two
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index a34d55b65441..df38ef046f8d 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -95,7 +95,7 @@ of struct cmsghdr structures with appended data.
95 95
96There is only one file in this directory. 96There is only one file in this directory.
97unix_dgram_qlen limits the max number of datagrams queued in Unix domain 97unix_dgram_qlen limits the max number of datagrams queued in Unix domain
98socket's buffer. It will not take effect unless PF_UNIX flag is spicified. 98socket's buffer. It will not take effect unless PF_UNIX flag is specified.
99 99
100 100
1013. /proc/sys/net/ipv4 - IPV4 settings 1013. /proc/sys/net/ipv4 - IPV4 settings
diff --git a/Documentation/tomoyo.txt b/Documentation/tomoyo.txt
new file mode 100644
index 000000000000..b3a232cae7f8
--- /dev/null
+++ b/Documentation/tomoyo.txt
@@ -0,0 +1,55 @@
1--- What is TOMOYO? ---
2
3TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel.
4
5LiveCD-based tutorials are available at
6http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/ubuntu8.04-live/
7http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/centos5-live/ .
8Though these tutorials use non-LSM version of TOMOYO, they are useful for you
9to know what TOMOYO is.
10
11--- How to enable TOMOYO? ---
12
13Build the kernel with CONFIG_SECURITY_TOMOYO=y and pass "security=tomoyo" on
14kernel's command line.
15
16Please see http://tomoyo.sourceforge.jp/en/2.2.x/ for details.
17
18--- Where is documentation? ---
19
20User <-> Kernel interface documentation is available at
21http://tomoyo.sourceforge.jp/en/2.2.x/policy-reference.html .
22
23Materials we prepared for seminars and symposiums are available at
24http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 .
25The lists below are grouped under three questions.
26
27What is TOMOYO?
28 TOMOYO Linux Overview
29 http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf
30 TOMOYO Linux: pragmatic and manageable security for Linux
31 http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf
32 TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box
33 http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf
34
35What can TOMOYO do?
36 Deep inside TOMOYO Linux
37 http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf
38 The role of "pathname based access control" in security.
39 http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf
40
41History of TOMOYO?
42 Realities of Mainlining
43 http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf
44
45--- What is future plan? ---
46
47We believe that inode based security and name based security are complementary
48and both should be used together. But unfortunately, so far, we cannot enable
49multiple LSM modules at the same time. We feel sorry that you have to give up
50SELinux/SMACK/AppArmor etc. when you want to use TOMOYO.
51
52We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM
53version of TOMOYO, available at http://tomoyo.sourceforge.jp/en/1.6.x/ .
54LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning
55to port non-LSM version's functionalities to LSM versions.
diff --git a/Documentation/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e693813..fd9a3e693813 100644
--- a/Documentation/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/trace/kmemtrace.txt
index a956d9b7f943..a956d9b7f943 100644
--- a/Documentation/vm/kmemtrace.txt
+++ b/Documentation/trace/kmemtrace.txt
diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/trace/mmiotrace.txt
index 5731c67abc55..5731c67abc55 100644
--- a/Documentation/tracers/mmiotrace.txt
+++ b/Documentation/trace/mmiotrace.txt
diff --git a/Documentation/tracepoints.txt b/Documentation/trace/tracepoints.txt
index c0e1ceed75a4..c0e1ceed75a4 100644
--- a/Documentation/tracepoints.txt
+++ b/Documentation/trace/tracepoints.txt
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 2131b00b63f6..2f77ced35df7 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -1,5 +1,7 @@
100-INDEX 100-INDEX
2 - this file. 2 - this file.
3active_mm.txt
4 - An explanation from Linus about tsk->active_mm vs tsk->mm.
3balance 5balance
4 - various information on memory balancing. 6 - various information on memory balancing.
5hugetlbpage.txt 7hugetlbpage.txt
diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt
new file mode 100644
index 000000000000..4ee1f643d897
--- /dev/null
+++ b/Documentation/vm/active_mm.txt
@@ -0,0 +1,83 @@
1List: linux-kernel
2Subject: Re: active_mm
3From: Linus Torvalds <torvalds () transmeta ! com>
4Date: 1999-07-30 21:36:24
5
6Cc'd to linux-kernel, because I don't write explanations all that often,
7and when I do I feel better about more people reading them.
8
9On Fri, 30 Jul 1999, David Mosberger wrote:
10>
11> Is there a brief description someplace on how "mm" vs. "active_mm" in
12> the task_struct are supposed to be used? (My apologies if this was
13> discussed on the mailing lists---I just returned from vacation and
14> wasn't able to follow linux-kernel for a while).
15
16Basically, the new setup is:
17
18 - we have "real address spaces" and "anonymous address spaces". The
19 difference is that an anonymous address space doesn't care about the
20 user-level page tables at all, so when we do a context switch into an
21 anonymous address space we just leave the previous address space
22 active.
23
24 The obvious use for an "anonymous address space" is any thread that
25 doesn't need any user mappings - all kernel threads basically fall into
26 this category, but even "real" threads can temporarily say that for
27 some amount of time they are not going to be interested in user space,
28 and that the scheduler might as well try to avoid wasting time on
29 switching the VM state around. Currently only the old-style bdflush
30 sync does that.
31
32 - "tsk->mm" points to the "real address space". For an anonymous process,
33 tsk->mm will be NULL, for the logical reason that an anonymous process
34 really doesn't _have_ a real address space at all.
35
36 - however, we obviously need to keep track of which address space we
37 "stole" for such an anonymous user. For that, we have "tsk->active_mm",
38 which shows what the currently active address space is.
39
40 The rule is that for a process with a real address space (ie tsk->mm is
41 non-NULL) the active_mm obviously always has to be the same as the real
42 one.
43
44 For an anonymous process, tsk->mm == NULL, and tsk->active_mm is the
45 "borrowed" mm while the anonymous process is running. When the
46 anonymous process gets scheduled away, the borrowed address space is
47 returned and cleared.
48
49To support all that, the "struct mm_struct" now has two counters: a
50"mm_users" counter that is how many "real address space users" there are,
51and a "mm_count" counter that is the number of "lazy" users (ie anonymous
52users) plus one if there are any real users.
53
54Usually there is at least one real user, but it could be that the real
55user exited on another CPU while a lazy user was still active, so you do
56actually get cases where you have an address space that is _only_ used by
57lazy users. That is often a short-lived state, because once that thread
58gets scheduled away in favour of a real thread, the "zombie" mm gets
59released because "mm_users" becomes zero.
60
61Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any
62more. "init_mm" should be considered just a "lazy context when no other
63context is available", and in fact it is mainly used just at bootup when
64no real VM has yet been created. So code that used to check
65
66 if (current->mm == &init_mm)
67
68should generally just do
69
70 if (!current->mm)
71
72instead (which makes more sense anyway - the test is basically one of "do
73we have a user context", and is generally done by the page fault handler
74and things like that).
75
76Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago,
77because it slightly changes the interfaces to accommodate the alpha (who
78would have thought it, but the alpha actually ends up having one of the
79ugliest context switch codes - unlike the other architectures where the MM
80and register state is separate, the alpha PALcode joins the two, and you
81need to switch both together).
82
83(From http://marc.info/?l=linux-kernel&m=93337278602211&w=2)
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index 0706a7282a8c..2d70d0d95108 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -1,588 +1,691 @@
1 1 ==============================
2This document describes the Linux memory management "Unevictable LRU" 2 UNEVICTABLE LRU INFRASTRUCTURE
3infrastructure and the use of this infrastructure to manage several types 3 ==============================
4of "unevictable" pages. The document attempts to provide the overall 4
5rationale behind this mechanism and the rationale for some of the design 5========
6decisions that drove the implementation. The latter design rationale is 6CONTENTS
7discussed in the context of an implementation description. Admittedly, one 7========
8can obtain the implementation details--the "what does it do?"--by reading the 8
9code. One hopes that the descriptions below add value by provide the answer 9 (*) The Unevictable LRU
10to "why does it do that?". 10
11 11 - The unevictable page list.
12Unevictable LRU Infrastructure: 12 - Memory control group interaction.
13 13 - Marking address spaces unevictable.
14The Unevictable LRU adds an additional LRU list to track unevictable pages 14 - Detecting Unevictable Pages.
15and to hide these pages from vmscan. This mechanism is based on a patch by 15 - vmscan's handling of unevictable pages.
16Larry Woodman of Red Hat to address several scalability problems with page 16
17 (*) mlock()'d pages.
18
19 - History.
20 - Basic management.
21 - mlock()/mlockall() system call handling.
22 - Filtering special vmas.
23 - munlock()/munlockall() system call handling.
24 - Migrating mlocked pages.
25 - mmap(MAP_LOCKED) system call handling.
26 - munmap()/exit()/exec() system call handling.
27 - try_to_unmap().
28 - try_to_munlock() reverse map scan.
29 - Page reclaim in shrink_*_list().
30
31
32============
33INTRODUCTION
34============
35
36This document describes the Linux memory manager's "Unevictable LRU"
37infrastructure and the use of this to manage several types of "unevictable"
38pages.
39
40The document attempts to provide the overall rationale behind this mechanism
41and the rationale for some of the design decisions that drove the
42implementation. The latter design rationale is discussed in the context of an
43implementation description. Admittedly, one can obtain the implementation
44details - the "what does it do?" - by reading the code. One hopes that the
45descriptions below add value by providing the answer to "why does it do that?".
46
47
48===================
49THE UNEVICTABLE LRU
50===================
51
52The Unevictable LRU facility adds an additional LRU list to track unevictable
53pages and to hide these pages from vmscan. This mechanism is based on a patch
54by Larry Woodman of Red Hat to address several scalability problems with page
17reclaim in Linux. The problems have been observed at customer sites on large 55reclaim in Linux. The problems have been observed at customer sites on large
18memory x86_64 systems. For example, a non-numal x86_64 platform with 128GB 56memory x86_64 systems.
19of main memory will have over 32 million 4k pages in a single zone. When a 57
20large fraction of these pages are not evictable for any reason [see below], 58To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
21vmscan will spend a lot of time scanning the LRU lists looking for the small 59main memory will have over 32 million 4k pages in a single zone. When a large
22fraction of pages that are evictable. This can result in a situation where 60fraction of these pages are not evictable for any reason [see below], vmscan
23all cpus are spending 100% of their time in vmscan for hours or days on end, 61will spend a lot of time scanning the LRU lists looking for the small fraction
24with the system completely unresponsive. 62of pages that are evictable. This can result in a situation where all CPUs are
25 63spending 100% of their time in vmscan for hours or days on end, with the system
26The Unevictable LRU infrastructure addresses the following classes of 64completely unresponsive.
27unevictable pages: 65
28 66The unevictable list addresses the following classes of unevictable pages:
29+ page owned by ramfs 67
30+ page mapped into SHM_LOCKed shared memory regions 68 (*) Those owned by ramfs.
31+ page mapped into VM_LOCKED [mlock()ed] vmas 69
32 70 (*) Those mapped into SHM_LOCK'd shared memory regions.
33The infrastructure might be able to handle other conditions that make pages 71
72 (*) Those mapped into VM_LOCKED [mlock()ed] VMAs.
73
74The infrastructure may also be able to handle other conditions that make pages
34unevictable, either by definition or by circumstance, in the future. 75unevictable, either by definition or by circumstance, in the future.
35 76
36 77
37The Unevictable LRU List 78THE UNEVICTABLE PAGE LIST
79-------------------------
38 80
39The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list 81The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list
40called the "unevictable" list and an associated page flag, PG_unevictable, to 82called the "unevictable" list and an associated page flag, PG_unevictable, to
41indicate that the page is being managed on the unevictable list. The 83indicate that the page is being managed on the unevictable list.
42PG_unevictable flag is analogous to, and mutually exclusive with, the PG_active 84
43flag in that it indicates on which LRU list a page resides when PG_lru is set. 85The PG_unevictable flag is analogous to, and mutually exclusive with, the
44The unevictable LRU list is source configurable based on the UNEVICTABLE_LRU 86PG_active flag in that it indicates on which LRU list a page resides when
45Kconfig option. 87PG_lru is set. The unevictable list is compile-time configurable based on the
88UNEVICTABLE_LRU Kconfig option.
46 89
47The Unevictable LRU infrastructure maintains unevictable pages on an additional 90The Unevictable LRU infrastructure maintains unevictable pages on an additional
48LRU list for a few reasons: 91LRU list for a few reasons:
49 92
501) We get to "treat unevictable pages just like we treat other pages in the 93 (1) We get to "treat unevictable pages just like we treat other pages in the
51 system, which means we get to use the same code to manipulate them, the 94 system - which means we get to use the same code to manipulate them, the
52 same code to isolate them (for migrate, etc.), the same code to keep track 95 same code to isolate them (for migrate, etc.), the same code to keep track
53 of the statistics, etc..." [Rik van Riel] 96 of the statistics, etc..." [Rik van Riel]
97
98 (2) We want to be able to migrate unevictable pages between nodes for memory
99 defragmentation, workload management and memory hotplug. The Linux kernel
100 can only migrate pages that it can successfully isolate from the LRU
101 lists. If we were to maintain pages elsewhere than on an LRU-like list,
102 where they can be found by isolate_lru_page(), we would prevent their
103 migration, unless we reworked migration code to find the unevictable pages
104 itself.
54 105
552) We want to be able to migrate unevictable pages between nodes--for memory
56 defragmentation, workload management and memory hotplug. The linux kernel
57 can only migrate pages that it can successfully isolate from the lru lists.
58 If we were to maintain pages elsewise than on an lru-like list, where they
59 can be found by isolate_lru_page(), we would prevent their migration, unless
60 we reworked migration code to find the unevictable pages.
61 106
107The unevictable list does not differentiate between file-backed and anonymous,
108swap-backed pages. This differentiation is only important while the pages are,
109in fact, evictable.
62 110
63The unevictable LRU list does not differentiate between file backed and swap 111The unevictable list benefits from the "arrayification" of the per-zone LRU
64backed [anon] pages. This differentiation is only important while the pages 112lists and statistics originally proposed and posted by Christoph Lameter.
65are, in fact, evictable.
66 113
67The unevictable LRU list benefits from the "arrayification" of the per-zone 114The unevictable list does not use the LRU pagevec mechanism. Rather,
68LRU lists and statistics originally proposed and posted by Christoph Lameter. 115unevictable pages are placed directly on the page's zone's unevictable list
116under the zone lru_lock. This allows us to prevent the stranding of pages on
117the unevictable list when one task has the page isolated from the LRU and other
118tasks are changing the "evictability" state of the page.
69 119
70The unevictable list does not use the lru pagevec mechanism. Rather,
71unevictable pages are placed directly on the page's zone's unevictable
72list under the zone lru_lock. The reason for this is to prevent stranding
73of pages on the unevictable list when one task has the page isolated from the
74lru and other tasks are changing the "evictability" state of the page.
75 120
121MEMORY CONTROL GROUP INTERACTION
122--------------------------------
76 123
77Unevictable LRU and Memory Controller Interaction 124The unevictable LRU facility interacts with the memory control group [aka
125memory controller; see Documentation/cgroups/memory.txt] by extending the
126lru_list enum.
127
128The memory controller data structure automatically gets a per-zone unevictable
129list as a result of the "arrayification" of the per-zone LRU lists (one per
130lru_list enum element). The memory controller tracks the movement of pages to
131and from the unevictable list.
78 132
79The memory controller data structure automatically gets a per zone unevictable
80lru list as a result of the "arrayification" of the per-zone LRU lists. The
81memory controller tracks the movement of pages to and from the unevictable list.
82When a memory control group comes under memory pressure, the controller will 133When a memory control group comes under memory pressure, the controller will
83not attempt to reclaim pages on the unevictable list. This has a couple of 134not attempt to reclaim pages on the unevictable list. This has a couple of
84effects. Because the pages are "hidden" from reclaim on the unevictable list, 135effects:
85the reclaim process can be more efficient, dealing only with pages that have 136
86a chance of being reclaimed. On the other hand, if too many of the pages 137 (1) Because the pages are "hidden" from reclaim on the unevictable list, the
87charged to the control group are unevictable, the evictable portion of the 138 reclaim process can be more efficient, dealing only with pages that have a
88working set of the tasks in the control group may not fit into the available 139 chance of being reclaimed.
89memory. This can cause the control group to thrash or to oom-kill tasks. 140
90 141 (2) On the other hand, if too many of the pages charged to the control group
91 142 are unevictable, the evictable portion of the working set of the tasks in
92Unevictable LRU: Detecting Unevictable Pages 143 the control group may not fit into the available memory. This can cause
93 144 the control group to thrash or to OOM-kill tasks.
94The function page_evictable(page, vma) in vmscan.c determines whether a 145
95page is evictable or not. For ramfs pages and pages in SHM_LOCKed regions, 146
96page_evictable() tests a new address space flag, AS_UNEVICTABLE, in the page's 147MARKING ADDRESS SPACES UNEVICTABLE
97address space using a wrapper function. Wrapper functions are used to set, 148----------------------------------
98clear and test the flag to reduce the requirement for #ifdef's throughout the 149
99source code. AS_UNEVICTABLE is set on ramfs inode/mapping when it is created. 150For facilities such as ramfs none of the pages attached to the address space
100This flag remains for the life of the inode. 151may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE
101 152address space flag is provided, and this can be manipulated by a filesystem
102For shared memory regions, AS_UNEVICTABLE is set when an application 153using a number of wrapper functions:
103successfully SHM_LOCKs the region and is removed when the region is 154
104SHM_UNLOCKed. Note that shmctl(SHM_LOCK, ...) does not populate the page 155 (*) void mapping_set_unevictable(struct address_space *mapping);
105tables for the region as does, for example, mlock(). So, we make no special 156
106effort to push any pages in the SHM_LOCKed region to the unevictable list. 157 Mark the address space as being completely unevictable.
107Vmscan will do this when/if it encounters the pages during reclaim. On 158
108SHM_UNLOCK, shmctl() scans the pages in the region and "rescues" them from the 159 (*) void mapping_clear_unevictable(struct address_space *mapping);
109unevictable list if no other condition keeps them unevictable. If a SHM_LOCKed 160
110region is destroyed, the pages are also "rescued" from the unevictable list in 161 Mark the address space as being evictable.
111the process of freeing them. 162
112 163 (*) int mapping_unevictable(struct address_space *mapping);
113page_evictable() detects mlock()ed pages by testing an additional page flag, 164
114PG_mlocked via the PageMlocked() wrapper. If the page is NOT mlocked, and a 165 Query the address space, and return true if it is completely
115non-NULL vma is supplied, page_evictable() will check whether the vma is 166 unevictable.
167
168These are currently used in two places in the kernel:
169
170 (1) By ramfs to mark the address spaces of its inodes when they are created,
171 and this mark remains for the life of the inode.
172
173 (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called.
174
175 Note that SHM_LOCK is not required to page in the locked pages if they're
176 swapped out; the application must touch the pages manually if it wants to
177 ensure they're in memory.
178
179
180DETECTING UNEVICTABLE PAGES
181---------------------------
182
183The function page_evictable() in vmscan.c determines whether a page is
184evictable or not using the query function outlined above [see section "Marking
185address spaces unevictable"] to check the AS_UNEVICTABLE flag.
186
187For address spaces that are so marked after being populated (as SHM regions
188might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate
189the page tables for the region as does, for example, mlock(), nor need it make
190any special effort to push any pages in the SHM_LOCK'd area to the unevictable
191list. Instead, vmscan will do this if and when it encounters the pages during
192a reclamation scan.
193
194On an unlock action (such as SHM_UNLOCK), the unlocker (eg: shmctl()) must scan
195the pages in the region and "rescue" them from the unevictable list if no other
196condition is keeping them unevictable. If an unevictable region is destroyed,
197the pages are also "rescued" from the unevictable list in the process of
198freeing them.
199
200page_evictable() also checks for mlocked pages by testing an additional page
201flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked,
202and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is
116VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and 203VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and
117update the appropriate statistics if the vma is VM_LOCKED. This method allows 204update the appropriate statistics if the vma is VM_LOCKED. This method allows
118efficient "culling" of pages in the fault path that are being faulted in to 205efficient "culling" of pages in the fault path that are being faulted in to
119VM_LOCKED vmas. 206VM_LOCKED VMAs.
120 207
121 208
122Unevictable Pages and Vmscan [shrink_*_list()] 209VMSCAN'S HANDLING OF UNEVICTABLE PAGES
210--------------------------------------
123 211
124If unevictable pages are culled in the fault path, or moved to the unevictable 212If unevictable pages are culled in the fault path, or moved to the unevictable
125list at mlock() or mmap() time, vmscan will never encounter the pages until 213list at mlock() or mmap() time, vmscan will not encounter the pages until they
126they have become evictable again, for example, via munlock() and have been 214have become evictable again (via munlock() for example) and have been "rescued"
127"rescued" from the unevictable list. However, there may be situations where we 215from the unevictable list. However, there may be situations where we decide,
128decide, for the sake of expediency, to leave an unevictable page on one of the 216for the sake of expediency, to leave an unevictable page on one of the regular
129regular active/inactive LRU lists for vmscan to deal with. Vmscan checks for 217active/inactive LRU lists for vmscan to deal with. vmscan checks for such
130such pages in all of the shrink_{active|inactive|page}_list() functions and 218pages in all of the shrink_{active|inactive|page}_list() functions and will
131will "cull" such pages that it encounters--that is, it diverts those pages to 219"cull" such pages that it encounters: that is, it diverts those pages to the
132the unevictable list for the zone being scanned. 220unevictable list for the zone being scanned.
133 221
134There may be situations where a page is mapped into a VM_LOCKED vma, but the 222There may be situations where a page is mapped into a VM_LOCKED VMA, but the
135page is not marked as PageMlocked. Such pages will make it all the way to 223page is not marked as PG_mlocked. Such pages will make it all the way to
136shrink_page_list() where they will be detected when vmscan walks the reverse 224shrink_page_list() where they will be detected when vmscan walks the reverse
137map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, shrink_page_list() 225map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK,
138will cull the page at that point. 226shrink_page_list() will cull the page at that point.
139 227
140To "cull" an unevictable page, vmscan simply puts the page back on the lru 228To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
141list using putback_lru_page()--the inverse operation to isolate_lru_page()-- 229using putback_lru_page() - the inverse operation to isolate_lru_page() - after
142after dropping the page lock. Because the condition which makes the page 230dropping the page lock. Because the condition which makes the page unevictable
143unevictable may change once the page is unlocked, putback_lru_page() will 231may change once the page is unlocked, putback_lru_page() will recheck the
144recheck the unevictable state of a page that it places on the unevictable lru 232unevictable state of a page that it places on the unevictable list. If the
145list. If the page has become evictable, putback_lru_page() removes it from 233page has become evictable, putback_lru_page() removes it from the list and
146the list and retries, including the page_evictable() test. Because such a 234retries, including the page_evictable() test. Because such a race is a rare
147race is a rare event and movement of pages onto the unevictable list should be 235event and movement of pages onto the unevictable list should be rare, these
148rare, these extra evictability checks should not occur in the majority of calls 236extra evictability checks should not occur in the majority of calls to
149to putback_lru_page(). 237putback_lru_page().
150 238
151 239
152Mlocked Page: Prior Work 240=============
241MLOCKED PAGES
242=============
153 243
154The "Unevictable Mlocked Pages" infrastructure is based on work originally 244The unevictable page list is also useful for mlock(), in addition to ramfs and
245SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in
246NOMMU situations, all mappings are effectively mlocked.
247
248
249HISTORY
250-------
251
252The "Unevictable mlocked Pages" infrastructure is based on work originally
155posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". 253posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
156Nick posted his patch as an alternative to a patch posted by Christoph 254Nick posted his patch as an alternative to a patch posted by Christoph Lameter
157Lameter to achieve the same objective--hiding mlocked pages from vmscan. 255to achieve the same objective: hiding mlocked pages from vmscan.
158In Nick's patch, he used one of the struct page lru list link fields as a count 256
159of VM_LOCKED vmas that map the page. This use of the link field for a count 257In Nick's patch, he used one of the struct page LRU list link fields as a count
160prevented the management of the pages on an LRU list. Thus, mlocked pages were 258of VM_LOCKED VMAs that map the page. This use of the link field for a count
161not migratable as isolate_lru_page() could not find them and the lru list link 259prevented the management of the pages on an LRU list, and thus mlocked pages
162field was not available to the migration subsystem. Nick resolved this by 260were not migratable as isolate_lru_page() could not find them, and the LRU list
163putting mlocked pages back on the lru list before attempting to isolate them, 261link field was not available to the migration subsystem.
164thus abandoning the count of VM_LOCKED vmas. When Nick's patch was integrated 262
165with the Unevictable LRU work, the count was replaced by walking the reverse 263Nick resolved this by putting mlocked pages back on the LRU list before
166map to determine whether any VM_LOCKED vmas mapped the page. More on this 264attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When
167below. 265Nick's patch was integrated with the Unevictable LRU work, the count was
168 266replaced by walking the reverse map to determine whether any VM_LOCKED VMAs
169 267mapped the page. More on this below.
170Mlocked Pages: Basic Management 268
171 269
172Mlocked pages--pages mapped into a VM_LOCKED vma--represent one class of 270BASIC MANAGEMENT
173unevictable pages. When such a page has been "noticed" by the memory 271----------------
174management subsystem, the page is marked with the PG_mlocked [PageMlocked()] 272
175flag. A PageMlocked() page will be placed on the unevictable LRU list when 273mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable
176it is added to the LRU. Pages can be "noticed" by memory management in 274pages. When such a page has been "noticed" by the memory management subsystem,
177several places: 275the page is marked with the PG_mlocked flag. This can be manipulated using the
178 276PageMlocked() functions.
1791) in the mlock()/mlockall() system call handlers. 277
1802) in the mmap() system call handler when mmap()ing a region with the 278A PG_mlocked page will be placed on the unevictable list when it is added to
181 MAP_LOCKED flag, or mmap()ing a region in a task that has called 279the LRU. Such pages can be "noticed" by memory management in several places:
182 mlockall() with the MCL_FUTURE flag. Both of these conditions result 280
183 in the VM_LOCKED flag being set for the vma. 281 (1) in the mlock()/mlockall() system call handlers;
1843) in the fault path, if mlocked pages are "culled" in the fault path, 282
185 and when a VM_LOCKED stack segment is expanded. 283 (2) in the mmap() system call handler when mmapping a region with the
1864) as mentioned above, in vmscan:shrink_page_list() when attempting to 284 MAP_LOCKED flag;
187 reclaim a page in a VM_LOCKED vma via try_to_unmap(). 285
188 286 (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE
189Mlocked pages become unlocked and rescued from the unevictable list when: 287 flag
190 288
1911) mapped in a range unlocked via the munlock()/munlockall() system calls. 289 (4) in the fault path, if mlocked pages are "culled" in the fault path,
1922) munmapped() out of the last VM_LOCKED vma that maps the page, including 290 and when a VM_LOCKED stack segment is expanded; or
193 unmapping at task exit. 291
1943) when the page is truncated from the last VM_LOCKED vma of an mmap()ed file. 292 (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
1954) before a page is COWed in a VM_LOCKED vma. 293 reclaim a page in a VM_LOCKED VMA via try_to_unmap()
196 294
197 295all of which result in the VM_LOCKED flag being set for the VMA if it doesn't
198Mlocked Pages: mlock()/mlockall() System Call Handling 296already have it set.
297
298mlocked pages become unlocked and rescued from the unevictable list when:
299
300 (1) mapped in a range unlocked via the munlock()/munlockall() system calls;
301
302 (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including
303 unmapping at task exit;
304
305 (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file;
306 or
307
308 (4) before a page is COW'd in a VM_LOCKED VMA.
309
310
311mlock()/mlockall() SYSTEM CALL HANDLING
312---------------------------------------
199 313
200Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup() 314Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup()
201for each vma in the range specified by the call. In the case of mlockall(), 315for each VMA in the range specified by the call. In the case of mlockall(),
202this is the entire active address space of the task. Note that mlock_fixup() 316this is the entire active address space of the task. Note that mlock_fixup()
203is used for both mlock()ing and munlock()ing a range of memory. A call to 317is used for both mlocking and munlocking a range of memory. A call to mlock()
204mlock() an already VM_LOCKED vma, or to munlock() a vma that is not VM_LOCKED 318an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is
205is treated as a no-op--mlock_fixup() simply returns. 319treated as a no-op, and mlock_fixup() simply returns.
206 320
207If the vma passes some filtering described in "Mlocked Pages: Filtering Vmas" 321If the VMA passes some filtering as described in "Filtering Special VMAs"
208below, mlock_fixup() will attempt to merge the vma with its neighbors or split 322below, mlock_fixup() will attempt to merge the VMA with its neighbors or split
209off a subset of the vma if the range does not cover the entire vma. Once the 323off a subset of the VMA if the range does not cover the entire VMA. Once the
210vma has been merged or split or neither, mlock_fixup() will call 324VMA has been merged or split or neither, mlock_fixup() will call
211__mlock_vma_pages_range() to fault in the pages via get_user_pages() and 325__mlock_vma_pages_range() to fault in the pages via get_user_pages() and to
212to mark the pages as mlocked via mlock_vma_page(). 326mark the pages as mlocked via mlock_vma_page().
213 327
214Note that the vma being mlocked might be mapped with PROT_NONE. In this case, 328Note that the VMA being mlocked might be mapped with PROT_NONE. In this case,
215get_user_pages() will be unable to fault in the pages. That's OK. If pages 329get_user_pages() will be unable to fault in the pages. That's okay. If pages
216do end up getting faulted into this VM_LOCKED vma, we'll handle them in the 330do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the
217fault path or in vmscan. 331fault path or in vmscan.
218 332
219Also note that a page returned by get_user_pages() could be truncated or 333Also note that a page returned by get_user_pages() could be truncated or
220migrated out from under us, while we're trying to mlock it. To detect 334migrated out from under us, while we're trying to mlock it. To detect this,
221this, __mlock_vma_pages_range() tests the page_mapping after acquiring 335__mlock_vma_pages_range() checks page_mapping() after acquiring the page lock.
222the page lock. If the page is still associated with its mapping, we'll 336If the page is still associated with its mapping, we'll go ahead and call
223go ahead and call mlock_vma_page(). If the mapping is gone, we just 337mlock_vma_page(). If the mapping is gone, we just unlock the page and move on.
224unlock the page and move on. Worse case, this results in page mapped 338In the worst case, this will result in a page mapped in a VM_LOCKED VMA
225in a VM_LOCKED vma remaining on a normal LRU list without being 339remaining on a normal LRU list without being PageMlocked(). Again, vmscan will
226PageMlocked(). Again, vmscan will detect and cull such pages. 340detect and cull such pages.
227 341
228mlock_vma_page(), called with the page locked [N.B., not "mlocked"], will 342mlock_vma_page() will call TestSetPageMlocked() for each page returned by
229TestSetPageMlocked() for each page returned by get_user_pages(). We use 343get_user_pages(). We use TestSetPageMlocked() because the page might already
230TestSetPageMlocked() because the page might already be mlocked by another 344be mlocked by another task/VMA and we don't want to do extra work. We
231task/vma and we don't want to do extra work. We especially do not want to 345especially do not want to count an mlocked page more than once in the
232count an mlocked page more than once in the statistics. If the page was 346statistics. If the page was already mlocked, mlock_vma_page() need do nothing
233already mlocked, mlock_vma_page() is done. 347more.
234 348
235If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the 349If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the
236page from the LRU, as it is likely on the appropriate active or inactive list 350page from the LRU, as it is likely on the appropriate active or inactive list
237at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will 351at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put
238putback the page--putback_lru_page()--which will notice that the page is now 352back the page - by calling putback_lru_page() - which will notice that the page
239mlocked and divert the page to the zone's unevictable LRU list. If 353is now mlocked and divert the page to the zone's unevictable list. If
240mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle 354mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle
241it later if/when it attempts to reclaim the page. 355it later if and when it attempts to reclaim the page.
242 356
243 357
244Mlocked Pages: Filtering Special Vmas 358FILTERING SPECIAL VMAS
359----------------------
245 360
246mlock_fixup() filters several classes of "special" vmas: 361mlock_fixup() filters several classes of "special" VMAs:
247 362
2481) vmas with VM_IO|VM_PFNMAP set are skipped entirely. The pages behind 3631) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind
249 these mappings are inherently pinned, so we don't need to mark them as 364 these mappings are inherently pinned, so we don't need to mark them as
250 mlocked. In any case, most of the pages have no struct page in which to 365 mlocked. In any case, most of the pages have no struct page in which to so
251 so mark the page. Because of this, get_user_pages() will fail for these 366 mark the page. Because of this, get_user_pages() will fail for these VMAs,
252 vmas, so there is no sense in attempting to visit them. 367 so there is no sense in attempting to visit them.
253 368
2542) vmas mapping hugetlbfs page are already effectively pinned into memory. 3692) VMAs mapping hugetlbfs pages are already effectively pinned into memory. We
255 We don't need nor want to mlock() these pages. However, to preserve the 370 neither need nor want to mlock() these pages. However, to preserve the
256 prior behavior of mlock()--before the unevictable/mlock changes-- 371 prior behavior of mlock() - before the unevictable/mlock changes -
257 mlock_fixup() will call make_pages_present() in the hugetlbfs vma range 372 mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to
258 to allocate the huge pages and populate the ptes. 373 allocate the huge pages and populate the ptes.
259 374
2603) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of 3753) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of
261 kernel pages, such as the vdso page, relay channel pages, etc. These pages 376 kernel pages, such as the VDSO page, relay channel pages, etc. These pages
262 are inherently unevictable and are not managed on the LRU lists. 377 are inherently unevictable and are not managed on the LRU lists.
263 mlock_fixup() treats these vmas the same as hugetlbfs vmas. It calls 378 mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls
264 make_pages_present() to populate the ptes. 379 make_pages_present() to populate the ptes.
265 380
266Note that for all of these special vmas, mlock_fixup() does not set the 381Note that for all of these special VMAs, mlock_fixup() does not set the
267VM_LOCKED flag. Therefore, we won't have to deal with them later during 382VM_LOCKED flag. Therefore, we won't have to deal with them later during
268munlock() or munmap()--for example, at task exit. Neither does mlock_fixup() 383munlock(), munmap() or task exit. Neither does mlock_fixup() account these
269account these vmas against the task's "locked_vm". 384VMAs against the task's "locked_vm".
270 385
271Mlocked Pages: Downgrading the Mmap Semaphore. 386
272 387munlock()/munlockall() SYSTEM CALL HANDLING
273mlock_fixup() must be called with the mmap semaphore held for write, because 388-------------------------------------------
274it may have to merge or split vmas. However, mlocking a large region of 389
275memory can take a long time--especially if vmscan must reclaim pages to 390The munlock() and munlockall() system calls are handled by the same functions -
276satisfy the regions requirements. Faulting in a large region with the mmap 391do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs
277semaphore held for write can hold off other faults on the address space, in 392lock operation indicated by an argument. So, these system calls are also
278the case of a multi-threaded task. It can also hold off scans of the task's 393handled by mlock_fixup(). Again, if called for an already munlocked VMA,
279address space via /proc. While testing under heavy load, it was observed that 394mlock_fixup() simply returns. Because of the VMA filtering discussed above,
280the ps(1) command could be held off for many minutes while a large segment was 395VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be
281mlock()ed down.
282
283To address this issue, and to make the system more responsive during mlock()ing
284of large segments, mlock_fixup() downgrades the mmap semaphore to read mode
285during the call to __mlock_vma_pages_range(). This works fine. However, the
286callers of mlock_fixup() expect the semaphore to be returned in write mode.
287So, mlock_fixup() "upgrades" the semphore to write mode. Linux does not
288support an atomic upgrade_sem() call, so mlock_fixup() must drop the semaphore
289and reacquire it in write mode. In a multi-threaded task, it is possible for
290the task memory map to change while the semaphore is dropped. Therefore,
291mlock_fixup() looks up the vma at the range start address after reacquiring
292the semaphore in write mode and verifies that it still covers the original
293range. If not, mlock_fixup() returns an error [-EAGAIN]. All callers of
294mlock_fixup() have been changed to deal with this new error condition.
295
296Note: when munlocking a region, all of the pages should already be resident--
297unless we have racing threads mlocking() and munlocking() regions. So,
298unlocking should not have to wait for page allocations nor faults of any kind.
299Therefore mlock_fixup() does not downgrade the semaphore for munlock().
300
301
302Mlocked Pages: munlock()/munlockall() System Call Handling
303
304The munlock() and munlockall() system calls are handled by the same functions--
305do_mlock[all]()--as the mlock() and mlockall() system calls with the unlock
306vs lock operation indicated by an argument. So, these system calls are also
307handled by mlock_fixup(). Again, if called for an already munlock()ed vma,
308mlock_fixup() simply returns. Because of the vma filtering discussed above,
309VM_LOCKED will not be set in any "special" vmas. So, these vmas will be
310ignored for munlock. 396ignored for munlock.
311 397
312If the vma is VM_LOCKED, mlock_fixup() again attempts to merge or split off 398If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the
313the specified range. The range is then munlocked via the function 399specified range. The range is then munlocked via the function
314__mlock_vma_pages_range()--the same function used to mlock a vma range-- 400__mlock_vma_pages_range() - the same function used to mlock a VMA range -
315passing a flag to indicate that munlock() is being performed. 401passing a flag to indicate that munlock() is being performed.
316 402
317Because the vma access protections could have been changed to PROT_NONE after 403Because the VMA access protections could have been changed to PROT_NONE after
318faulting in and mlocking pages, get_user_pages() was unreliable for visiting 404faulting in and mlocking pages, get_user_pages() was unreliable for visiting
319these pages for munlocking. Because we don't want to leave pages mlocked(), 405these pages for munlocking. Because we don't want to leave pages mlocked,
320get_user_pages() was enhanced to accept a flag to ignore the permissions when 406get_user_pages() was enhanced to accept a flag to ignore the permissions when
321fetching the pages--all of which should be resident as a result of previous 407fetching the pages - all of which should be resident as a result of previous
322mlock()ing. 408mlocking.
323 409
324For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling 410For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling
325munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked 411munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked
326flag using TestClearPageMlocked(). As with mlock_vma_page(), munlock_vma_page() 412flag using TestClearPageMlocked(). As with mlock_vma_page(),
327use the Test*PageMlocked() function to handle the case where the page might 413munlock_vma_page() uses the Test*PageMlocked() function to handle the case where
328have already been unlocked by another task. If the page was mlocked, 414the page might have already been unlocked by another task. If the page was
329munlock_vma_page() updates that zone statistics for the number of mlocked 415mlocked, munlock_vma_page() updates the zone statistics for the number of
330pages. Note, however, that at this point we haven't checked whether the page 416mlocked pages. Note, however, that at this point we haven't checked whether
331is mapped by other VM_LOCKED vmas. 417the page is mapped by other VM_LOCKED VMAs.
332 418
333We can't call try_to_munlock(), the function that walks the reverse map to check 419We can't call try_to_munlock(), the function that walks the reverse map to
334for other VM_LOCKED vmas, without first isolating the page from the LRU. 420check for other VM_LOCKED VMAs, without first isolating the page from the LRU.
335try_to_munlock() is a variant of try_to_unmap() and thus requires that the page 421try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
336not be on an lru list. [More on these below.] However, the call to 422not be on an LRU list [more on these below]. However, the call to
337isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). 423isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). So,
338So, we go ahead and clear PG_mlocked up front, as this might be the only chance 424we go ahead and clear PG_mlocked up front, as this might be the only chance we
339we have. If we can successfully isolate the page, we go ahead and 425have. If we can successfully isolate the page, we go ahead and
340try_to_munlock(), which will restore the PG_mlocked flag and update the zone 426try_to_munlock(), which will restore the PG_mlocked flag and update the zone
341page statistics if it finds another vma holding the page mlocked. If we fail 427page statistics if it finds another VMA holding the page mlocked. If we fail
342to isolate the page, we'll have left a potentially mlocked page on the LRU. 428to isolate the page, we'll have left a potentially mlocked page on the LRU.
343This is fine, because we'll catch it later when/if vmscan tries to reclaim the 429This is fine, because we'll catch it later if and when vmscan tries to reclaim
344page. This should be relatively rare. 430the page. This should be relatively rare.
345 431
346Mlocked Pages: Migrating Them... 432
347 433MIGRATING MLOCKED PAGES
348A page that is being migrated has been isolated from the lru lists and is 434-----------------------
349held locked across unmapping of the page, updating the page's mapping 435
350[address_space] entry and copying the contents and state, until the 436A page that is being migrated has been isolated from the LRU lists and is held
351page table entry has been replaced with an entry that refers to the new 437locked across unmapping of the page, updating the page's address space entry
352page. Linux supports migration of mlocked pages and other unevictable 438and copying the contents and state, until the page table entry has been
353pages. This involves simply moving the PageMlocked and PageUnevictable states 439replaced with an entry that refers to the new page. Linux supports migration
354from the old page to the new page. 440of mlocked pages and other unevictable pages. This involves simply moving the
355 441PG_mlocked and PG_unevictable states from the old page to the new page.
356Note that page migration can race with mlocking or munlocking of the same 442
357page. This has been discussed from the mlock/munlock perspective in the 443Note that page migration can race with mlocking or munlocking of the same page.
358respective sections above. Both processes [migration, m[un]locking], hold 444This has been discussed from the mlock/munlock perspective in the respective
359the page locked. This provides the first level of synchronization. Page 445sections above. Both processes (migration and m[un]locking) hold the page
360migration zeros out the page_mapping of the old page before unlocking it, 446locked. This provides the first level of synchronization. Page migration
361so m[un]lock can skip these pages by testing the page mapping under page 447zeros out the page_mapping of the old page before unlocking it, so m[un]lock
362lock. 448can skip these pages by testing the page mapping under page lock.
363 449
364When completing page migration, we place the new and old pages back onto the 450To complete page migration, we place the new and old pages back onto the LRU
365lru after dropping the page lock. The "unneeded" page--old page on success, 451after dropping the page lock. The "unneeded" page - old page on success, new
366new page on failure--will be freed when the reference count held by the 452page on failure - will be freed when the reference count held by the migration
367migration process is released. To ensure that we don't strand pages on the 453process is released. To ensure that we don't strand pages on the unevictable
368unevictable list because of a race between munlock and migration, page 454list because of a race between munlock and migration, page migration uses the
369migration uses the putback_lru_page() function to add migrated pages back to 455putback_lru_page() function to add migrated pages back to the LRU.
370the lru. 456
371 457
372 458mmap(MAP_LOCKED) SYSTEM CALL HANDLING
373Mlocked Pages: mmap(MAP_LOCKED) System Call Handling 459-------------------------------------
374 460
375In addition the the mlock()/mlockall() system calls, an application can request 461In addition to the mlock()/mlockall() system calls, an application can request
376that a region of memory be mlocked using the MAP_LOCKED flag with the mmap() 462that a region of memory be mlocked by supplying the MAP_LOCKED flag to the mmap()
377call. Furthermore, any mmap() call or brk() call that expands the heap by a 463call. Furthermore, any mmap() call or brk() call that expands the heap by a
378task that has previously called mlockall() with the MCL_FUTURE flag will result 464task that has previously called mlockall() with the MCL_FUTURE flag will result
379in the newly mapped memory being mlocked. Before the unevictable/mlock changes, 465in the newly mapped memory being mlocked. Before the unevictable/mlock
380the kernel simply called make_pages_present() to allocate pages and populate 466changes, the kernel simply called make_pages_present() to allocate pages and
381the page table. 467populate the page table.
382 468
383To mlock a range of memory under the unevictable/mlock infrastructure, the 469To mlock a range of memory under the unevictable/mlock infrastructure, the
384mmap() handler and task address space expansion functions call 470mmap() handler and task address space expansion functions call
385mlock_vma_pages_range() specifying the vma and the address range to mlock. 471mlock_vma_pages_range() specifying the VMA and the address range to mlock.
386mlock_vma_pages_range() filters vmas like mlock_fixup(), as described above in 472mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in
387"Mlocked Pages: Filtering Vmas". It will clear the VM_LOCKED flag, which will 473"Filtering Special VMAs". It will clear the VM_LOCKED flag, which will have
388have already been set by the caller, in filtered vmas. Thus these vma's need 474already been set by the caller, in filtered VMAs. Thus these VMAs need not be
389not be visited for munlock when the region is unmapped. 475visited for munlock when the region is unmapped.
390 476
391For "normal" vmas, mlock_vma_pages_range() calls __mlock_vma_pages_range() to 477For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to
392fault/allocate the pages and mlock them. Again, like mlock_fixup(), 478fault/allocate the pages and mlock them. Again, like mlock_fixup(),
393mlock_vma_pages_range() downgrades the mmap semaphore to read mode before 479mlock_vma_pages_range() downgrades the mmap semaphore to read mode before
394attempting to fault/allocate and mlock the pages; and "upgrades" the semaphore 480attempting to fault/allocate and mlock the pages and "upgrades" the semaphore
395back to write mode before returning. 481back to write mode before returning.
396 482
397The callers of mlock_vma_pages_range() will have already added the memory 483The callers of mlock_vma_pages_range() will have already added the memory range
398range to be mlocked to the task's "locked_vm". To account for filtered vmas, 484to be mlocked to the task's "locked_vm". To account for filtered VMAs,
399mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the 485mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the
400callers then subtract a non-negative return value from the task's locked_vm. 486callers then subtract a non-negative return value from the task's locked_vm. A
401A negative return value represent an error--for example, from get_user_pages() 487negative return value represents an error - for example, from get_user_pages()
402attempting to fault in a vma with PROT_NONE access. In this case, we leave 488attempting to fault in a VMA with PROT_NONE access. In this case, we leave the
403the memory range accounted as locked_vm, as the protections could be changed 489memory range accounted as locked_vm, as the protections could be changed later
404later and pages allocated into that region. 490and pages allocated into that region.
405 491
406 492
407Mlocked Pages: munmap()/exit()/exec() System Call Handling 493munmap()/exit()/exec() SYSTEM CALL HANDLING
494-------------------------------------------
408 495
409When unmapping an mlocked region of memory, whether by an explicit call to 496When unmapping an mlocked region of memory, whether by an explicit call to
410munmap() or via an internal unmap from exit() or exec() processing, we must 497munmap() or via an internal unmap from exit() or exec() processing, we must
411munlock the pages if we're removing the last VM_LOCKED vma that maps the pages. 498munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages.
412Before the unevictable/mlock changes, mlocking did not mark the pages in any 499Before the unevictable/mlock changes, mlocking did not mark the pages in any
413way, so unmapping them required no processing. 500way, so unmapping them required no processing.
414 501
415To munlock a range of memory under the unevictable/mlock infrastructure, the 502To munlock a range of memory under the unevictable/mlock infrastructure, the
416munmap() hander and task address space tear down function call 503munmap() handler and task address space tear down function call
417munlock_vma_pages_all(). The name reflects the observation that one always 504munlock_vma_pages_all(). The name reflects the observation that one always
418specifies the entire vma range when munlock()ing during unmap of a region. 505specifies the entire VMA range when munlock()ing during unmap of a region.
419Because of the vma filtering when mlocking() regions, only "normal" vmas that 506Because of the VMA filtering when mlocking() regions, only "normal" VMAs that
420actually contain mlocked pages will be passed to munlock_vma_pages_all(). 507actually contain mlocked pages will be passed to munlock_vma_pages_all().
421 508
422munlock_vma_pages_all() clears the VM_LOCKED vma flag and, like mlock_fixup() 509munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup()
423for the munlock case, calls __munlock_vma_pages_range() to walk the page table 510for the munlock case, calls __munlock_vma_pages_range() to walk the page table
424for the vma's memory range and munlock_vma_page() each resident page mapped by 511for the VMA's memory range and munlock_vma_page() each resident page mapped by
425the vma. This effectively munlocks the page, only if this is the last 512the VMA. This effectively munlocks the page, only if this is the last
426VM_LOCKED vma that maps the page. 513VM_LOCKED VMA that maps the page.
427
428 514
429Mlocked Page: try_to_unmap()
430 515
431[Note: the code changes represented by this section are really quite small 516try_to_unmap()
432compared to the text to describe what happening and why, and to discuss the 517--------------
433implications.]
434 518
435Pages can, of course, be mapped into multiple vmas. Some of these vmas may 519Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may
436have VM_LOCKED flag set. It is possible for a page mapped into one or more 520have the VM_LOCKED flag set. It is possible for a page mapped into one or more
437VM_LOCKED vmas not to have the PG_mlocked flag set and therefore reside on one 521VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one
438of the active or inactive LRU lists. This could happen if, for example, a 522of the active or inactive LRU lists. This could happen if, for example, a task
439task in the process of munlock()ing the page could not isolate the page from 523in the process of munlocking the page could not isolate the page from the LRU.
440the LRU. As a result, vmscan/shrink_page_list() might encounter such a page 524As a result, vmscan/shrink_page_list() might encounter such a page as described
441as described in "Unevictable Pages and Vmscan [shrink_*_list()]". To 525in section "vmscan's handling of unevictable pages". To handle this situation,
442handle this situation, try_to_unmap() has been enhanced to check for VM_LOCKED 526try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse
443vmas while it is walking a page's reverse map. 527map.
444 528
445try_to_unmap() is always called, by either vmscan for reclaim or for page 529try_to_unmap() is always called, by either vmscan for reclaim or for page
446migration, with the argument page locked and isolated from the LRU. BUG_ON() 530migration, with the argument page locked and isolated from the LRU. Separate
447assertions enforce this requirement. Separate functions handle anonymous and 531functions handle anonymous and mapped file pages, as these types of pages have
448mapped file pages, as these types of pages have different reverse map 532different reverse map mechanisms.
449mechanisms. 533
450 534 (*) try_to_unmap_anon()
451 try_to_unmap_anon() 535
452 536 To unmap anonymous pages, each VMA in the list anchored in the anon_vma
453To unmap anonymous pages, each vma in the list anchored in the anon_vma must be 537 must be visited - at least until a VM_LOCKED VMA is encountered. If the
454visited--at least until a VM_LOCKED vma is encountered. If the page is being 538 page is being unmapped for migration, VM_LOCKED VMAs do not stop the
455unmapped for migration, VM_LOCKED vmas do not stop the process because mlocked 539 process because mlocked pages are migratable. However, for reclaim, if
456pages are migratable. However, for reclaim, if the page is mapped into a 540 the page is mapped into a VM_LOCKED VMA, the scan stops.
457VM_LOCKED vma, the scan stops. try_to_unmap() attempts to acquire the mmap 541
458semphore of the mm_struct to which the vma belongs in read mode. If this is 542 try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of
459successful, try_to_unmap() will mlock the page via mlock_vma_page()--we 543 the mm_struct to which the VMA belongs. If this is successful, it will
460wouldn't have gotten to try_to_unmap() if the page were already mlocked--and 544 mlock the page via mlock_vma_page() - we wouldn't have gotten to
461will return SWAP_MLOCK, indicating that the page is unevictable. If the 545 try_to_unmap_anon() if the page were already mlocked - and will return
462mmap semaphore cannot be acquired, we are not sure whether the page is really 546 SWAP_MLOCK, indicating that the page is unevictable.
463unevictable or not. In this case, try_to_unmap() will return SWAP_AGAIN. 547
464 548 If the mmap semaphore cannot be acquired, we are not sure whether the page
465 try_to_unmap_file() -- linear mappings 549 is really unevictable or not. In this case, try_to_unmap_anon() will
466 550 return SWAP_AGAIN.
467Unmapping of a mapped file page works the same, except that the scan visits 551
468all vmas that maps the page's index/page offset in the page's mapping's 552 (*) try_to_unmap_file() - linear mappings
469reverse map priority search tree. It must also visit each vma in the page's 553
470mapping's non-linear list, if the list is non-empty. As for anonymous pages, 554 Unmapping of a mapped file page works the same as for anonymous mappings,
471on encountering a VM_LOCKED vma for a mapped file page, try_to_unmap() will 555 except that the scan visits all VMAs that map the page's index/page offset
472attempt to acquire the associated mm_struct's mmap semaphore to mlock the page, 556 in the page's mapping's reverse map priority search tree. It also visits
473returning SWAP_MLOCK if this is successful, and SWAP_AGAIN, if not. 557 each VMA in the page's mapping's non-linear list, if the list is
474 558 non-empty.
475 try_to_unmap_file() -- non-linear mappings 559
476 560 As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file
477If a page's mapping contains a non-empty non-linear mapping vma list, then 561 page, try_to_unmap_file() will attempt to acquire the associated
478try_to_un{map|lock}() must also visit each vma in that list to determine 562 mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this
479whether the page is mapped in a VM_LOCKED vma. Again, the scan must visit 563 is successful, and SWAP_AGAIN, if not.
480all vmas in the non-linear list to ensure that the page is not/should not be 564
481mlocked. If a VM_LOCKED vma is found in the list, the scan could terminate. 565 (*) try_to_unmap_file() - non-linear mappings
482However, there is no easy way to determine whether the page is actually mapped 566
483in a given vma--either for unmapping or testing whether the VM_LOCKED vma 567 If a page's mapping contains a non-empty non-linear mapping VMA list, then
484actually pins the page. 568 try_to_un{map|lock}() must also visit each VMA in that list to determine
485 569 whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit
486So, try_to_unmap_file() handles non-linear mappings by scanning a certain 570 all VMAs in the non-linear list to ensure that the page is not/should not
487number of pages--a "cluster"--in each non-linear vma associated with the page's 571 be mlocked.
488mapping, for each file mapped page that vmscan tries to unmap. If this happens 572
489to unmap the page we're trying to unmap, try_to_unmap() will notice this on 573 If a VM_LOCKED VMA is found in the list, the scan could terminate.
490return--(page_mapcount(page) == 0)--and return SWAP_SUCCESS. Otherwise, it 574 However, there is no easy way to determine whether the page is actually
491will return SWAP_AGAIN, causing vmscan to recirculate this page. We take 575 mapped in a given VMA - either for unmapping or testing whether the
492advantage of the cluster scan in try_to_unmap_cluster() as follows: 576 VM_LOCKED VMA actually pins the page.
493 577
494For each non-linear vma, try_to_unmap_cluster() attempts to acquire the mmap 578 try_to_unmap_file() handles non-linear mappings by scanning a certain
495semaphore of the associated mm_struct for read without blocking. If this 579 number of pages - a "cluster" - in each non-linear VMA associated with the
496attempt is successful and the vma is VM_LOCKED, try_to_unmap_cluster() will 580 page's mapping, for each file mapped page that vmscan tries to unmap. If
497retain the mmap semaphore for the scan; otherwise it drops it here. Then, 581 this happens to unmap the page we're trying to unmap, try_to_unmap() will
498for each page in the cluster, if we're holding the mmap semaphore for a locked 582 notice this on return (page_mapcount(page) will be 0) and return
499vma, try_to_unmap_cluster() calls mlock_vma_page() to mlock the page. This 583 SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to
500call is a no-op if the page is already locked, but will mlock any pages in 584 recirculate this page. We take advantage of the cluster scan in
501the non-linear mapping that happen to be unlocked. If one of the pages so 585 try_to_unmap_cluster() as follows:
502mlocked is the page passed in to try_to_unmap(), try_to_unmap_cluster() will 586
503return SWAP_MLOCK, rather than the default SWAP_AGAIN. This will allow vmscan 587 For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the
504to cull the page, rather than recirculating it on the inactive list. Again, 588 mmap semaphore of the associated mm_struct for read without blocking.
505if try_to_unmap_cluster() cannot acquire the vma's mmap sem, it returns 589
506SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED vma, but 590 If this attempt is successful and the VMA is VM_LOCKED,
507couldn't be mlocked. 591 try_to_unmap_cluster() will retain the mmap semaphore for the scan;
508 592 otherwise it drops it here.
509 593
510Mlocked pages: try_to_munlock() Reverse Map Scan 594 Then, for each page in the cluster, if we're holding the mmap semaphore
511 595 for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to
512TODO/FIXME: a better name might be page_mlocked()--analogous to the 596 mlock the page. This call is a no-op if the page is already locked,
513page_referenced() reverse map walker. 597 but will mlock any pages in the non-linear mapping that happen to be
514 598 unlocked.
515When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() 599
516System Call Handling" above--tries to munlock a page, it needs to 600 If one of the pages so mlocked is the page passed in to try_to_unmap(),
517determine whether or not the page is mapped by any VM_LOCKED vma, without 601 try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default
518actually attempting to unmap all ptes from the page. For this purpose, the 602 SWAP_AGAIN. This will allow vmscan to cull the page, rather than
519unevictable/mlock infrastructure introduced a variant of try_to_unmap() called 603 recirculating it on the inactive list.
520try_to_munlock(). 604
605 Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it
606 returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED
607 VMA, but couldn't be mlocked.
608
609
610try_to_munlock() REVERSE MAP SCAN
611---------------------------------
612
613 [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the
614 page_referenced() reverse map walker.
615
616When munlock_vma_page() [see section "munlock()/munlockall() System Call
617Handling" above] tries to munlock a page, it needs to determine whether or not
618the page is mapped by any VM_LOCKED VMA without actually attempting to unmap
619all PTEs from the page. For this purpose, the unevictable/mlock infrastructure
620introduced a variant of try_to_unmap() called try_to_munlock().
521 621
522try_to_munlock() calls the same functions as try_to_unmap() for anonymous and 622try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
523mapped file pages with an additional argument specifying unlock versus unmap 623mapped file pages with an additional argument specifying unlock versus unmap
524processing. Again, these functions walk the respective reverse maps looking 624processing. Again, these functions walk the respective reverse maps looking
525for VM_LOCKED vmas. When such a vma is found for anonymous pages and file 625for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file
526pages mapped in linear VMAs, as in the try_to_unmap() case, the functions 626pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
527attempt to acquire the associated mmap semaphore, mlock the page via 627attempt to acquire the associated mmap semaphore, mlock the page via
528mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the 628mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the
529pre-clearing of the page's PG_mlocked done by munlock_vma_page. 629pre-clearing of the page's PG_mlocked done by munlock_vma_page.
530 630
531If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap 631If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap
532semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() 632semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to
533to recycle the page on the inactive list and hope that it has better luck 633recycle the page on the inactive list and hope that it has better luck with the
534with the page next time. 634page next time.
535 635
536For file pages mapped into non-linear vmas, the try_to_munlock() logic works 636For file pages mapped into non-linear VMAs, the try_to_munlock() logic works
537slightly differently. On encountering a VM_LOCKED non-linear vma that might 637slightly differently. On encountering a VM_LOCKED non-linear VMA that might
538map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking 638map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the
539the page. munlock_vma_page() will just leave the page unlocked and let 639page. munlock_vma_page() will just leave the page unlocked and let vmscan deal
540vmscan deal with it--the usual fallback position. 640with it - the usual fallback position.
541 641
542Note that try_to_munlock()'s reverse map walk must visit every vma in a pages' 642Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's
543reverse map to determine that a page is NOT mapped into any VM_LOCKED vma. 643reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA.
544However, the scan can terminate when it encounters a VM_LOCKED vma and can 644However, the scan can terminate when it encounters a VM_LOCKED VMA and can
545successfully acquire the vma's mmap semaphore for read and mlock the page. 645successfully acquire the VMA's mmap semaphore for read and mlock the page.
546Although try_to_munlock() can be called many [very many!] times when 646Although try_to_munlock() might be called a great many times when munlocking a
547munlock()ing a large region or tearing down a large address space that has been 647large region or tearing down a large address space that has been mlocked via
548mlocked via mlockall(), overall this is a fairly rare event. 648mlockall(), overall this is a fairly rare event.
549 649
550Mlocked Page: Page Reclaim in shrink_*_list() 650
551 651PAGE RECLAIM IN shrink_*_list()
552shrink_active_list() culls any obviously unevictable pages--i.e., 652-------------------------------
553!page_evictable(page, NULL)--diverting these to the unevictable lru 653
554list. However, shrink_active_list() only sees unevictable pages that 654shrink_active_list() culls any obviously unevictable pages - i.e.
555made it onto the active/inactive lru lists. Note that these pages do not 655!page_evictable(page, NULL) - diverting these to the unevictable list.
556have PageUnevictable set--otherwise, they would be on the unevictable list and 656However, shrink_active_list() only sees unevictable pages that made it onto the
557shrink_active_list would never see them. 657active/inactive lru lists. Note that these pages do not have PageUnevictable
658set - otherwise they would be on the unevictable list and shrink_active_list
659would never see them.
558 660
559Some examples of these unevictable pages on the LRU lists are: 661Some examples of these unevictable pages on the LRU lists are:
560 662
5611) ramfs pages that have been placed on the lru lists when first allocated. 663 (1) ramfs pages that have been placed on the LRU lists when first allocated.
664
665 (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to
666 allocate or fault in the pages in the shared memory region. This happens
667 when an application accesses the page the first time after SHM_LOCK'ing
668 the segment.
562 669
5632) SHM_LOCKed shared memory pages. shmctl(SHM_LOCK) does not attempt to 670 (3) mlocked pages that could not be isolated from the LRU and moved to the
564 allocate or fault in the pages in the shared memory region. This happens 671 unevictable list in mlock_vma_page().
565 when an application accesses the page the first time after SHM_LOCKing
566 the segment.
567 672
5683) Mlocked pages that could not be isolated from the lru and moved to the 673 (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't
569 unevictable list in mlock_vma_page(). 674 acquire the VMA's mmap semaphore to test the flags and set PageMlocked.
675 munlock_vma_page() was forced to let the page back on to the normal LRU
676 list for vmscan to handle.
570 677
5713) Pages mapped into multiple VM_LOCKED vmas, but try_to_munlock() couldn't 678shrink_inactive_list() also diverts any unevictable pages that it finds on the
572 acquire the vma's mmap semaphore to test the flags and set PageMlocked. 679inactive lists to the appropriate zone's unevictable list.
573 munlock_vma_page() was forced to let the page back on to the normal
574 LRU list for vmscan to handle.
575 680
576shrink_inactive_list() also culls any unevictable pages that it finds on 681shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
577the inactive lists, again diverting them to the appropriate zone's unevictable 682after shrink_active_list() had moved them to the inactive list, or pages mapped
578lru list. shrink_inactive_list() should only see SHM_LOCKed pages that became 683into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to
579SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or 684recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter,
580pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from 685but will pass on to shrink_page_list().
581the lru to recheck via try_to_munlock(). shrink_inactive_list() won't notice
582the latter, but will pass on to shrink_page_list().
583 686
584shrink_page_list() again culls obviously unevictable pages that it could 687shrink_page_list() again culls obviously unevictable pages that it could
585encounter for similar reasons to shrink_inactive_list(). Pages mapped into 688encounter for similar reasons to shrink_inactive_list(). Pages mapped into
586VM_LOCKED vmas but without PG_mlocked set will make it all the way to 689VM_LOCKED VMAs but without PG_mlocked set will make it all the way to
587try_to_unmap(). shrink_page_list() will divert them to the unevictable list 690try_to_unmap(). shrink_page_list() will divert them to the unevictable list
588when try_to_unmap() returns SWAP_MLOCK, as discussed above. 691when try_to_unmap() returns SWAP_MLOCK, as discussed above.