diff options
Diffstat (limited to 'Documentation')
29 files changed, 1207 insertions, 555 deletions
diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd index bf9c16b64c34..cf11736acb76 100644 --- a/Documentation/ABI/testing/debugfs-pktcdvd +++ b/Documentation/ABI/testing/debugfs-pktcdvd | |||
@@ -1,4 +1,4 @@ | |||
1 | What: /debug/pktcdvd/pktcdvd[0-7] | 1 | What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7] |
2 | Date: Oct. 2006 | 2 | Date: Oct. 2006 |
3 | KernelVersion: 2.6.20 | 3 | KernelVersion: 2.6.20 |
4 | Contact: Thomas Maier <balagi@justmail.de> | 4 | Contact: Thomas Maier <balagi@justmail.de> |
@@ -10,10 +10,10 @@ debugfs interface | |||
10 | The pktcdvd module (packet writing driver) creates | 10 | The pktcdvd module (packet writing driver) creates |
11 | these files in debugfs: | 11 | these files in debugfs: |
12 | 12 | ||
13 | /debug/pktcdvd/pktcdvd[0-7]/ | 13 | /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/ |
14 | info (0444) Lots of driver statistics and infos. | 14 | info (0444) Lots of driver statistics and infos. |
15 | 15 | ||
16 | Example: | 16 | Example: |
17 | ------- | 17 | ------- |
18 | 18 | ||
19 | cat /debug/pktcdvd/pktcdvd0/info | 19 | cat /sys/kernel/debug/pktcdvd/pktcdvd0/info |
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index a3a83d38f96f..8918a32c6b3a 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile | |||
@@ -31,7 +31,7 @@ PS_METHOD = $(prefer-db2x) | |||
31 | 31 | ||
32 | ### | 32 | ### |
33 | # The targets that may be used. | 33 | # The targets that may be used. |
34 | PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs | 34 | PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs cleandocs |
35 | 35 | ||
36 | BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) | 36 | BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) |
37 | xmldocs: $(BOOKS) | 37 | xmldocs: $(BOOKS) |
@@ -213,11 +213,12 @@ silent_gen_xml = : | |||
213 | dochelp: | 213 | dochelp: |
214 | @echo ' Linux kernel internal documentation in different formats:' | 214 | @echo ' Linux kernel internal documentation in different formats:' |
215 | @echo ' htmldocs - HTML' | 215 | @echo ' htmldocs - HTML' |
216 | @echo ' installmandocs - install man pages generated by mandocs' | ||
217 | @echo ' mandocs - man pages' | ||
218 | @echo ' pdfdocs - PDF' | 216 | @echo ' pdfdocs - PDF' |
219 | @echo ' psdocs - Postscript' | 217 | @echo ' psdocs - Postscript' |
220 | @echo ' xmldocs - XML DocBook' | 218 | @echo ' xmldocs - XML DocBook' |
219 | @echo ' mandocs - man pages' | ||
220 | @echo ' installmandocs - install man pages generated by mandocs' | ||
221 | @echo ' cleandocs - clean all generated DocBook files' | ||
221 | 222 | ||
222 | ### | 223 | ### |
223 | # Temporary files left by various tools | 224 | # Temporary files left by various tools |
@@ -235,6 +236,10 @@ clean-files := $(DOCBOOKS) \ | |||
235 | 236 | ||
236 | clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man | 237 | clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man |
237 | 238 | ||
239 | cleandocs: | ||
240 | $(Q)rm -f $(call objectify, $(clean-files)) | ||
241 | $(Q)rm -rf $(call objectify, $(clean-dirs)) | ||
242 | |||
238 | # Declare the contents of the .PHONY variable as phony. We keep that | 243 | # Declare the contents of the .PHONY variable as phony. We keep that |
239 | # information in a variable so we can use it in if_changed and friends. | 244 | # information in a variable so we can use it in if_changed and friends. |
240 | 245 | ||
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index ecad6ee75705..6fab97ea7e6b 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt | |||
@@ -1040,23 +1040,21 @@ Front merges are handled by the binary trees in AS and deadline schedulers. | |||
1040 | iii. Plugging the queue to batch requests in anticipation of opportunities for | 1040 | iii. Plugging the queue to batch requests in anticipation of opportunities for |
1041 | merge/sort optimizations | 1041 | merge/sort optimizations |
1042 | 1042 | ||
1043 | This is just the same as in 2.4 so far, though per-device unplugging | ||
1044 | support is anticipated for 2.5. Also with a priority-based i/o scheduler, | ||
1045 | such decisions could be based on request priorities. | ||
1046 | |||
1047 | Plugging is an approach that the current i/o scheduling algorithm resorts to so | 1043 | Plugging is an approach that the current i/o scheduling algorithm resorts to so |
1048 | that it collects up enough requests in the queue to be able to take | 1044 | that it collects up enough requests in the queue to be able to take |
1049 | advantage of the sorting/merging logic in the elevator. If the | 1045 | advantage of the sorting/merging logic in the elevator. If the |
1050 | queue is empty when a request comes in, then it plugs the request queue | 1046 | queue is empty when a request comes in, then it plugs the request queue |
1051 | (sort of like plugging the bottom of a vessel to get fluid to build up) | 1047 | (sort of like plugging the bath tub of a vessel to get fluid to build up) |
1052 | till it fills up with a few more requests, before starting to service | 1048 | till it fills up with a few more requests, before starting to service |
1053 | the requests. This provides an opportunity to merge/sort the requests before | 1049 | the requests. This provides an opportunity to merge/sort the requests before |
1054 | passing them down to the device. There are various conditions when the queue is | 1050 | passing them down to the device. There are various conditions when the queue is |
1055 | unplugged (to open up the flow again), either through a scheduled task or | 1051 | unplugged (to open up the flow again), either through a scheduled task or |
1056 | could be on demand. For example wait_on_buffer sets the unplugging going | 1052 | could be on demand. For example wait_on_buffer sets the unplugging going |
1057 | (by running tq_disk) so the read gets satisfied soon. So in the read case, | 1053 | through sync_buffer() running blk_run_address_space(mapping). Or the caller |
1058 | the queue gets explicitly unplugged as part of waiting for completion, | 1054 | can do it explicitly through blk_unplug(bdev). So in the read case, |
1059 | in fact all queues get unplugged as a side-effect. | 1055 | the queue gets explicitly unplugged as part of waiting for completion on that |
1056 | buffer. For page driven IO, the address space ->sync_page() takes care of | ||
1057 | doing the blk_run_address_space(). | ||
1060 | 1058 | ||
1061 | Aside: | 1059 | Aside: |
1062 | This is kind of controversial territory, as it's not clear if plugging is | 1060 | This is kind of controversial territory, as it's not clear if plugging is |
@@ -1067,11 +1065,6 @@ Aside: | |||
1067 | multi-page bios being queued in one shot, we may not need to wait to merge | 1065 | multi-page bios being queued in one shot, we may not need to wait to merge |
1068 | a big request from the broken up pieces coming by. | 1066 | a big request from the broken up pieces coming by. |
1069 | 1067 | ||
1070 | Per-queue granularity unplugging (still a Todo) may help reduce some of the | ||
1071 | concerns with just a single tq_disk flush approach. Something like | ||
1072 | blk_kick_queue() to unplug a specific queue (right away ?) | ||
1073 | or optionally, all queues, is in the plan. | ||
1074 | |||
1075 | 4.4 I/O contexts | 1068 | 4.4 I/O contexts |
1076 | I/O contexts provide a dynamically allocated per process data area. They may | 1069 | I/O contexts provide a dynamically allocated per process data area. They may |
1077 | be used in I/O schedulers, and in the block layer (could be used for IO statis, | 1070 | be used in I/O schedulers, and in the block layer (could be used for IO statis, |
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt index bb775fbe43d7..8b930946c52a 100644 --- a/Documentation/cgroups/cpuacct.txt +++ b/Documentation/cgroups/cpuacct.txt | |||
@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell | |||
30 | process (bash) into it. CPU time consumed by this bash and its children | 30 | process (bash) into it. CPU time consumed by this bash and its children |
31 | can be obtained from g1/cpuacct.usage and the same is accumulated in | 31 | can be obtained from g1/cpuacct.usage and the same is accumulated in |
32 | /cgroups/cpuacct.usage also. | 32 | /cgroups/cpuacct.usage also. |
33 | |||
34 | cpuacct.stat file lists a few statistics which further divide the | ||
35 | CPU time obtained by the cgroup into user and system times. Currently | ||
36 | the following statistics are supported: | ||
37 | |||
38 | user: Time spent by tasks of the cgroup in user mode. | ||
39 | system: Time spent by tasks of the cgroup in kernel mode. | ||
40 | |||
41 | user and system are in USER_HZ unit. | ||
42 | |||
43 | cpuacct controller uses percpu_counter interface to collect user and | ||
44 | system times. This has two side effects: | ||
45 | |||
46 | - It is theoretically possible to see wrong values for user and system times. | ||
47 | This is because percpu_counter_read() on 32bit systems isn't safe | ||
48 | against concurrent writes. | ||
49 | - It is possible to see slightly outdated values for user and system times | ||
50 | due to the batch processing nature of percpu_counter. | ||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index a98a7fe7aabb..1a608877b14e 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -6,15 +6,14 @@ used here with the memory controller that is used in hardware. | |||
6 | 6 | ||
7 | Salient features | 7 | Salient features |
8 | 8 | ||
9 | a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages | 9 | a. Enable control of Anonymous, Page Cache (mapped and unmapped) and |
10 | Swap Cache memory pages. | ||
10 | b. The infrastructure allows easy addition of other types of memory to control | 11 | b. The infrastructure allows easy addition of other types of memory to control |
11 | c. Provides *zero overhead* for non memory controller users | 12 | c. Provides *zero overhead* for non memory controller users |
12 | d. Provides a double LRU: global memory pressure causes reclaim from the | 13 | d. Provides a double LRU: global memory pressure causes reclaim from the |
13 | global LRU; a cgroup on hitting a limit, reclaims from the per | 14 | global LRU; a cgroup on hitting a limit, reclaims from the per |
14 | cgroup LRU | 15 | cgroup LRU |
15 | 16 | ||
16 | NOTE: Swap Cache (unmapped) is not accounted now. | ||
17 | |||
18 | Benefits and Purpose of the memory controller | 17 | Benefits and Purpose of the memory controller |
19 | 18 | ||
20 | The memory controller isolates the memory behaviour of a group of tasks | 19 | The memory controller isolates the memory behaviour of a group of tasks |
@@ -290,34 +289,44 @@ will be charged as a new owner of it. | |||
290 | moved to the parent. If you want to avoid that, force_empty will be useful. | 289 | moved to the parent. If you want to avoid that, force_empty will be useful. |
291 | 290 | ||
292 | 5.2 stat file | 291 | 5.2 stat file |
293 | memory.stat file includes following statistics (now) | 292 | |
294 | cache - # of pages from page-cache and shmem. | 293 | memory.stat file includes following statistics |
295 | rss - # of pages from anonymous memory. | 294 | |
296 | pgpgin - # of event of charging | 295 | cache - # of bytes of page cache memory. |
297 | pgpgout - # of event of uncharging | 296 | rss - # of bytes of anonymous and swap cache memory. |
298 | active_anon - # of pages on active lru of anon, shmem. | 297 | pgpgin - # of pages paged in (equivalent to # of charging events). |
299 | inactive_anon - # of pages on active lru of anon, shmem | 298 | pgpgout - # of pages paged out (equivalent to # of uncharging events). |
300 | active_file - # of pages on active lru of file-cache | 299 | active_anon - # of bytes of anonymous and swap cache memory on active |
301 | inactive_file - # of pages on inactive lru of file cache | 300 | lru list. |
302 | unevictable - # of pages cannot be reclaimed.(mlocked etc) | 301 | inactive_anon - # of bytes of anonymous memory and swap cache memory on |
303 | 302 | inactive lru list. | |
304 | Below is depend on CONFIG_DEBUG_VM. | 303 | active_file - # of bytes of file-backed memory on active lru list. |
305 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) | 304 | inactive_file - # of bytes of file-backed memory on inactive lru list. |
306 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) | 305 | unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). |
307 | recent_rotated_file - VM internal parameter. (see mm/vmscan.c) | 306 | |
308 | recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) | 307 | The following additional stats are dependent on CONFIG_DEBUG_VM. |
309 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) | 308 | |
310 | 309 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) | |
311 | Memo: | 310 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) |
311 | recent_rotated_file - VM internal parameter. (see mm/vmscan.c) | ||
312 | recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) | ||
313 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) | ||
314 | |||
315 | Memo: | ||
312 | recent_rotated means recent frequency of lru rotation. | 316 | recent_rotated means recent frequency of lru rotation. |
313 | recent_scanned means recent # of scans to lru. | 317 | recent_scanned means recent # of scans to lru. |
314 | shown for easier debugging; please see the code for their meanings. | 318 | shown for easier debugging; please see the code for their meanings. |
315 | 319 | ||
320 | Note: | ||
321 | Only anonymous and swap cache memory is listed as part of 'rss' stat. | ||
322 | This should not be confused with the true 'resident set size' or the | ||
323 | amount of physical memory used by the cgroup. Per-cgroup rss | ||
324 | accounting is not done yet. | ||
316 | 325 | ||
317 | 5.3 swappiness | 326 | 5.3 swappiness |
318 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. | 327 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. |
319 | 328 | ||
320 | Following cgroup's swappiness can't be changed. | 329 | Following cgroups' swappiness can't be changed. |
321 | - root cgroup (uses /proc/sys/vm/swappiness). | 330 | - root cgroup (uses /proc/sys/vm/swappiness). |
322 | - a cgroup which uses hierarchy and it has child cgroup. | 331 | - a cgroup which uses hierarchy and it has child cgroup. |
323 | - a cgroup which uses hierarchy and not the root of hierarchy. | 332 | - a cgroup which uses hierarchy and not the root of hierarchy. |
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt index f196ac1d7d25..95b24d766eab 100644 --- a/Documentation/cgroups/resource_counter.txt +++ b/Documentation/cgroups/resource_counter.txt | |||
@@ -47,13 +47,18 @@ to work with it. | |||
47 | 47 | ||
48 | 2. Basic accounting routines | 48 | 2. Basic accounting routines |
49 | 49 | ||
50 | a. void res_counter_init(struct res_counter *rc) | 50 | a. void res_counter_init(struct res_counter *rc, |
51 | struct res_counter *rc_parent) | ||
51 | 52 | ||
52 | Initializes the resource counter. As usual, should be the first | 53 | Initializes the resource counter. As usual, should be the first |
53 | routine called for a new counter. | 54 | routine called for a new counter. |
54 | 55 | ||
55 | b. int res_counter_charge[_locked] | 56 | The struct res_counter *rc_parent can be used to define a hierarchical |
56 | (struct res_counter *rc, unsigned long val) | 57 | child -> parent relationship directly in the res_counter structure, |
58 | NULL can be used to define no relationship. | ||
59 | |||
60 | c. int res_counter_charge(struct res_counter *rc, unsigned long val, | ||
61 | struct res_counter **limit_fail_at) | ||
57 | 62 | ||
58 | When a resource is about to be allocated it has to be accounted | 63 | When a resource is about to be allocated it has to be accounted |
59 | with the appropriate resource counter (controller should determine | 64 | with the appropriate resource counter (controller should determine |
@@ -67,15 +72,25 @@ to work with it. | |||
67 | * if the charging is performed first, then it should be uncharged | 72 | * if the charging is performed first, then it should be uncharged |
68 | on error path (if the one is called). | 73 | on error path (if the one is called). |
69 | 74 | ||
70 | c. void res_counter_uncharge[_locked] | 75 | If the charging fails and a hierarchical dependency exists, the |
76 | limit_fail_at parameter is set to the particular res_counter element | ||
77 | where the charging failed. | ||
78 | |||
79 | d. int res_counter_charge_locked | ||
80 | (struct res_counter *rc, unsigned long val) | ||
81 | |||
82 | The same as res_counter_charge(), but it must not acquire/release the | ||
83 | res_counter->lock internally (it must be called with res_counter->lock | ||
84 | held). | ||
85 | |||
86 | e. void res_counter_uncharge[_locked] | ||
71 | (struct res_counter *rc, unsigned long val) | 87 | (struct res_counter *rc, unsigned long val) |
72 | 88 | ||
73 | When a resource is released (freed) it should be de-accounted | 89 | When a resource is released (freed) it should be de-accounted |
74 | from the resource counter it was accounted to. This is called | 90 | from the resource counter it was accounted to. This is called |
75 | "uncharging". | 91 | "uncharging". |
76 | 92 | ||
77 | The _locked routines imply that the res_counter->lock is taken. | 93 | The _locked routines imply that the res_counter->lock is taken. |
78 | |||
79 | 94 | ||
80 | 2.1 Other accounting routines | 95 | 2.1 Other accounting routines |
81 | 96 | ||
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt index 83009fdcbbc8..2e2c2ea90ceb 100644 --- a/Documentation/driver-model/platform.txt +++ b/Documentation/driver-model/platform.txt | |||
@@ -169,3 +169,62 @@ three different ways to find such a match: | |||
169 | be probed later if another device registers. (Which is OK, since | 169 | be probed later if another device registers. (Which is OK, since |
170 | this interface is only for use with non-hotpluggable devices.) | 170 | this interface is only for use with non-hotpluggable devices.) |
171 | 171 | ||
172 | |||
173 | Early Platform Devices and Drivers | ||
174 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
175 | The early platform interfaces provide platform data to platform device | ||
176 | drivers early on during the system boot. The code is built on top of the | ||
177 | early_param() command line parsing and can be executed very early on. | ||
178 | |||
179 | Example: "earlyprintk" class early serial console in 6 steps | ||
180 | |||
181 | 1. Registering early platform device data | ||
182 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
183 | The architecture code registers platform device data using the function | ||
184 | early_platform_add_devices(). In the case of early serial console this | ||
185 | should be hardware configuration for the serial port. Devices registered | ||
186 | at this point will later on be matched against early platform drivers. | ||
187 | |||
188 | 2. Parsing kernel command line | ||
189 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
190 | The architecture code calls parse_early_param() to parse the kernel | ||
191 | command line. This will execute all matching early_param() callbacks. | ||
192 | User specified early platform devices will be registered at this point. | ||
193 | For the early serial console case the user can specify port on the | ||
194 | kernel command line as "earlyprintk=serial.0" where "earlyprintk" is | ||
195 | the class string, "serial" is the name of the platform driver and | ||
196 | 0 is the platform device id. If the id is -1 then the dot and the | ||
197 | id can be omitted. | ||
198 | |||
199 | 3. Installing early platform drivers belonging to a certain class | ||
200 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
201 | The architecture code may optionally force registration of all early | ||
202 | platform drivers belonging to a certain class using the function | ||
203 | early_platform_driver_register_all(). User specified devices from | ||
204 | step 2 have priority over these. This step is omitted by the serial | ||
205 | driver example since the early serial driver code should be disabled | ||
206 | unless the user has specified port on the kernel command line. | ||
207 | |||
208 | 4. Early platform driver registration | ||
209 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
210 | Compiled-in platform drivers making use of early_platform_init() are | ||
211 | automatically registered during step 2 or 3. The serial driver example | ||
212 | should use early_platform_init("earlyprintk", &platform_driver). | ||
213 | |||
214 | 5. Probing of early platform drivers belonging to a certain class | ||
215 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
216 | The architecture code calls early_platform_driver_probe() to match | ||
217 | registered early platform devices associated with a certain class with | ||
218 | registered early platform drivers. Matched devices will get probed(). | ||
219 | This step can be executed at any point during the early boot. As soon | ||
220 | as possible may be good for the serial port case. | ||
221 | |||
222 | 6. Inside the early platform driver probe() | ||
223 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
224 | The driver code needs to take special care during early boot, especially | ||
225 | when it comes to memory allocation and interrupt registration. The code | ||
226 | in the probe() function can use is_early_platform_device() to check if | ||
227 | it is called at early platform device or at the regular platform device | ||
228 | time. The early serial driver performs register_console() at this point. | ||
229 | |||
230 | For further information, see <linux/platform_device.h>. | ||
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 7e2af10e8264..de491a3e2313 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -428,3 +428,12 @@ Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to | |||
428 | After a reasonable transition period, we will remove the legacy | 428 | After a reasonable transition period, we will remove the legacy |
429 | fakephp interface. | 429 | fakephp interface. |
430 | Who: Alex Chiang <achiang@hp.com> | 430 | Who: Alex Chiang <achiang@hp.com> |
431 | |||
432 | --------------------------- | ||
433 | |||
434 | What: i2c-voodoo3 driver | ||
435 | When: October 2009 | ||
436 | Why: Superseded by tdfxfb. I2C/DDC support used to live in a separate | ||
437 | driver but this caused driver conflicts. | ||
438 | Who: Jean Delvare <khali@linux-fr.org> | ||
439 | Krzysztof Helt <krzysztof.h1@wp.pl> | ||
diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt index 6d6db60d567d..dcf833587162 100644 --- a/Documentation/filesystems/pohmelfs/design_notes.txt +++ b/Documentation/filesystems/pohmelfs/design_notes.txt | |||
@@ -56,9 +56,10 @@ workloads and can fully utilize the bandwidth to the servers when doing bulk | |||
56 | data transfers. | 56 | data transfers. |
57 | 57 | ||
58 | POHMELFS clients operate with a working set of servers and are capable of balancing read-only | 58 | POHMELFS clients operate with a working set of servers and are capable of balancing read-only |
59 | operations (like lookups or directory listings) between them. | 59 | operations (like lookups or directory listings) between them according to IO priorities. |
60 | Administrators can add or remove servers from the set at run-time via special commands (described | 60 | Administrators can add or remove servers from the set at run-time via special commands (described |
61 | in Documentation/pohmelfs/info.txt file). Writes are replicated to all servers. | 61 | in Documentation/pohmelfs/info.txt file). Writes are replicated to all servers, which are connected |
62 | with write permission turned on. IO priority and permissions can be changed in run-time. | ||
62 | 63 | ||
63 | POHMELFS is capable of full data channel encryption and/or strong crypto hashing. | 64 | POHMELFS is capable of full data channel encryption and/or strong crypto hashing. |
64 | One can select any kernel supported cipher, encryption mode, hash type and operation mode | 65 | One can select any kernel supported cipher, encryption mode, hash type and operation mode |
diff --git a/Documentation/filesystems/pohmelfs/info.txt b/Documentation/filesystems/pohmelfs/info.txt index 4e3d50157083..db2e41393626 100644 --- a/Documentation/filesystems/pohmelfs/info.txt +++ b/Documentation/filesystems/pohmelfs/info.txt | |||
@@ -1,6 +1,8 @@ | |||
1 | POHMELFS usage information. | 1 | POHMELFS usage information. |
2 | 2 | ||
3 | Mount options: | 3 | Mount options. |
4 | All but index, number of crypto threads and maximum IO size can be changed via remount. | ||
5 | |||
4 | idx=%u | 6 | idx=%u |
5 | Each mountpoint is associated with a special index via this option. | 7 | Each mountpoint is associated with a special index via this option. |
6 | Administrator can add or remove servers from the given index, so all mounts, | 8 | Administrator can add or remove servers from the given index, so all mounts, |
@@ -52,16 +54,27 @@ mcache_timeout=%u | |||
52 | 54 | ||
53 | Usage examples. | 55 | Usage examples. |
54 | 56 | ||
55 | Add (or remove if it already exists) server server1.net:1025 into the working set with index $idx | 57 | Add server server1.net:1025 into the working set with index $idx |
56 | with appropriate hash algorithm and key file and cipher algorithm, mode and key file: | 58 | with appropriate hash algorithm and key file and cipher algorithm, mode and key file: |
57 | $cfg -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key | 59 | $cfg A add -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key |
58 | 60 | ||
59 | Mount filesystem with given index $idx to /mnt mountpoint. | 61 | Mount filesystem with given index $idx to /mnt mountpoint. |
60 | Client will connect to all servers specified in the working set via previous command: | 62 | Client will connect to all servers specified in the working set via previous command: |
61 | mount -t pohmel -o idx=$idx q /mnt | 63 | mount -t pohmel -o idx=$idx q /mnt |
62 | 64 | ||
63 | One can add or remove servers from working set after mounting too. | 65 | Change permissions to read-only (-I 1 option, '-I 2' - write-only, 3 - rw): |
66 | $cfg A modify -a server1.net -p 1025 -i $idx -I 1 | ||
67 | |||
68 | Change IO priority to 123 (node with the highest priority gets read requests). | ||
69 | $cfg A modify -a server1.net -p 1025 -i $idx -P 123 | ||
64 | 70 | ||
71 | One can check current status of all connections in the mountstats file: | ||
72 | # cat /proc/$PID/mountstats | ||
73 | ... | ||
74 | device none mounted on /mnt with fstype pohmel | ||
75 | idx addr(:port) socket_type protocol active priority permissions | ||
76 | 0 server1.net:1026 1 6 1 250 1 | ||
77 | 0 server2.net:1025 1 6 1 123 3 | ||
65 | 78 | ||
66 | Server installation. | 79 | Server installation. |
67 | 80 | ||
diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt index 864ff3283780..6d40f00b358c 100644 --- a/Documentation/infiniband/ipoib.txt +++ b/Documentation/infiniband/ipoib.txt | |||
@@ -24,6 +24,49 @@ Partitions and P_Keys | |||
24 | The P_Key for any interface is given by the "pkey" file, and the | 24 | The P_Key for any interface is given by the "pkey" file, and the |
25 | main interface for a subinterface is in "parent." | 25 | main interface for a subinterface is in "parent." |
26 | 26 | ||
27 | Datagram vs Connected modes | ||
28 | |||
29 | The IPoIB driver supports two modes of operation: datagram and | ||
30 | connected. The mode is set and read through an interface's | ||
31 | /sys/class/net/<intf name>/mode file. | ||
32 | |||
33 | In datagram mode, the IB UD (Unreliable Datagram) transport is used | ||
34 | and so the interface MTU is equal to the IB L2 MTU minus the | ||
35 | IPoIB encapsulation header (4 bytes). For example, in a typical IB | ||
36 | fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes. | ||
37 | |||
38 | In connected mode, the IB RC (Reliable Connected) transport is used. | ||
39 | Connected mode takes advantage of the connected nature of the | ||
40 | IB transport and allows an MTU up to the maximal IP packet size of | ||
41 | 64K, which reduces the number of IP packets needed for handling | ||
42 | large UDP datagrams, TCP segments, etc and increases the performance | ||
43 | for large messages. | ||
44 | |||
45 | In connected mode, the interface's UD QP is still used for multicast | ||
46 | and communication with peers that don't support connected mode. In | ||
47 | this case, RX emulation of ICMP PMTU packets is used to cause the | ||
48 | networking stack to use the smaller UD MTU for these neighbours. | ||
49 | |||
50 | Stateless offloads | ||
51 | |||
52 | If the IB HW supports IPoIB stateless offloads, IPoIB advertises | ||
53 | TCP/IP checksum and/or Large Send (LSO) offloading capability to the | ||
54 | network stack. | ||
55 | |||
56 | Large Receive (LRO) offloading is also implemented and may be turned | ||
57 | on/off using ethtool calls. Currently LRO is supported only for | ||
58 | checksum offload capable devices. | ||
59 | |||
60 | Stateless offloads are supported only in datagram mode. | ||
61 | |||
62 | Interrupt moderation | ||
63 | |||
64 | If the underlying IB device supports CQ event moderation, one can | ||
65 | use ethtool to set interrupt mitigation parameters and thus reduce | ||
66 | the overhead incurred by handling interrupts. The main code path of | ||
67 | IPoIB doesn't use events for TX completion signaling so only RX | ||
68 | moderation is supported. | ||
69 | |||
27 | Debugging Information | 70 | Debugging Information |
28 | 71 | ||
29 | By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set | 72 | By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set |
@@ -55,3 +98,5 @@ References | |||
55 | http://ietf.org/rfc/rfc4391.txt | 98 | http://ietf.org/rfc/rfc4391.txt |
56 | IP over InfiniBand (IPoIB) Architecture (RFC 4392) | 99 | IP over InfiniBand (IPoIB) Architecture (RFC 4392) |
57 | http://ietf.org/rfc/rfc4392.txt | 100 | http://ietf.org/rfc/rfc4392.txt |
101 | IP over InfiniBand: Connected Mode (RFC 4755) | ||
102 | http://ietf.org/rfc/rfc4755.txt | ||
diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt new file mode 100644 index 000000000000..435102a26d96 --- /dev/null +++ b/Documentation/input/rotary-encoder.txt | |||
@@ -0,0 +1,101 @@ | |||
1 | rotary-encoder - a generic driver for GPIO connected devices | ||
2 | Daniel Mack <daniel@caiaq.de>, Feb 2009 | ||
3 | |||
4 | 0. Function | ||
5 | ----------- | ||
6 | |||
7 | Rotary encoders are devices which are connected to the CPU or other | ||
8 | peripherals with two wires. The outputs are phase-shifted by 90 degrees | ||
9 | and by triggering on falling and rising edges, the turn direction can | ||
10 | be determined. | ||
11 | |||
12 | The phase diagram of these two outputs looks like this: | ||
13 | |||
14 | _____ _____ _____ | ||
15 | | | | | | | | ||
16 | Channel A ____| |_____| |_____| |____ | ||
17 | |||
18 | : : : : : : : : : : : : | ||
19 | __ _____ _____ _____ | ||
20 | | | | | | | | | ||
21 | Channel B |_____| |_____| |_____| |__ | ||
22 | |||
23 | : : : : : : : : : : : : | ||
24 | Event a b c d a b c d a b c d | ||
25 | |||
26 | |<-------->| | ||
27 | one step | ||
28 | |||
29 | |||
30 | For more information, please see | ||
31 | http://en.wikipedia.org/wiki/Rotary_encoder | ||
32 | |||
33 | |||
34 | 1. Events / state machine | ||
35 | ------------------------- | ||
36 | |||
37 | a) Rising edge on channel A, channel B in low state | ||
38 | This state is used to recognize a clockwise turn | ||
39 | |||
40 | b) Rising edge on channel B, channel A in high state | ||
41 | When entering this state, the encoder is put into 'armed' state, | ||
42 | meaning that it has seen half the way of a one-step transition. | ||
43 | |||
44 | c) Falling edge on channel A, channel B in high state | ||
45 | This state is used to recognize a counter-clockwise turn | ||
46 | |||
47 | d) Falling edge on channel B, channel A in low state | ||
48 | Parking position. If the encoder enters this state, a full transition | ||
49 | should have happened, unless it flipped back on half the way. The | ||
50 | 'armed' state tells us about that. | ||
51 | |||
52 | 2. Platform requirements | ||
53 | ------------------------ | ||
54 | |||
55 | As there is no hardware dependent call in this driver, the platform it is | ||
56 | used with must support gpiolib. Another requirement is that IRQs must be | ||
57 | able to fire on both edges. | ||
58 | |||
59 | |||
60 | 3. Board integration | ||
61 | -------------------- | ||
62 | |||
63 | To use this driver in your system, register a platform_device with the | ||
64 | name 'rotary-encoder' and associate the IRQs and some specific platform | ||
65 | data with it. | ||
66 | |||
67 | struct rotary_encoder_platform_data is declared in | ||
68 | include/linux/rotary_encoder.h and needs to be filled with the number of | ||
69 | steps the encoder has and can carry information about externally inverted | ||
70 | signals (because of used inverting buffer or other reasons). | ||
71 | |||
72 | Because GPIO to IRQ mapping is platform specific, this information must | ||
73 | be given separately to the driver. See the example below. | ||
74 | |||
75 | ---------<snip>--------- | ||
76 | |||
77 | /* board support file example */ | ||
78 | |||
79 | #include <linux/input.h> | ||
80 | #include <linux/rotary_encoder.h> | ||
81 | |||
82 | #define GPIO_ROTARY_A 1 | ||
83 | #define GPIO_ROTARY_B 2 | ||
84 | |||
85 | static struct rotary_encoder_platform_data my_rotary_encoder_info = { | ||
86 | .steps = 24, | ||
87 | .axis = ABS_X, | ||
88 | .gpio_a = GPIO_ROTARY_A, | ||
89 | .gpio_b = GPIO_ROTARY_B, | ||
90 | .inverted_a = 0, | ||
91 | .inverted_b = 0, | ||
92 | }; | ||
93 | |||
94 | static struct platform_device rotary_encoder_device = { | ||
95 | .name = "rotary-encoder", | ||
96 | .id = 0, | ||
97 | .dev = { | ||
98 | .platform_data = &my_rotary_encoder_info, | ||
99 | } | ||
100 | }; | ||
101 | |||
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index 51104f9194a5..d76cfd8712e1 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt | |||
@@ -40,10 +40,16 @@ This document describes the Linux kernel Makefiles. | |||
40 | --- 6.7 Custom kbuild commands | 40 | --- 6.7 Custom kbuild commands |
41 | --- 6.8 Preprocessing linker scripts | 41 | --- 6.8 Preprocessing linker scripts |
42 | 42 | ||
43 | === 7 Kbuild Variables | 43 | === 7 Kbuild syntax for exported headers |
44 | === 8 Makefile language | 44 | --- 7.1 header-y |
45 | === 9 Credits | 45 | --- 7.2 objhdr-y |
46 | === 10 TODO | 46 | --- 7.3 destination-y |
47 | --- 7.4 unifdef-y (deprecated) | ||
48 | |||
49 | === 8 Kbuild Variables | ||
50 | === 9 Makefile language | ||
51 | === 10 Credits | ||
52 | === 11 TODO | ||
47 | 53 | ||
48 | === 1 Overview | 54 | === 1 Overview |
49 | 55 | ||
@@ -310,6 +316,16 @@ more details, with real examples. | |||
310 | #arch/m68k/fpsp040/Makefile | 316 | #arch/m68k/fpsp040/Makefile |
311 | ldflags-y := -x | 317 | ldflags-y := -x |
312 | 318 | ||
319 | subdir-ccflags-y, subdir-asflags-y | ||
320 | The two flags listed above are similar to ccflags-y and asflags-y. | ||
321 | The difference is that the subdir- variants have effect for the kbuild | ||
322 | file where they are present and all subdirectories. | ||
323 | Options specified using subdir-* are added to the commandline before | ||
324 | the options specified using the non-subdir variants. | ||
325 | |||
326 | Example: | ||
327 | subdir-ccflags-y := -Werror | ||
328 | |||
313 | CFLAGS_$@, AFLAGS_$@ | 329 | CFLAGS_$@, AFLAGS_$@ |
314 | 330 | ||
315 | CFLAGS_$@ and AFLAGS_$@ only apply to commands in current | 331 | CFLAGS_$@ and AFLAGS_$@ only apply to commands in current |
@@ -1143,8 +1159,69 @@ When kbuild executes, the following steps are followed (roughly): | |||
1143 | The kbuild infrastructure for *lds file are used in several | 1159 | The kbuild infrastructure for *lds file are used in several |
1144 | architecture-specific files. | 1160 | architecture-specific files. |
1145 | 1161 | ||
1162 | === 7 Kbuild syntax for exported headers | ||
1163 | |||
1164 | The kernel includes a set of headers that is exported to userspace. | ||
1165 | Many headers can be exported as-is but other headers require a | ||
1166 | minimal pre-processing before they are ready for user-space. | ||
1167 | The pre-processing does: | ||
1168 | - drop kernel specific annotations | ||
1169 | - drop include of compiler.h | ||
1170 | - drop all sections that are kernel internal (guarded by ifdef __KERNEL__) | ||
1171 | |||
1172 | Each relevant directory contains a file named "Kbuild" which specifies the | ||
1173 | headers to be exported. | ||
1174 | See subsequent chapter for the syntax of the Kbuild file. | ||
1175 | |||
1176 | --- 7.1 header-y | ||
1177 | |||
1178 | header-y specifies header files to be exported. | ||
1179 | |||
1180 | Example: | ||
1181 | #include/linux/Kbuild | ||
1182 | header-y += usb/ | ||
1183 | header-y += aio_abi.h | ||
1184 | |||
1185 | The convention is to list one file per line and | ||
1186 | preferably in alphabetic order. | ||
1187 | |||
1188 | header-y also specifies which subdirectories to visit. | ||
1189 | A subdirectory is identified by a trailing '/' which | ||
1190 | can be seen in the example above for the usb subdirectory. | ||
1191 | |||
1192 | Subdirectories are visited before their parent directories. | ||
1193 | |||
1194 | --- 7.2 objhdr-y | ||
1195 | |||
1196 | objhdr-y specifies generated files to be exported. | ||
1197 | Generated files are special as they need to be looked | ||
1198 | up in another directory when doing 'make O=...' builds. | ||
1199 | |||
1200 | Example: | ||
1201 | #include/linux/Kbuild | ||
1202 | objhdr-y += version.h | ||
1203 | |||
1204 | --- 7.3 destination-y | ||
1205 | |||
1206 | When an architecture has a set of exported headers that needs to be | ||
1207 | exported to a different directory destination-y is used. | ||
1208 | destination-y specifies the destination directory for all exported | ||
1209 | headers in the file where it is present. | ||
1210 | |||
1211 | Example: | ||
1212 | #arch/xtensa/platforms/s6105/include/platform/Kbuild | ||
1213 | destination-y := include/linux | ||
1214 | |||
1215 | In the example above all exported headers in the Kbuild file | ||
1216 | will be located in the directory "include/linux" when exported. | ||
1217 | |||
1218 | |||
1219 | --- 7.4 unifdef-y (deprecated) | ||
1220 | |||
1221 | unifdef-y is deprecated. A direct replacement is header-y. | ||
1222 | |||
1146 | 1223 | ||
1147 | === 7 Kbuild Variables | 1224 | === 8 Kbuild Variables |
1148 | 1225 | ||
1149 | The top Makefile exports the following variables: | 1226 | The top Makefile exports the following variables: |
1150 | 1227 | ||
@@ -1206,7 +1283,7 @@ The top Makefile exports the following variables: | |||
1206 | INSTALL_MOD_STRIP will used as the option(s) to the strip command. | 1283 | INSTALL_MOD_STRIP will used as the option(s) to the strip command. |
1207 | 1284 | ||
1208 | 1285 | ||
1209 | === 8 Makefile language | 1286 | === 9 Makefile language |
1210 | 1287 | ||
1211 | The kernel Makefiles are designed to be run with GNU Make. The Makefiles | 1288 | The kernel Makefiles are designed to be run with GNU Make. The Makefiles |
1212 | use only the documented features of GNU Make, but they do use many | 1289 | use only the documented features of GNU Make, but they do use many |
@@ -1225,14 +1302,14 @@ time the left-hand side is used. | |||
1225 | There are some cases where "=" is appropriate. Usually, though, ":=" | 1302 | There are some cases where "=" is appropriate. Usually, though, ":=" |
1226 | is the right choice. | 1303 | is the right choice. |
1227 | 1304 | ||
1228 | === 9 Credits | 1305 | === 10 Credits |
1229 | 1306 | ||
1230 | Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net> | 1307 | Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net> |
1231 | Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de> | 1308 | Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de> |
1232 | Updates by Sam Ravnborg <sam@ravnborg.org> | 1309 | Updates by Sam Ravnborg <sam@ravnborg.org> |
1233 | Language QA by Jan Engelhardt <jengelh@gmx.de> | 1310 | Language QA by Jan Engelhardt <jengelh@gmx.de> |
1234 | 1311 | ||
1235 | === 10 TODO | 1312 | === 11 TODO |
1236 | 1313 | ||
1237 | - Describe how kbuild supports shipped files with _shipped. | 1314 | - Describe how kbuild supports shipped files with _shipped. |
1238 | - Generating offset header files. | 1315 | - Generating offset header files. |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9e4fe724c87c..90b3924071b6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -231,6 +231,35 @@ and is between 256 and 4096 characters. It is defined in the file | |||
231 | power state again in power transition. | 231 | power state again in power transition. |
232 | 1 : disable the power state check | 232 | 1 : disable the power state check |
233 | 233 | ||
234 | acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode | ||
235 | Format: { level | edge | high | low } | ||
236 | |||
237 | acpi_serialize [HW,ACPI] force serialization of AML methods | ||
238 | |||
239 | acpi_skip_timer_override [HW,ACPI] | ||
240 | Recognize and ignore IRQ0/pin2 Interrupt Override. | ||
241 | For broken nForce2 BIOS resulting in XT-PIC timer. | ||
242 | |||
243 | acpi_sleep= [HW,ACPI] Sleep options | ||
244 | Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, | ||
245 | old_ordering, s4_nonvs } | ||
246 | See Documentation/power/video.txt for information on | ||
247 | s3_bios and s3_mode. | ||
248 | s3_beep is for debugging; it makes the PC's speaker beep | ||
249 | as soon as the kernel's real-mode entry point is called. | ||
250 | s4_nohwsig prevents ACPI hardware signature from being | ||
251 | used during resume from hibernation. | ||
252 | old_ordering causes the ACPI 1.0 ordering of the _PTS | ||
253 | control method, with respect to putting devices into | ||
254 | low power states, to be enforced (the ACPI 2.0 ordering | ||
255 | of _PTS is used by default). | ||
256 | s4_nonvs prevents the kernel from saving/restoring the | ||
257 | ACPI NVS memory during hibernation. | ||
258 | |||
259 | acpi_use_timer_override [HW,ACPI] | ||
260 | Use timer override. For some broken Nvidia NF5 boards | ||
261 | that require a timer override, but don't have HPET | ||
262 | |||
234 | acpi_enforce_resources= [ACPI] | 263 | acpi_enforce_resources= [ACPI] |
235 | { strict | lax | no } | 264 | { strict | lax | no } |
236 | Check for resource conflicts between native drivers | 265 | Check for resource conflicts between native drivers |
@@ -250,6 +279,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
250 | ad1848= [HW,OSS] | 279 | ad1848= [HW,OSS] |
251 | Format: <io>,<irq>,<dma>,<dma2>,<type> | 280 | Format: <io>,<irq>,<dma>,<dma2>,<type> |
252 | 281 | ||
282 | add_efi_memmap [EFI; X86] Include EFI memory map in | ||
283 | kernel's map of available physical RAM. | ||
284 | |||
253 | advansys= [HW,SCSI] | 285 | advansys= [HW,SCSI] |
254 | See header of drivers/scsi/advansys.c. | 286 | See header of drivers/scsi/advansys.c. |
255 | 287 | ||
@@ -1840,6 +1872,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1840 | autoconfiguration. | 1872 | autoconfiguration. |
1841 | Ranges are in pairs (memory base and size). | 1873 | Ranges are in pairs (memory base and size). |
1842 | 1874 | ||
1875 | ports= [IP_VS_FTP] IPVS ftp helper module | ||
1876 | Default is 21. | ||
1877 | Up to 8 (IP_VS_APP_MAX_PORTS) ports | ||
1878 | may be specified. | ||
1879 | Format: <port>,<port>.... | ||
1880 | |||
1843 | print-fatal-signals= | 1881 | print-fatal-signals= |
1844 | [KNL] debug: print fatal signals | 1882 | [KNL] debug: print fatal signals |
1845 | print-fatal-signals=1: print segfault info to | 1883 | print-fatal-signals=1: print segfault info to |
diff --git a/Documentation/lguest/.gitignore b/Documentation/lguest/.gitignore new file mode 100644 index 000000000000..115587fd5f65 --- /dev/null +++ b/Documentation/lguest/.gitignore | |||
@@ -0,0 +1 @@ | |||
lguest | |||
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 29510dc51510..28c747362f95 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt | |||
@@ -3,11 +3,11 @@ | |||
3 | /, /` - or, A Young Coder's Illustrated Hypervisor | 3 | /, /` - or, A Young Coder's Illustrated Hypervisor |
4 | \\"--\\ http://lguest.ozlabs.org | 4 | \\"--\\ http://lguest.ozlabs.org |
5 | 5 | ||
6 | Lguest is designed to be a minimal hypervisor for the Linux kernel, for | 6 | Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel, |
7 | Linux developers and users to experiment with virtualization with the | 7 | for Linux developers and users to experiment with virtualization with the |
8 | minimum of complexity. Nonetheless, it should have sufficient | 8 | minimum of complexity. Nonetheless, it should have sufficient features to |
9 | features to make it useful for specific tasks, and, of course, you are | 9 | make it useful for specific tasks, and, of course, you are encouraged to fork |
10 | encouraged to fork and enhance it (see drivers/lguest/README). | 10 | and enhance it (see drivers/lguest/README). |
11 | 11 | ||
12 | Features: | 12 | Features: |
13 | 13 | ||
@@ -37,6 +37,7 @@ Running Lguest: | |||
37 | "Paravirtualized guest support" = Y | 37 | "Paravirtualized guest support" = Y |
38 | "Lguest guest support" = Y | 38 | "Lguest guest support" = Y |
39 | "High Memory Support" = off/4GB | 39 | "High Memory Support" = off/4GB |
40 | "PAE (Physical Address Extension) Support" = N | ||
40 | "Alignment value to which kernel should be aligned" = 0x100000 | 41 | "Alignment value to which kernel should be aligned" = 0x100000 |
41 | (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and | 42 | (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and |
42 | CONFIG_PHYSICAL_ALIGN=0x100000) | 43 | CONFIG_PHYSICAL_ALIGN=0x100000) |
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index 5ede7473b425..08762750f121 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt | |||
@@ -1242,7 +1242,7 @@ monitoring is enabled, and vice-versa. | |||
1242 | To add ARP targets: | 1242 | To add ARP targets: |
1243 | # echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target | 1243 | # echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target |
1244 | # echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target | 1244 | # echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target |
1245 | NOTE: up to 10 target addresses may be specified. | 1245 | NOTE: up to 16 target addresses may be specified. |
1246 | 1246 | ||
1247 | To remove an ARP target: | 1247 | To remove an ARP target: |
1248 | # echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target | 1248 | # echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target |
diff --git a/Documentation/powerpc/dts-bindings/fsl/i2c.txt b/Documentation/powerpc/dts-bindings/fsl/i2c.txt index d0ab33e21fe6..b6d2e21474f9 100644 --- a/Documentation/powerpc/dts-bindings/fsl/i2c.txt +++ b/Documentation/powerpc/dts-bindings/fsl/i2c.txt | |||
@@ -7,8 +7,10 @@ Required properties : | |||
7 | 7 | ||
8 | Recommended properties : | 8 | Recommended properties : |
9 | 9 | ||
10 | - compatible : Should be "fsl-i2c" for parts compatible with | 10 | - compatible : compatibility list with 2 entries, the first should |
11 | Freescale I2C specifications. | 11 | be "fsl,CHIP-i2c" where CHIP is the name of a compatible processor, |
12 | e.g. mpc8313, mpc8543, mpc8544, mpc5200 or mpc5200b. The second one | ||
13 | should be "fsl-i2c". | ||
12 | - interrupts : <a b> where a is the interrupt number and b is a | 14 | - interrupts : <a b> where a is the interrupt number and b is a |
13 | field that represents an encoding of the sense and level | 15 | field that represents an encoding of the sense and level |
14 | information for the interrupt. This should be encoded based on | 16 | information for the interrupt. This should be encoded based on |
@@ -16,17 +18,31 @@ Recommended properties : | |||
16 | controller you have. | 18 | controller you have. |
17 | - interrupt-parent : the phandle for the interrupt controller that | 19 | - interrupt-parent : the phandle for the interrupt controller that |
18 | services interrupts for this device. | 20 | services interrupts for this device. |
19 | - dfsrr : boolean; if defined, indicates that this I2C device has | 21 | - fsl,preserve-clocking : boolean; if defined, the clock settings |
20 | a digital filter sampling rate register | 22 | from the bootloader are preserved (not touched). |
21 | - fsl5200-clocking : boolean; if defined, indicated that this device | 23 | - clock-frequency : desired I2C bus clock frequency in Hz. |
22 | uses the FSL 5200 clocking mechanism. | 24 | |
23 | 25 | Examples : | |
24 | Example : | 26 | |
25 | i2c@3000 { | 27 | i2c@3d00 { |
26 | interrupt-parent = <40000>; | 28 | #address-cells = <1>; |
27 | interrupts = <1b 3>; | 29 | #size-cells = <0>; |
28 | reg = <3000 18>; | 30 | compatible = "fsl,mpc5200b-i2c","fsl,mpc5200-i2c","fsl-i2c"; |
29 | device_type = "i2c"; | 31 | cell-index = <0>; |
30 | compatible = "fsl-i2c"; | 32 | reg = <0x3d00 0x40>; |
31 | dfsrr; | 33 | interrupts = <2 15 0>; |
34 | interrupt-parent = <&mpc5200_pic>; | ||
35 | fsl,preserve-clocking; | ||
32 | }; | 36 | }; |
37 | |||
38 | i2c@3100 { | ||
39 | #address-cells = <1>; | ||
40 | #size-cells = <0>; | ||
41 | cell-index = <1>; | ||
42 | compatible = "fsl,mpc8544-i2c", "fsl-i2c"; | ||
43 | reg = <0x3100 0x100>; | ||
44 | interrupts = <43 2>; | ||
45 | interrupt-parent = <&mpic>; | ||
46 | clock-frequency = <400000>; | ||
47 | }; | ||
48 | |||
diff --git a/Documentation/sound/alsa/HD-Audio.txt b/Documentation/sound/alsa/HD-Audio.txt index c5948f2f9a25..88b7433d2f11 100644 --- a/Documentation/sound/alsa/HD-Audio.txt +++ b/Documentation/sound/alsa/HD-Audio.txt | |||
@@ -169,7 +169,7 @@ PCI SSID look-up. | |||
169 | What `model` option values are available depends on the codec chip. | 169 | What `model` option values are available depends on the codec chip. |
170 | Check your codec chip from the codec proc file (see "Codec Proc-File" | 170 | Check your codec chip from the codec proc file (see "Codec Proc-File" |
171 | section below). It will show the vendor/product name of your codec | 171 | section below). It will show the vendor/product name of your codec |
172 | chip. Then, see Documentation/sound/alsa/HD-Audio-Modelstxt file, | 172 | chip. Then, see Documentation/sound/alsa/HD-Audio-Models.txt file, |
173 | the section of HD-audio driver. You can find a list of codecs | 173 | the section of HD-audio driver. You can find a list of codecs |
174 | and `model` options belonging to each codec. For example, for Realtek | 174 | and `model` options belonging to each codec. For example, for Realtek |
175 | ALC262 codec chip, pass `model=ultra` for devices that are compatible | 175 | ALC262 codec chip, pass `model=ultra` for devices that are compatible |
@@ -177,7 +177,7 @@ with Samsung Q1 Ultra. | |||
177 | 177 | ||
178 | Thus, the first thing you can do for any brand-new, unsupported and | 178 | Thus, the first thing you can do for any brand-new, unsupported and |
179 | non-working HD-audio hardware is to check HD-audio codec and several | 179 | non-working HD-audio hardware is to check HD-audio codec and several |
180 | different `model` option values. If you have a luck, some of them | 180 | different `model` option values. If you have any luck, some of them |
181 | might suit with your device well. | 181 | might suit with your device well. |
182 | 182 | ||
183 | Some codecs such as ALC880 have a special model option `model=test`. | 183 | Some codecs such as ALC880 have a special model option `model=test`. |
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt index 42f43fa59f24..34c76a55bc04 100644 --- a/Documentation/sparse.txt +++ b/Documentation/sparse.txt | |||
@@ -42,6 +42,14 @@ sure that bitwise types don't get mixed up (little-endian vs big-endian | |||
42 | vs cpu-endian vs whatever), and there the constant "0" really _is_ | 42 | vs cpu-endian vs whatever), and there the constant "0" really _is_ |
43 | special. | 43 | special. |
44 | 44 | ||
45 | __bitwise__ - to be used for relatively compact stuff (gfp_t, etc.) that | ||
46 | is mostly warning-free and is supposed to stay that way. Warnings will | ||
47 | be generated without __CHECK_ENDIAN__. | ||
48 | |||
49 | __bitwise - noisy stuff; in particular, __le*/__be* are that. We really | ||
50 | don't want to drown in noise unless we'd explicitly asked for it. | ||
51 | |||
52 | |||
45 | Getting sparse | 53 | Getting sparse |
46 | ~~~~~~~~~~~~~~ | 54 | ~~~~~~~~~~~~~~ |
47 | 55 | ||
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index a34d55b65441..df38ef046f8d 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt | |||
@@ -95,7 +95,7 @@ of struct cmsghdr structures with appended data. | |||
95 | 95 | ||
96 | There is only one file in this directory. | 96 | There is only one file in this directory. |
97 | unix_dgram_qlen limits the max number of datagrams queued in Unix domain | 97 | unix_dgram_qlen limits the max number of datagrams queued in Unix domain |
98 | socket's buffer. It will not take effect unless PF_UNIX flag is spicified. | 98 | socket's buffer. It will not take effect unless PF_UNIX flag is specified. |
99 | 99 | ||
100 | 100 | ||
101 | 3. /proc/sys/net/ipv4 - IPV4 settings | 101 | 3. /proc/sys/net/ipv4 - IPV4 settings |
diff --git a/Documentation/tomoyo.txt b/Documentation/tomoyo.txt new file mode 100644 index 000000000000..b3a232cae7f8 --- /dev/null +++ b/Documentation/tomoyo.txt | |||
@@ -0,0 +1,55 @@ | |||
1 | --- What is TOMOYO? --- | ||
2 | |||
3 | TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel. | ||
4 | |||
5 | LiveCD-based tutorials are available at | ||
6 | http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/ubuntu8.04-live/ | ||
7 | http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/centos5-live/ . | ||
8 | Though these tutorials use non-LSM version of TOMOYO, they are useful for you | ||
9 | to know what TOMOYO is. | ||
10 | |||
11 | --- How to enable TOMOYO? --- | ||
12 | |||
13 | Build the kernel with CONFIG_SECURITY_TOMOYO=y and pass "security=tomoyo" on | ||
14 | kernel's command line. | ||
15 | |||
16 | Please see http://tomoyo.sourceforge.jp/en/2.2.x/ for details. | ||
17 | |||
18 | --- Where is documentation? --- | ||
19 | |||
20 | User <-> Kernel interface documentation is available at | ||
21 | http://tomoyo.sourceforge.jp/en/2.2.x/policy-reference.html . | ||
22 | |||
23 | Materials we prepared for seminars and symposiums are available at | ||
24 | http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 . | ||
25 | Below lists are chosen from three aspects. | ||
26 | |||
27 | What is TOMOYO? | ||
28 | TOMOYO Linux Overview | ||
29 | http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf | ||
30 | TOMOYO Linux: pragmatic and manageable security for Linux | ||
31 | http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf | ||
32 | TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box | ||
33 | http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf | ||
34 | |||
35 | What can TOMOYO do? | ||
36 | Deep inside TOMOYO Linux | ||
37 | http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf | ||
38 | The role of "pathname based access control" in security. | ||
39 | http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf | ||
40 | |||
41 | History of TOMOYO? | ||
42 | Realities of Mainlining | ||
43 | http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf | ||
44 | |||
45 | --- What is future plan? --- | ||
46 | |||
47 | We believe that inode based security and name based security are complementary | ||
48 | and both should be used together. But unfortunately, so far, we cannot enable | ||
49 | multiple LSM modules at the same time. We feel sorry that you have to give up | ||
50 | SELinux/SMACK/AppArmor etc. when you want to use TOMOYO. | ||
51 | |||
52 | We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM | ||
53 | version of TOMOYO, available at http://tomoyo.sourceforge.jp/en/1.6.x/ . | ||
54 | LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning | ||
55 | to port non-LSM version's functionalities to LSM versions. | ||
diff --git a/Documentation/ftrace.txt b/Documentation/trace/ftrace.txt index fd9a3e693813..fd9a3e693813 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/trace/ftrace.txt | |||
diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/trace/kmemtrace.txt index a956d9b7f943..a956d9b7f943 100644 --- a/Documentation/vm/kmemtrace.txt +++ b/Documentation/trace/kmemtrace.txt | |||
diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/trace/mmiotrace.txt index 5731c67abc55..5731c67abc55 100644 --- a/Documentation/tracers/mmiotrace.txt +++ b/Documentation/trace/mmiotrace.txt | |||
diff --git a/Documentation/tracepoints.txt b/Documentation/trace/tracepoints.txt index c0e1ceed75a4..c0e1ceed75a4 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/trace/tracepoints.txt | |||
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 2131b00b63f6..2f77ced35df7 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX | |||
@@ -1,5 +1,7 @@ | |||
1 | 00-INDEX | 1 | 00-INDEX |
2 | - this file. | 2 | - this file. |
3 | active_mm.txt | ||
4 | - An explanation from Linus about tsk->active_mm vs tsk->mm. | ||
3 | balance | 5 | balance |
4 | - various information on memory balancing. | 6 | - various information on memory balancing. |
5 | hugetlbpage.txt | 7 | hugetlbpage.txt |
diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt new file mode 100644 index 000000000000..4ee1f643d897 --- /dev/null +++ b/Documentation/vm/active_mm.txt | |||
@@ -0,0 +1,83 @@ | |||
1 | List: linux-kernel | ||
2 | Subject: Re: active_mm | ||
3 | From: Linus Torvalds <torvalds () transmeta ! com> | ||
4 | Date: 1999-07-30 21:36:24 | ||
5 | |||
6 | Cc'd to linux-kernel, because I don't write explanations all that often, | ||
7 | and when I do I feel better about more people reading them. | ||
8 | |||
9 | On Fri, 30 Jul 1999, David Mosberger wrote: | ||
10 | > | ||
11 | > Is there a brief description someplace on how "mm" vs. "active_mm" in | ||
12 | > the task_struct are supposed to be used? (My apologies if this was | ||
13 | > discussed on the mailing lists---I just returned from vacation and | ||
14 | > wasn't able to follow linux-kernel for a while). | ||
15 | |||
16 | Basically, the new setup is: | ||
17 | |||
18 | - we have "real address spaces" and "anonymous address spaces". The | ||
19 | difference is that an anonymous address space doesn't care about the | ||
20 | user-level page tables at all, so when we do a context switch into an | ||
21 | anonymous address space we just leave the previous address space | ||
22 | active. | ||
23 | |||
24 | The obvious use for a "anonymous address space" is any thread that | ||
25 | doesn't need any user mappings - all kernel threads basically fall into | ||
26 | this category, but even "real" threads can temporarily say that for | ||
27 | some amount of time they are not going to be interested in user space, | ||
28 | and that the scheduler might as well try to avoid wasting time on | ||
29 | switching the VM state around. Currently only the old-style bdflush | ||
30 | sync does that. | ||
31 | |||
32 | - "tsk->mm" points to the "real address space". For an anonymous process, | ||
33 | tsk->mm will be NULL, for the logical reason that an anonymous process | ||
34 | really doesn't _have_ a real address space at all. | ||
35 | |||
36 | - however, we obviously need to keep track of which address space we | ||
37 | "stole" for such an anonymous user. For that, we have "tsk->active_mm", | ||
38 | which shows what the currently active address space is. | ||
39 | |||
40 | The rule is that for a process with a real address space (ie tsk->mm is | ||
41 | non-NULL) the active_mm obviously always has to be the same as the real | ||
42 | one. | ||
43 | |||
44 | For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the | ||
45 | "borrowed" mm while the anonymous process is running. When the | ||
46 | anonymous process gets scheduled away, the borrowed address space is | ||
47 | returned and cleared. | ||
48 | |||
49 | To support all that, the "struct mm_struct" now has two counters: a | ||
50 | "mm_users" counter that is how many "real address space users" there are, | ||
51 | and a "mm_count" counter that is the number of "lazy" users (ie anonymous | ||
52 | users) plus one if there are any real users. | ||
53 | |||
54 | Usually there is at least one real user, but it could be that the real | ||
55 | user exited on another CPU while a lazy user was still active, so you do | ||
56 | actually get cases where you have a address space that is _only_ used by | ||
57 | lazy users. That is often a short-lived state, because once that thread | ||
58 | gets scheduled away in favour of a real thread, the "zombie" mm gets | ||
59 | released because "mm_users" becomes zero. | ||
60 | |||
61 | Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any | ||
62 | more. "init_mm" should be considered just a "lazy context when no other | ||
63 | context is available", and in fact it is mainly used just at bootup when | ||
64 | no real VM has yet been created. So code that used to check | ||
65 | |||
66 | if (current->mm == &init_mm) | ||
67 | |||
68 | should generally just do | ||
69 | |||
70 | if (!current->mm) | ||
71 | |||
72 | instead (which makes more sense anyway - the test is basically one of "do | ||
73 | we have a user context", and is generally done by the page fault handler | ||
74 | and things like that). | ||
75 | |||
76 | Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, | ||
77 | because it slightly changes the interfaces to accomodate the alpha (who | ||
78 | would have thought it, but the alpha actually ends up having one of the | ||
79 | ugliest context switch codes - unlike the other architectures where the MM | ||
80 | and register state is separate, the alpha PALcode joins the two, and you | ||
81 | need to switch both together). | ||
82 | |||
83 | (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) | ||
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 0706a7282a8c..2d70d0d95108 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt | |||
@@ -1,588 +1,691 @@ | |||
1 | 1 | ============================== | |
2 | This document describes the Linux memory management "Unevictable LRU" | 2 | UNEVICTABLE LRU INFRASTRUCTURE |
3 | infrastructure and the use of this infrastructure to manage several types | 3 | ============================== |
4 | of "unevictable" pages. The document attempts to provide the overall | 4 | |
5 | rationale behind this mechanism and the rationale for some of the design | 5 | ======== |
6 | decisions that drove the implementation. The latter design rationale is | 6 | CONTENTS |
7 | discussed in the context of an implementation description. Admittedly, one | 7 | ======== |
8 | can obtain the implementation details--the "what does it do?"--by reading the | 8 | |
9 | code. One hopes that the descriptions below add value by provide the answer | 9 | (*) The Unevictable LRU |
10 | to "why does it do that?". | 10 | |
11 | 11 | - The unevictable page list. | |
12 | Unevictable LRU Infrastructure: | 12 | - Memory control group interaction. |
13 | 13 | - Marking address spaces unevictable. | |
14 | The Unevictable LRU adds an additional LRU list to track unevictable pages | 14 | - Detecting Unevictable Pages. |
15 | and to hide these pages from vmscan. This mechanism is based on a patch by | 15 | - vmscan's handling of unevictable pages. |
16 | Larry Woodman of Red Hat to address several scalability problems with page | 16 | |
17 | (*) mlock()'d pages. | ||
18 | |||
19 | - History. | ||
20 | - Basic management. | ||
21 | - mlock()/mlockall() system call handling. | ||
22 | - Filtering special vmas. | ||
23 | - munlock()/munlockall() system call handling. | ||
24 | - Migrating mlocked pages. | ||
25 | - mmap(MAP_LOCKED) system call handling. | ||
26 | - munmap()/exit()/exec() system call handling. | ||
27 | - try_to_unmap(). | ||
28 | - try_to_munlock() reverse map scan. | ||
29 | - Page reclaim in shrink_*_list(). | ||
30 | |||
31 | |||
32 | ============ | ||
33 | INTRODUCTION | ||
34 | ============ | ||
35 | |||
36 | This document describes the Linux memory manager's "Unevictable LRU" | ||
37 | infrastructure and the use of this to manage several types of "unevictable" | ||
38 | pages. | ||
39 | |||
40 | The document attempts to provide the overall rationale behind this mechanism | ||
41 | and the rationale for some of the design decisions that drove the | ||
42 | implementation. The latter design rationale is discussed in the context of an | ||
43 | implementation description. Admittedly, one can obtain the implementation | ||
44 | details - the "what does it do?" - by reading the code. One hopes that the | ||
45 | descriptions below add value by providing the answer to "why does it do that?". | ||
46 | |||
47 | |||
48 | =================== | ||
49 | THE UNEVICTABLE LRU | ||
50 | =================== | ||
51 | |||
52 | The Unevictable LRU facility adds an additional LRU list to track unevictable | ||
53 | pages and to hide these pages from vmscan. This mechanism is based on a patch | ||
54 | by Larry Woodman of Red Hat to address several scalability problems with page | ||
17 | reclaim in Linux. The problems have been observed at customer sites on large | 55 | reclaim in Linux. The problems have been observed at customer sites on large |
18 | memory x86_64 systems. For example, a non-numal x86_64 platform with 128GB | 56 | memory x86_64 systems. |
19 | of main memory will have over 32 million 4k pages in a single zone. When a | 57 | |
20 | large fraction of these pages are not evictable for any reason [see below], | 58 | To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of |
21 | vmscan will spend a lot of time scanning the LRU lists looking for the small | 59 | main memory will have over 32 million 4k pages in a single zone. When a large |
22 | fraction of pages that are evictable. This can result in a situation where | 60 | fraction of these pages are not evictable for any reason [see below], vmscan |
23 | all cpus are spending 100% of their time in vmscan for hours or days on end, | 61 | will spend a lot of time scanning the LRU lists looking for the small fraction |
24 | with the system completely unresponsive. | 62 | of pages that are evictable. This can result in a situation where all CPUs are |
25 | 63 | spending 100% of their time in vmscan for hours or days on end, with the system | |
26 | The Unevictable LRU infrastructure addresses the following classes of | 64 | completely unresponsive. |
27 | unevictable pages: | 65 | |
28 | 66 | The unevictable list addresses the following classes of unevictable pages: | |
29 | + page owned by ramfs | 67 | |
30 | + page mapped into SHM_LOCKed shared memory regions | 68 | (*) Those owned by ramfs. |
31 | + page mapped into VM_LOCKED [mlock()ed] vmas | 69 | |
32 | 70 | (*) Those mapped into SHM_LOCK'd shared memory regions. | |
33 | The infrastructure might be able to handle other conditions that make pages | 71 | |
72 | (*) Those mapped into VM_LOCKED [mlock()ed] VMAs. | ||
73 | |||
74 | The infrastructure may also be able to handle other conditions that make pages | ||
34 | unevictable, either by definition or by circumstance, in the future. | 75 | unevictable, either by definition or by circumstance, in the future. |
35 | 76 | ||
36 | 77 | ||
37 | The Unevictable LRU List | 78 | THE UNEVICTABLE PAGE LIST |
79 | ------------------------- | ||
38 | 80 | ||
39 | The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list | 81 | The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list |
40 | called the "unevictable" list and an associated page flag, PG_unevictable, to | 82 | called the "unevictable" list and an associated page flag, PG_unevictable, to |
41 | indicate that the page is being managed on the unevictable list. The | 83 | indicate that the page is being managed on the unevictable list. |
42 | PG_unevictable flag is analogous to, and mutually exclusive with, the PG_active | 84 | |
43 | flag in that it indicates on which LRU list a page resides when PG_lru is set. | 85 | The PG_unevictable flag is analogous to, and mutually exclusive with, the |
44 | The unevictable LRU list is source configurable based on the UNEVICTABLE_LRU | 86 | PG_active flag in that it indicates on which LRU list a page resides when |
45 | Kconfig option. | 87 | PG_lru is set. The unevictable list is compile-time configurable based on the |
88 | UNEVICTABLE_LRU Kconfig option. | ||
46 | 89 | ||
47 | The Unevictable LRU infrastructure maintains unevictable pages on an additional | 90 | The Unevictable LRU infrastructure maintains unevictable pages on an additional |
48 | LRU list for a few reasons: | 91 | LRU list for a few reasons: |
49 | 92 | ||
50 | 1) We get to "treat unevictable pages just like we treat other pages in the | 93 | (1) We get to "treat unevictable pages just like we treat other pages in the |
51 | system, which means we get to use the same code to manipulate them, the | 94 | system - which means we get to use the same code to manipulate them, the |
52 | same code to isolate them (for migrate, etc.), the same code to keep track | 95 | same code to isolate them (for migrate, etc.), the same code to keep track |
53 | of the statistics, etc..." [Rik van Riel] | 96 | of the statistics, etc..." [Rik van Riel] |
97 | |||
98 | (2) We want to be able to migrate unevictable pages between nodes for memory | ||
99 | defragmentation, workload management and memory hotplug. The linux kernel | ||
100 | can only migrate pages that it can successfully isolate from the LRU | ||
101 | lists. If we were to maintain pages elsewhere than on an LRU-like list, | ||
102 | where they can be found by isolate_lru_page(), we would prevent their | ||
103 | migration, unless we reworked migration code to find the unevictable pages | ||
104 | itself. | ||
54 | 105 | ||
55 | 2) We want to be able to migrate unevictable pages between nodes--for memory | ||
56 | defragmentation, workload management and memory hotplug. The linux kernel | ||
57 | can only migrate pages that it can successfully isolate from the lru lists. | ||
58 | If we were to maintain pages elsewise than on an lru-like list, where they | ||
59 | can be found by isolate_lru_page(), we would prevent their migration, unless | ||
60 | we reworked migration code to find the unevictable pages. | ||
61 | 106 | ||
107 | The unevictable list does not differentiate between file-backed and anonymous, | ||
108 | swap-backed pages. This differentiation is only important while the pages are, | ||
109 | in fact, evictable. | ||
62 | 110 | ||
63 | The unevictable LRU list does not differentiate between file backed and swap | 111 | The unevictable list benefits from the "arrayification" of the per-zone LRU |
64 | backed [anon] pages. This differentiation is only important while the pages | 112 | lists and statistics originally proposed and posted by Christoph Lameter. |
65 | are, in fact, evictable. | ||
66 | 113 | ||
67 | The unevictable LRU list benefits from the "arrayification" of the per-zone | 114 | The unevictable list does not use the LRU pagevec mechanism. Rather, |
68 | LRU lists and statistics originally proposed and posted by Christoph Lameter. | 115 | unevictable pages are placed directly on the page's zone's unevictable list |
116 | under the zone lru_lock. This allows us to prevent the stranding of pages on | ||
117 | the unevictable list when one task has the page isolated from the LRU and other | ||
118 | tasks are changing the "evictability" state of the page. | ||
69 | 119 | ||
70 | The unevictable list does not use the lru pagevec mechanism. Rather, | ||
71 | unevictable pages are placed directly on the page's zone's unevictable | ||
72 | list under the zone lru_lock. The reason for this is to prevent stranding | ||
73 | of pages on the unevictable list when one task has the page isolated from the | ||
74 | lru and other tasks are changing the "evictability" state of the page. | ||
75 | 120 | ||
121 | MEMORY CONTROL GROUP INTERACTION | ||
122 | -------------------------------- | ||
76 | 123 | ||
77 | Unevictable LRU and Memory Controller Interaction | 124 | The unevictable LRU facility interacts with the memory control group [aka |
125 | memory controller; see Documentation/cgroups/memory.txt] by extending the | ||
126 | lru_list enum. | ||
127 | |||
128 | The memory controller data structure automatically gets a per-zone unevictable | ||
129 | list as a result of the "arrayification" of the per-zone LRU lists (one per | ||
130 | lru_list enum element). The memory controller tracks the movement of pages to | ||
131 | and from the unevictable list. | ||
78 | 132 | ||
79 | The memory controller data structure automatically gets a per zone unevictable | ||
80 | lru list as a result of the "arrayification" of the per-zone LRU lists. The | ||
81 | memory controller tracks the movement of pages to and from the unevictable list. | ||
82 | When a memory control group comes under memory pressure, the controller will | 133 | When a memory control group comes under memory pressure, the controller will |
83 | not attempt to reclaim pages on the unevictable list. This has a couple of | 134 | not attempt to reclaim pages on the unevictable list. This has a couple of |
84 | effects. Because the pages are "hidden" from reclaim on the unevictable list, | 135 | effects: |
85 | the reclaim process can be more efficient, dealing only with pages that have | 136 | |
86 | a chance of being reclaimed. On the other hand, if too many of the pages | 137 | (1) Because the pages are "hidden" from reclaim on the unevictable list, the |
87 | charged to the control group are unevictable, the evictable portion of the | 138 | reclaim process can be more efficient, dealing only with pages that have a |
88 | working set of the tasks in the control group may not fit into the available | 139 | chance of being reclaimed. |
89 | memory. This can cause the control group to thrash or to oom-kill tasks. | 140 | |
90 | 141 | (2) On the other hand, if too many of the pages charged to the control group | |
91 | 142 | are unevictable, the evictable portion of the working set of the tasks in | |
92 | Unevictable LRU: Detecting Unevictable Pages | 143 | the control group may not fit into the available memory. This can cause |
93 | 144 | the control group to thrash or to OOM-kill tasks. | |
94 | The function page_evictable(page, vma) in vmscan.c determines whether a | 145 | |
95 | page is evictable or not. For ramfs pages and pages in SHM_LOCKed regions, | 146 | |
96 | page_evictable() tests a new address space flag, AS_UNEVICTABLE, in the page's | 147 | MARKING ADDRESS SPACES UNEVICTABLE |
97 | address space using a wrapper function. Wrapper functions are used to set, | 148 | ---------------------------------- |
98 | clear and test the flag to reduce the requirement for #ifdef's throughout the | 149 | |
99 | source code. AS_UNEVICTABLE is set on ramfs inode/mapping when it is created. | 150 | For facilities such as ramfs none of the pages attached to the address space |
100 | This flag remains for the life of the inode. | 151 | may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE |
101 | 152 | address space flag is provided, and this can be manipulated by a filesystem | |
102 | For shared memory regions, AS_UNEVICTABLE is set when an application | 153 | using a number of wrapper functions: |
103 | successfully SHM_LOCKs the region and is removed when the region is | 154 | |
104 | SHM_UNLOCKed. Note that shmctl(SHM_LOCK, ...) does not populate the page | 155 | (*) void mapping_set_unevictable(struct address_space *mapping); |
105 | tables for the region as does, for example, mlock(). So, we make no special | 156 | |
106 | effort to push any pages in the SHM_LOCKed region to the unevictable list. | 157 | Mark the address space as being completely unevictable. |
107 | Vmscan will do this when/if it encounters the pages during reclaim. On | 158 | |
108 | SHM_UNLOCK, shmctl() scans the pages in the region and "rescues" them from the | 159 | (*) void mapping_clear_unevictable(struct address_space *mapping); |
109 | unevictable list if no other condition keeps them unevictable. If a SHM_LOCKed | 160 | |
110 | region is destroyed, the pages are also "rescued" from the unevictable list in | 161 | Mark the address space as being evictable. |
111 | the process of freeing them. | 162 | |
112 | 163 | (*) int mapping_unevictable(struct address_space *mapping); | |
113 | page_evictable() detects mlock()ed pages by testing an additional page flag, | 164 | |
114 | PG_mlocked via the PageMlocked() wrapper. If the page is NOT mlocked, and a | 165 | Query the address space, and return true if it is completely |
115 | non-NULL vma is supplied, page_evictable() will check whether the vma is | 166 | unevictable. |
167 | |||
168 | These are currently used in two places in the kernel: | ||
169 | |||
170 | (1) By ramfs to mark the address spaces of its inodes when they are created, | ||
171 | and this mark remains for the life of the inode. | ||
172 | |||
173 | (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called. | ||
174 | |||
175 | Note that SHM_LOCK is not required to page in the locked pages if they're | ||
176 | swapped out; the application must touch the pages manually if it wants to | ||
177 | ensure they're in memory. | ||
178 | |||
179 | |||
180 | DETECTING UNEVICTABLE PAGES | ||
181 | --------------------------- | ||
182 | |||
183 | The function page_evictable() in vmscan.c determines whether a page is | ||
184 | evictable or not using the query function outlined above [see section "Marking | ||
185 | address spaces unevictable"] to check the AS_UNEVICTABLE flag. | ||
186 | |||
187 | For address spaces that are so marked after being populated (as SHM regions | ||
188 | might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate | ||
189 | the page tables for the region as does, for example, mlock(), nor need it make | ||
190 | any special effort to push any pages in the SHM_LOCK'd area to the unevictable | ||
191 | list. Instead, vmscan will do this if and when it encounters the pages during | ||
192 | a reclamation scan. | ||
193 | |||
194 | On an unlock action (such as SHM_UNLOCK), the unlocker (eg: shmctl()) must scan | ||
195 | the pages in the region and "rescue" them from the unevictable list if no other | ||
196 | condition is keeping them unevictable. If an unevictable region is destroyed, | ||
197 | the pages are also "rescued" from the unevictable list in the process of | ||
198 | freeing them. | ||
199 | |||
200 | page_evictable() also checks for mlocked pages by testing an additional page | ||
201 | flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked, | ||
202 | and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is | ||
116 | VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and | 203 | VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and |
117 | update the appropriate statistics if the vma is VM_LOCKED. This method allows | 204 | update the appropriate statistics if the vma is VM_LOCKED. This method allows |
118 | efficient "culling" of pages in the fault path that are being faulted in to | 205 | efficient "culling" of pages in the fault path that are being faulted in to |
119 | VM_LOCKED vmas. | 206 | VM_LOCKED VMAs. |
120 | 207 | ||
121 | 208 | ||
122 | Unevictable Pages and Vmscan [shrink_*_list()] | 209 | VMSCAN'S HANDLING OF UNEVICTABLE PAGES |
210 | -------------------------------------- | ||
123 | 211 | ||
124 | If unevictable pages are culled in the fault path, or moved to the unevictable | 212 | If unevictable pages are culled in the fault path, or moved to the unevictable |
125 | list at mlock() or mmap() time, vmscan will never encounter the pages until | 213 | list at mlock() or mmap() time, vmscan will not encounter the pages until they |
126 | they have become evictable again, for example, via munlock() and have been | 214 | have become evictable again (via munlock() for example) and have been "rescued" |
127 | "rescued" from the unevictable list. However, there may be situations where we | 215 | from the unevictable list. However, there may be situations where we decide, |
128 | decide, for the sake of expediency, to leave a unevictable page on one of the | 216 | for the sake of expediency, to leave an unevictable page on one of the regular |
129 | regular active/inactive LRU lists for vmscan to deal with. Vmscan checks for | 217 | active/inactive LRU lists for vmscan to deal with. vmscan checks for such |
130 | such pages in all of the shrink_{active|inactive|page}_list() functions and | 218 | pages in all of the shrink_{active|inactive|page}_list() functions and will |
131 | will "cull" such pages that it encounters--that is, it diverts those pages to | 219 | "cull" such pages that it encounters: that is, it diverts those pages to the |
132 | the unevictable list for the zone being scanned. | 220 | unevictable list for the zone being scanned. |
133 | 221 | ||
134 | There may be situations where a page is mapped into a VM_LOCKED vma, but the | 222 | There may be situations where a page is mapped into a VM_LOCKED VMA, but the |
135 | page is not marked as PageMlocked. Such pages will make it all the way to | 223 | page is not marked as PG_mlocked. Such pages will make it all the way to |
136 | shrink_page_list() where they will be detected when vmscan walks the reverse | 224 | shrink_page_list() where they will be detected when vmscan walks the reverse |
137 | map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, shrink_page_list() | 225 | map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, |
138 | will cull the page at that point. | 226 | shrink_page_list() will cull the page at that point. |
139 | 227 | ||
140 | To "cull" an unevictable page, vmscan simply puts the page back on the lru | 228 | To "cull" an unevictable page, vmscan simply puts the page back on the LRU list |
141 | list using putback_lru_page()--the inverse operation to isolate_lru_page()-- | 229 | using putback_lru_page() - the inverse operation to isolate_lru_page() - after |
142 | after dropping the page lock. Because the condition which makes the page | 230 | dropping the page lock. Because the condition which makes the page unevictable |
143 | unevictable may change once the page is unlocked, putback_lru_page() will | 231 | may change once the page is unlocked, putback_lru_page() will recheck the |
144 | recheck the unevictable state of a page that it places on the unevictable lru | 232 | unevictable state of a page that it places on the unevictable list. If the |
145 | list. If the page has become unevictable, putback_lru_page() removes it from | 233 | page has become evictable, putback_lru_page() removes it from the list and |
146 | the list and retries, including the page_unevictable() test. Because such a | 234 | retries, including the page_unevictable() test. Because such a race is a rare |
147 | race is a rare event and movement of pages onto the unevictable list should be | 235 | event and movement of pages onto the unevictable list should be rare, these |
148 | rare, these extra evictabilty checks should not occur in the majority of calls | 236 | extra evictability checks should not occur in the majority of calls to |
149 | to putback_lru_page(). | 237 | putback_lru_page(). |
150 | 238 | ||
151 | 239 | ||
152 | Mlocked Page: Prior Work | 240 | ============= |
241 | MLOCKED PAGES | ||
242 | ============= | ||
153 | 243 | ||
154 | The "Unevictable Mlocked Pages" infrastructure is based on work originally | 244 | The unevictable page list is also useful for mlock(), in addition to ramfs and |
245 | SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in | ||
246 | NOMMU situations, all mappings are effectively mlocked. | ||
247 | |||
248 | |||
249 | HISTORY | ||
250 | ------- | ||
251 | |||
252 | The "Unevictable mlocked Pages" infrastructure is based on work originally | ||
155 | posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". | 253 | posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". |
156 | Nick posted his patch as an alternative to a patch posted by Christoph | 254 | Nick posted his patch as an alternative to a patch posted by Christoph Lameter |
157 | Lameter to achieve the same objective--hiding mlocked pages from vmscan. | 255 | to achieve the same objective: hiding mlocked pages from vmscan. |
158 | In Nick's patch, he used one of the struct page lru list link fields as a count | 256 | |
159 | of VM_LOCKED vmas that map the page. This use of the link field for a count | 257 | In Nick's patch, he used one of the struct page LRU list link fields as a count |
160 | prevented the management of the pages on an LRU list. Thus, mlocked pages were | 258 | of VM_LOCKED VMAs that map the page. This use of the link field for a count |
161 | not migratable as isolate_lru_page() could not find them and the lru list link | 259 | prevented the management of the pages on an LRU list, and thus mlocked pages |
162 | field was not available to the migration subsystem. Nick resolved this by | 260 | were not migratable as isolate_lru_page() could not find them, and the LRU list |
163 | putting mlocked pages back on the lru list before attempting to isolate them, | 261 | link field was not available to the migration subsystem. |
164 | thus abandoning the count of VM_LOCKED vmas. When Nick's patch was integrated | 262 | |
165 | with the Unevictable LRU work, the count was replaced by walking the reverse | 263 | Nick resolved this by putting mlocked pages back on the LRU list before |
166 | map to determine whether any VM_LOCKED vmas mapped the page. More on this | 264 | attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When |
167 | below. | 265 | Nick's patch was integrated with the Unevictable LRU work, the count was |
168 | 266 | replaced by walking the reverse map to determine whether any VM_LOCKED VMAs | |
169 | 267 | mapped the page. More on this below. | |
170 | Mlocked Pages: Basic Management | 268 | |
171 | 269 | ||
172 | Mlocked pages--pages mapped into a VM_LOCKED vma--represent one class of | 270 | BASIC MANAGEMENT |
173 | unevictable pages. When such a page has been "noticed" by the memory | 271 | ---------------- |
174 | management subsystem, the page is marked with the PG_mlocked [PageMlocked()] | 272 | |
175 | flag. A PageMlocked() page will be placed on the unevictable LRU list when | 273 | mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable |
176 | it is added to the LRU. Pages can be "noticed" by memory management in | 274 | pages. When such a page has been "noticed" by the memory management subsystem, |
177 | several places: | 275 | the page is marked with the PG_mlocked flag. This can be manipulated using the |
178 | 276 | PageMlocked() functions. | |
179 | 1) in the mlock()/mlockall() system call handlers. | 277 | |
180 | 2) in the mmap() system call handler when mmap()ing a region with the | 278 | A PG_mlocked page will be placed on the unevictable list when it is added to |
181 | MAP_LOCKED flag, or mmap()ing a region in a task that has called | 279 | the LRU. Such pages can be "noticed" by memory management in several places: |
182 | mlockall() with the MCL_FUTURE flag. Both of these conditions result | 280 | |
183 | in the VM_LOCKED flag being set for the vma. | 281 | (1) in the mlock()/mlockall() system call handlers; |
184 | 3) in the fault path, if mlocked pages are "culled" in the fault path, | 282 | |
185 | and when a VM_LOCKED stack segment is expanded. | 283 | (2) in the mmap() system call handler when mmapping a region with the |
186 | 4) as mentioned above, in vmscan:shrink_page_list() when attempting to | 284 | MAP_LOCKED flag; |
187 | reclaim a page in a VM_LOCKED vma via try_to_unmap(). | 285 | |
188 | 286 | (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE | |
189 | Mlocked pages become unlocked and rescued from the unevictable list when: | 287 | flag; |
190 | 288 | ||
191 | 1) mapped in a range unlocked via the munlock()/munlockall() system calls. | 289 | (4) in the fault path, if mlocked pages are "culled" in the fault path, |
192 | 2) munmapped() out of the last VM_LOCKED vma that maps the page, including | 290 | and when a VM_LOCKED stack segment is expanded; or |
193 | unmapping at task exit. | 291 | |
194 | 3) when the page is truncated from the last VM_LOCKED vma of an mmap()ed file. | 292 | (5) as mentioned above, in vmscan:shrink_page_list() when attempting to |
195 | 4) before a page is COWed in a VM_LOCKED vma. | 293 | reclaim a page in a VM_LOCKED VMA via try_to_unmap() |
196 | 294 | ||
197 | 295 | all of which result in the VM_LOCKED flag being set for the VMA if it doesn't | |
198 | Mlocked Pages: mlock()/mlockall() System Call Handling | 296 | already have it set. |
297 | |||
298 | mlocked pages become unlocked and rescued from the unevictable list when: | ||
299 | |||
300 | (1) mapped in a range unlocked via the munlock()/munlockall() system calls; | ||
301 | |||
302 | (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including | ||
303 | unmapping at task exit; | ||
304 | |||
305 | (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; | ||
306 | or | ||
307 | |||
308 | (4) before a page is COW'd in a VM_LOCKED VMA. | ||
309 | |||
310 | |||
311 | mlock()/mlockall() SYSTEM CALL HANDLING | ||
312 | --------------------------------------- | ||
199 | 313 | ||
200 | Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup() | 314 | Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup() |
201 | for each vma in the range specified by the call. In the case of mlockall(), | 315 | for each VMA in the range specified by the call. In the case of mlockall(), |
202 | this is the entire active address space of the task. Note that mlock_fixup() | 316 | this is the entire active address space of the task. Note that mlock_fixup() |
203 | is used for both mlock()ing and munlock()ing a range of memory. A call to | 317 | is used for both mlocking and munlocking a range of memory. A call to mlock() |
204 | mlock() an already VM_LOCKED vma, or to munlock() a vma that is not VM_LOCKED | 318 | an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is |
205 | is treated as a no-op--mlock_fixup() simply returns. | 319 | treated as a no-op, and mlock_fixup() simply returns. |
206 | 320 | ||
207 | If the vma passes some filtering described in "Mlocked Pages: Filtering Vmas" | 321 | If the VMA passes some filtering as described in "Filtering Special Vmas" |
208 | below, mlock_fixup() will attempt to merge the vma with its neighbors or split | 322 | below, mlock_fixup() will attempt to merge the VMA with its neighbors or split |
209 | off a subset of the vma if the range does not cover the entire vma. Once the | 323 | off a subset of the VMA if the range does not cover the entire VMA. Once the |
210 | vma has been merged or split or neither, mlock_fixup() will call | 324 | VMA has been merged or split or neither, mlock_fixup() will call |
211 | __mlock_vma_pages_range() to fault in the pages via get_user_pages() and | 325 | __mlock_vma_pages_range() to fault in the pages via get_user_pages() and to |
212 | to mark the pages as mlocked via mlock_vma_page(). | 326 | mark the pages as mlocked via mlock_vma_page(). |
213 | 327 | ||
214 | Note that the vma being mlocked might be mapped with PROT_NONE. In this case, | 328 | Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, |
215 | get_user_pages() will be unable to fault in the pages. That's OK. If pages | 329 | get_user_pages() will be unable to fault in the pages. That's okay. If pages |
216 | do end up getting faulted into this VM_LOCKED vma, we'll handle them in the | 330 | do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the |
217 | fault path or in vmscan. | 331 | fault path or in vmscan. |
218 | 332 | ||
219 | Also note that a page returned by get_user_pages() could be truncated or | 333 | Also note that a page returned by get_user_pages() could be truncated or |
220 | migrated out from under us, while we're trying to mlock it. To detect | 334 | migrated out from under us, while we're trying to mlock it. To detect this, |
221 | this, __mlock_vma_pages_range() tests the page_mapping after acquiring | 335 | __mlock_vma_pages_range() checks page_mapping() after acquiring the page lock. |
222 | the page lock. If the page is still associated with its mapping, we'll | 336 | If the page is still associated with its mapping, we'll go ahead and call |
223 | go ahead and call mlock_vma_page(). If the mapping is gone, we just | 337 | mlock_vma_page(). If the mapping is gone, we just unlock the page and move on. |
224 | unlock the page and move on. Worse case, this results in page mapped | 338 | In the worst case, this will result in a page mapped in a VM_LOCKED VMA |
225 | in a VM_LOCKED vma remaining on a normal LRU list without being | 339 | remaining on a normal LRU list without being PageMlocked(). Again, vmscan will |
226 | PageMlocked(). Again, vmscan will detect and cull such pages. | 340 | detect and cull such pages. |
227 | 341 | ||
228 | mlock_vma_page(), called with the page locked [N.B., not "mlocked"], will | 342 | mlock_vma_page() will call TestSetPageMlocked() for each page returned by |
229 | TestSetPageMlocked() for each page returned by get_user_pages(). We use | 343 | get_user_pages(). We use TestSetPageMlocked() because the page might already |
230 | TestSetPageMlocked() because the page might already be mlocked by another | 344 | be mlocked by another task/VMA and we don't want to do extra work. We |
231 | task/vma and we don't want to do extra work. We especially do not want to | 345 | especially do not want to count an mlocked page more than once in the |
232 | count an mlocked page more than once in the statistics. If the page was | 346 | statistics. If the page was already mlocked, mlock_vma_page() need do nothing |
233 | already mlocked, mlock_vma_page() is done. | 347 | more. |
234 | 348 | ||
235 | If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the | 349 | If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the |
236 | page from the LRU, as it is likely on the appropriate active or inactive list | 350 | page from the LRU, as it is likely on the appropriate active or inactive list |
237 | at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will | 351 | at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put |
238 | putback the page--putback_lru_page()--which will notice that the page is now | 352 | back the page - by calling putback_lru_page() - which will notice that the page |
239 | mlocked and divert the page to the zone's unevictable LRU list. If | 353 | is now mlocked and divert the page to the zone's unevictable list. If |
240 | mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle | 354 | mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle |
241 | it later if/when it attempts to reclaim the page. | 355 | it later if and when it attempts to reclaim the page. |
242 | 356 | ||
243 | 357 | ||
244 | Mlocked Pages: Filtering Special Vmas | 358 | FILTERING SPECIAL VMAS |
359 | ---------------------- | ||
245 | 360 | ||
246 | mlock_fixup() filters several classes of "special" vmas: | 361 | mlock_fixup() filters several classes of "special" VMAs: |
247 | 362 | ||
248 | 1) vmas with VM_IO|VM_PFNMAP set are skipped entirely. The pages behind | 363 | 1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind |
249 | these mappings are inherently pinned, so we don't need to mark them as | 364 | these mappings are inherently pinned, so we don't need to mark them as |
250 | mlocked. In any case, most of the pages have no struct page in which to | 365 | mlocked. In any case, most of the pages have no struct page in which to so |
251 | so mark the page. Because of this, get_user_pages() will fail for these | 366 | mark the page. Because of this, get_user_pages() will fail for these VMAs, |
252 | vmas, so there is no sense in attempting to visit them. | 367 | so there is no sense in attempting to visit them. |
253 | 368 | ||
254 | 2) vmas mapping hugetlbfs page are already effectively pinned into memory. | 369 | 2) VMAs mapping hugetlbfs pages are already effectively pinned into memory. We |
255 | We don't need nor want to mlock() these pages. However, to preserve the | 370 | neither need nor want to mlock() these pages. However, to preserve the |
256 | prior behavior of mlock()--before the unevictable/mlock changes-- | 371 | prior behavior of mlock() - before the unevictable/mlock changes - |
257 | mlock_fixup() will call make_pages_present() in the hugetlbfs vma range | 372 | mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to |
258 | to allocate the huge pages and populate the ptes. | 373 | allocate the huge pages and populate the ptes. |
259 | 374 | ||
260 | 3) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of | 375 | 3) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of |
261 | kernel pages, such as the vdso page, relay channel pages, etc. These pages | 376 | kernel pages, such as the VDSO page, relay channel pages, etc. These pages |
262 | are inherently unevictable and are not managed on the LRU lists. | 377 | are inherently unevictable and are not managed on the LRU lists. |
263 | mlock_fixup() treats these vmas the same as hugetlbfs vmas. It calls | 378 | mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls |
264 | make_pages_present() to populate the ptes. | 379 | make_pages_present() to populate the ptes. |
265 | 380 | ||
266 | Note that for all of these special vmas, mlock_fixup() does not set the | 381 | Note that for all of these special VMAs, mlock_fixup() does not set the |
267 | VM_LOCKED flag. Therefore, we won't have to deal with them later during | 382 | VM_LOCKED flag. Therefore, we won't have to deal with them later during |
268 | munlock() or munmap()--for example, at task exit. Neither does mlock_fixup() | 383 | munlock(), munmap() or task exit. Neither does mlock_fixup() account these |
269 | account these vmas against the task's "locked_vm". | 384 | VMAs against the task's "locked_vm". |
270 | 385 | ||
271 | Mlocked Pages: Downgrading the Mmap Semaphore. | 386 | |
272 | 387 | munlock()/munlockall() SYSTEM CALL HANDLING | |
273 | mlock_fixup() must be called with the mmap semaphore held for write, because | 388 | ------------------------------------------- |
274 | it may have to merge or split vmas. However, mlocking a large region of | 389 | |
275 | memory can take a long time--especially if vmscan must reclaim pages to | 390 | The munlock() and munlockall() system calls are handled by the same functions - |
276 | satisfy the regions requirements. Faulting in a large region with the mmap | 391 | do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs |
277 | semaphore held for write can hold off other faults on the address space, in | 392 | lock operation indicated by an argument. So, these system calls are also |
278 | the case of a multi-threaded task. It can also hold off scans of the task's | 393 | handled by mlock_fixup(). Again, if called for an already munlocked VMA, |
279 | address space via /proc. While testing under heavy load, it was observed that | 394 | mlock_fixup() simply returns. Because of the VMA filtering discussed above, |
280 | the ps(1) command could be held off for many minutes while a large segment was | 395 | VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be |
281 | mlock()ed down. | ||
282 | |||
283 | To address this issue, and to make the system more responsive during mlock()ing | ||
284 | of large segments, mlock_fixup() downgrades the mmap semaphore to read mode | ||
285 | during the call to __mlock_vma_pages_range(). This works fine. However, the | ||
286 | callers of mlock_fixup() expect the semaphore to be returned in write mode. | ||
287 | So, mlock_fixup() "upgrades" the semphore to write mode. Linux does not | ||
288 | support an atomic upgrade_sem() call, so mlock_fixup() must drop the semaphore | ||
289 | and reacquire it in write mode. In a multi-threaded task, it is possible for | ||
290 | the task memory map to change while the semaphore is dropped. Therefore, | ||
291 | mlock_fixup() looks up the vma at the range start address after reacquiring | ||
292 | the semaphore in write mode and verifies that it still covers the original | ||
293 | range. If not, mlock_fixup() returns an error [-EAGAIN]. All callers of | ||
294 | mlock_fixup() have been changed to deal with this new error condition. | ||
295 | |||
296 | Note: when munlocking a region, all of the pages should already be resident-- | ||
297 | unless we have racing threads mlocking() and munlocking() regions. So, | ||
298 | unlocking should not have to wait for page allocations nor faults of any kind. | ||
299 | Therefore mlock_fixup() does not downgrade the semaphore for munlock(). | ||
300 | |||
301 | |||
302 | Mlocked Pages: munlock()/munlockall() System Call Handling | ||
303 | |||
304 | The munlock() and munlockall() system calls are handled by the same functions-- | ||
305 | do_mlock[all]()--as the mlock() and mlockall() system calls with the unlock | ||
306 | vs lock operation indicated by an argument. So, these system calls are also | ||
307 | handled by mlock_fixup(). Again, if called for an already munlock()ed vma, | ||
308 | mlock_fixup() simply returns. Because of the vma filtering discussed above, | ||
309 | VM_LOCKED will not be set in any "special" vmas. So, these vmas will be | ||
310 | ignored for munlock. | 396 | ignored for munlock. |
311 | 397 | ||
312 | If the vma is VM_LOCKED, mlock_fixup() again attempts to merge or split off | 398 | If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the |
313 | the specified range. The range is then munlocked via the function | 399 | specified range. The range is then munlocked via the function |
314 | __mlock_vma_pages_range()--the same function used to mlock a vma range-- | 400 | __mlock_vma_pages_range() - the same function used to mlock a VMA range - |
315 | passing a flag to indicate that munlock() is being performed. | 401 | passing a flag to indicate that munlock() is being performed. |
316 | 402 | ||
317 | Because the vma access protections could have been changed to PROT_NONE after | 403 | Because the VMA access protections could have been changed to PROT_NONE after |
318 | faulting in and mlocking pages, get_user_pages() was unreliable for visiting | 404 | faulting in and mlocking pages, get_user_pages() was unreliable for visiting |
319 | these pages for munlocking. Because we don't want to leave pages mlocked(), | 405 | these pages for munlocking. Because we don't want to leave pages mlocked, |
320 | get_user_pages() was enhanced to accept a flag to ignore the permissions when | 406 | get_user_pages() was enhanced to accept a flag to ignore the permissions when |
321 | fetching the pages--all of which should be resident as a result of previous | 407 | fetching the pages - all of which should be resident as a result of previous |
322 | mlock()ing. | 408 | mlocking. |
323 | 409 | ||
324 | For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling | 410 | For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling |
325 | munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked | 411 | munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked |
326 | flag using TestClearPageMlocked(). As with mlock_vma_page(), munlock_vma_page() | 412 | flag using TestClearPageMlocked(). As with mlock_vma_page(), |
327 | use the Test*PageMlocked() function to handle the case where the page might | 413 | munlock_vma_page() uses the Test*PageMlocked() function to handle the case where |
328 | have already been unlocked by another task. If the page was mlocked, | 414 | the page might have already been unlocked by another task. If the page was |
329 | munlock_vma_page() updates that zone statistics for the number of mlocked | 415 | mlocked, munlock_vma_page() updates the zone statistics for the number of |
330 | pages. Note, however, that at this point we haven't checked whether the page | 416 | mlocked pages. Note, however, that at this point we haven't checked whether |
331 | is mapped by other VM_LOCKED vmas. | 417 | the page is mapped by other VM_LOCKED VMAs. |
332 | 418 | ||
333 | We can't call try_to_munlock(), the function that walks the reverse map to check | 419 | We can't call try_to_munlock(), the function that walks the reverse map to |
334 | for other VM_LOCKED vmas, without first isolating the page from the LRU. | 420 | check for other VM_LOCKED VMAs, without first isolating the page from the LRU. |
335 | try_to_munlock() is a variant of try_to_unmap() and thus requires that the page | 421 | try_to_munlock() is a variant of try_to_unmap() and thus requires that the page |
336 | not be on an lru list. [More on these below.] However, the call to | 422 | not be on an LRU list [more on these below]. However, the call to |
337 | isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). | 423 | isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). So, |
338 | So, we go ahead and clear PG_mlocked up front, as this might be the only chance | 424 | we go ahead and clear PG_mlocked up front, as this might be the only chance we |
339 | we have. If we can successfully isolate the page, we go ahead and | 425 | have. If we can successfully isolate the page, we go ahead and |
340 | try_to_munlock(), which will restore the PG_mlocked flag and update the zone | 426 | try_to_munlock(), which will restore the PG_mlocked flag and update the zone |
341 | page statistics if it finds another vma holding the page mlocked. If we fail | 427 | page statistics if it finds another VMA holding the page mlocked. If we fail |
342 | to isolate the page, we'll have left a potentially mlocked page on the LRU. | 428 | to isolate the page, we'll have left a potentially mlocked page on the LRU. |
343 | This is fine, because we'll catch it later when/if vmscan tries to reclaim the | 429 | This is fine, because we'll catch it later if and when vmscan tries to reclaim |
344 | page. This should be relatively rare. | 430 | the page. This should be relatively rare. |
345 | 431 | ||
346 | Mlocked Pages: Migrating Them... | 432 | |
347 | 433 | MIGRATING MLOCKED PAGES | |
348 | A page that is being migrated has been isolated from the lru lists and is | 434 | ----------------------- |
349 | held locked across unmapping of the page, updating the page's mapping | 435 | |
350 | [address_space] entry and copying the contents and state, until the | 436 | A page that is being migrated has been isolated from the LRU lists and is held |
351 | page table entry has been replaced with an entry that refers to the new | 437 | locked across unmapping of the page, updating the page's address space entry |
352 | page. Linux supports migration of mlocked pages and other unevictable | 438 | and copying the contents and state, until the page table entry has been |
353 | pages. This involves simply moving the PageMlocked and PageUnevictable states | 439 | replaced with an entry that refers to the new page. Linux supports migration |
354 | from the old page to the new page. | 440 | of mlocked pages and other unevictable pages. This involves simply moving the |
355 | 441 | PG_mlocked and PG_unevictable states from the old page to the new page. | |
356 | Note that page migration can race with mlocking or munlocking of the same | 442 | |
357 | page. This has been discussed from the mlock/munlock perspective in the | 443 | Note that page migration can race with mlocking or munlocking of the same page. |
358 | respective sections above. Both processes [migration, m[un]locking], hold | 444 | This has been discussed from the mlock/munlock perspective in the respective |
359 | the page locked. This provides the first level of synchronization. Page | 445 | sections above. Both processes (migration and m[un]locking) hold the page |
360 | migration zeros out the page_mapping of the old page before unlocking it, | 446 | locked. This provides the first level of synchronization. Page migration |
361 | so m[un]lock can skip these pages by testing the page mapping under page | 447 | zeros out the page_mapping of the old page before unlocking it, so m[un]lock |
362 | lock. | 448 | can skip these pages by testing the page mapping under page lock. |
363 | 449 | ||
364 | When completing page migration, we place the new and old pages back onto the | 450 | To complete page migration, we place the new and old pages back onto the LRU |
365 | lru after dropping the page lock. The "unneeded" page--old page on success, | 451 | after dropping the page lock. The "unneeded" page - old page on success, new |
366 | new page on failure--will be freed when the reference count held by the | 452 | page on failure - will be freed when the reference count held by the migration |
367 | migration process is released. To ensure that we don't strand pages on the | 453 | process is released. To ensure that we don't strand pages on the unevictable |
368 | unevictable list because of a race between munlock and migration, page | 454 | list because of a race between munlock and migration, page migration uses the |
369 | migration uses the putback_lru_page() function to add migrated pages back to | 455 | putback_lru_page() function to add migrated pages back to the LRU. |
370 | the lru. | 456 | |
371 | 457 | ||
372 | 458 | mmap(MAP_LOCKED) SYSTEM CALL HANDLING | |
373 | Mlocked Pages: mmap(MAP_LOCKED) System Call Handling | 459 | ------------------------------------- |
374 | 460 | ||
375 | In addition to the mlock()/mlockall() system calls, an application can request | 461 | In addition to the mlock()/mlockall() system calls, an application can request |
376 | that a region of memory be mlocked using the MAP_LOCKED flag with the mmap() | 462 | that a region of memory be mlocked by supplying the MAP_LOCKED flag to the mmap() |
377 | call. Furthermore, any mmap() call or brk() call that expands the heap by a | 463 | call. Furthermore, any mmap() call or brk() call that expands the heap by a |
378 | task that has previously called mlockall() with the MCL_FUTURE flag will result | 464 | task that has previously called mlockall() with the MCL_FUTURE flag will result |
379 | in the newly mapped memory being mlocked. Before the unevictable/mlock changes, | 465 | in the newly mapped memory being mlocked. Before the unevictable/mlock |
380 | the kernel simply called make_pages_present() to allocate pages and populate | 466 | changes, the kernel simply called make_pages_present() to allocate pages and |
381 | the page table. | 467 | populate the page table. |
382 | 468 | ||
383 | To mlock a range of memory under the unevictable/mlock infrastructure, the | 469 | To mlock a range of memory under the unevictable/mlock infrastructure, the |
384 | mmap() handler and task address space expansion functions call | 470 | mmap() handler and task address space expansion functions call |
385 | mlock_vma_pages_range() specifying the vma and the address range to mlock. | 471 | mlock_vma_pages_range() specifying the vma and the address range to mlock. |
386 | mlock_vma_pages_range() filters vmas like mlock_fixup(), as described above in | 472 | mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in |
387 | "Mlocked Pages: Filtering Vmas". It will clear the VM_LOCKED flag, which will | 473 | "Filtering Special VMAs". It will clear the VM_LOCKED flag, which will have |
388 | have already been set by the caller, in filtered vmas. Thus these vma's need | 474 | already been set by the caller, in filtered VMAs. Thus these VMAs need not be |
389 | not be visited for munlock when the region is unmapped. | 475 | visited for munlock when the region is unmapped. |
390 | 476 | ||
391 | For "normal" vmas, mlock_vma_pages_range() calls __mlock_vma_pages_range() to | 477 | For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to |
392 | fault/allocate the pages and mlock them. Again, like mlock_fixup(), | 478 | fault/allocate the pages and mlock them. Again, like mlock_fixup(), |
393 | mlock_vma_pages_range() downgrades the mmap semaphore to read mode before | 479 | mlock_vma_pages_range() downgrades the mmap semaphore to read mode before |
394 | attempting to fault/allocate and mlock the pages; and "upgrades" the semaphore | 480 | attempting to fault/allocate and mlock the pages and "upgrades" the semaphore |
395 | back to write mode before returning. | 481 | back to write mode before returning. |
396 | 482 | ||
397 | The callers of mlock_vma_pages_range() will have already added the memory | 483 | The callers of mlock_vma_pages_range() will have already added the memory range |
398 | range to be mlocked to the task's "locked_vm". To account for filtered vmas, | 484 | to be mlocked to the task's "locked_vm". To account for filtered VMAs, |
399 | mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the | 485 | mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the |
400 | callers then subtract a non-negative return value from the task's locked_vm. | 486 | callers then subtract a non-negative return value from the task's locked_vm. A |
401 | A negative return value represent an error--for example, from get_user_pages() | 487 | negative return value represents an error - for example, from get_user_pages() |
402 | attempting to fault in a vma with PROT_NONE access. In this case, we leave | 488 | attempting to fault in a VMA with PROT_NONE access. In this case, we leave the |
403 | the memory range accounted as locked_vm, as the protections could be changed | 489 | memory range accounted as locked_vm, as the protections could be changed later |
404 | later and pages allocated into that region. | 490 | and pages allocated into that region. |
405 | 491 | ||
406 | 492 | ||
407 | Mlocked Pages: munmap()/exit()/exec() System Call Handling | 493 | munmap()/exit()/exec() SYSTEM CALL HANDLING |
494 | ------------------------------------------- | ||
408 | 495 | ||
409 | When unmapping an mlocked region of memory, whether by an explicit call to | 496 | When unmapping an mlocked region of memory, whether by an explicit call to |
410 | munmap() or via an internal unmap from exit() or exec() processing, we must | 497 | munmap() or via an internal unmap from exit() or exec() processing, we must |
411 | munlock the pages if we're removing the last VM_LOCKED vma that maps the pages. | 498 | munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages. |
412 | Before the unevictable/mlock changes, mlocking did not mark the pages in any | 499 | Before the unevictable/mlock changes, mlocking did not mark the pages in any |
413 | way, so unmapping them required no processing. | 500 | way, so unmapping them required no processing. |
414 | 501 | ||
415 | To munlock a range of memory under the unevictable/mlock infrastructure, the | 502 | To munlock a range of memory under the unevictable/mlock infrastructure, the |
416 | munmap() hander and task address space tear down function call | 503 | munmap() handler and task address space tear down function call |
417 | munlock_vma_pages_all(). The name reflects the observation that one always | 504 | munlock_vma_pages_all(). The name reflects the observation that one always |
418 | specifies the entire vma range when munlock()ing during unmap of a region. | 505 | specifies the entire VMA range when munlock()ing during unmap of a region. |
419 | Because of the vma filtering when mlocking() regions, only "normal" vmas that | 506 | Because of the VMA filtering when mlocking() regions, only "normal" VMAs that |
420 | actually contain mlocked pages will be passed to munlock_vma_pages_all(). | 507 | actually contain mlocked pages will be passed to munlock_vma_pages_all(). |
421 | 508 | ||
422 | munlock_vma_pages_all() clears the VM_LOCKED vma flag and, like mlock_fixup() | 509 | munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup() |
423 | for the munlock case, calls __munlock_vma_pages_range() to walk the page table | 510 | for the munlock case, calls __munlock_vma_pages_range() to walk the page table |
424 | for the vma's memory range and munlock_vma_page() each resident page mapped by | 511 | for the VMA's memory range and munlock_vma_page() each resident page mapped by |
425 | the vma. This effectively munlocks the page, only if this is the last | 512 | the VMA. This effectively munlocks the page, only if this is the last |
426 | VM_LOCKED vma that maps the page. | 513 | VM_LOCKED VMA that maps the page. |
427 | |||
428 | 514 | ||
429 | Mlocked Page: try_to_unmap() | ||
430 | 515 | ||
431 | [Note: the code changes represented by this section are really quite small | 516 | try_to_unmap() |
432 | compared to the text to describe what happening and why, and to discuss the | 517 | -------------- |
433 | implications.] | ||
434 | 518 | ||
435 | Pages can, of course, be mapped into multiple vmas. Some of these vmas may | 519 | Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may |
436 | have the VM_LOCKED flag set. It is possible for a page mapped into one or more | 520 | have the VM_LOCKED flag set. It is possible for a page mapped into one or more |
437 | VM_LOCKED vmas not to have the PG_mlocked flag set and therefore reside on one | 521 | VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one |
438 | of the active or inactive LRU lists. This could happen if, for example, a | 522 | of the active or inactive LRU lists. This could happen if, for example, a task |
439 | task in the process of munlock()ing the page could not isolate the page from | 523 | in the process of munlocking the page could not isolate the page from the LRU. |
440 | the LRU. As a result, vmscan/shrink_page_list() might encounter such a page | 524 | As a result, vmscan/shrink_page_list() might encounter such a page as described |
441 | as described in "Unevictable Pages and Vmscan [shrink_*_list()]". To | 525 | in section "vmscan's handling of unevictable pages". To handle this situation, |
442 | handle this situation, try_to_unmap() has been enhanced to check for VM_LOCKED | 526 | try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse |
443 | vmas while it is walking a page's reverse map. | 527 | map. |
444 | 528 | ||
445 | try_to_unmap() is always called, by either vmscan for reclaim or for page | 529 | try_to_unmap() is always called, by either vmscan for reclaim or for page |
446 | migration, with the argument page locked and isolated from the LRU. BUG_ON() | 530 | migration, with the argument page locked and isolated from the LRU. Separate |
447 | assertions enforce this requirement. Separate functions handle anonymous and | 531 | functions handle anonymous and mapped file pages, as these types of pages have |
448 | mapped file pages, as these types of pages have different reverse map | 532 | different reverse map mechanisms. |
449 | mechanisms. | 533 | |
450 | 534 | (*) try_to_unmap_anon() | |
451 | try_to_unmap_anon() | 535 | |
452 | 536 | To unmap anonymous pages, each VMA in the list anchored in the anon_vma | |
453 | To unmap anonymous pages, each vma in the list anchored in the anon_vma must be | 537 | must be visited - at least until a VM_LOCKED VMA is encountered. If the |
454 | visited--at least until a VM_LOCKED vma is encountered. If the page is being | 538 | page is being unmapped for migration, VM_LOCKED VMAs do not stop the |
455 | unmapped for migration, VM_LOCKED vmas do not stop the process because mlocked | 539 | process because mlocked pages are migratable. However, for reclaim, if |
456 | pages are migratable. However, for reclaim, if the page is mapped into a | 540 | the page is mapped into a VM_LOCKED VMA, the scan stops. |
457 | VM_LOCKED vma, the scan stops. try_to_unmap() attempts to acquire the mmap | 541 | |
458 | semphore of the mm_struct to which the vma belongs in read mode. If this is | 542 | try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of |
459 | successful, try_to_unmap() will mlock the page via mlock_vma_page()--we | 543 | the mm_struct to which the VMA belongs. If this is successful, it will |
460 | wouldn't have gotten to try_to_unmap() if the page were already mlocked--and | 544 | mlock the page via mlock_vma_page() - we wouldn't have gotten to |
461 | will return SWAP_MLOCK, indicating that the page is unevictable. If the | 545 | try_to_unmap_anon() if the page were already mlocked - and will return |
462 | mmap semaphore cannot be acquired, we are not sure whether the page is really | 546 | SWAP_MLOCK, indicating that the page is unevictable. |
463 | unevictable or not. In this case, try_to_unmap() will return SWAP_AGAIN. | 547 | |
464 | 548 | If the mmap semaphore cannot be acquired, we are not sure whether the page | |
465 | try_to_unmap_file() -- linear mappings | 549 | is really unevictable or not. In this case, try_to_unmap_anon() will |
466 | 550 | return SWAP_AGAIN. | |
467 | Unmapping of a mapped file page works the same, except that the scan visits | 551 | |
468 | all vmas that maps the page's index/page offset in the page's mapping's | 552 | (*) try_to_unmap_file() - linear mappings |
469 | reverse map priority search tree. It must also visit each vma in the page's | 553 | |
470 | mapping's non-linear list, if the list is non-empty. As for anonymous pages, | 554 | Unmapping of a mapped file page works the same as for anonymous mappings, |
471 | on encountering a VM_LOCKED vma for a mapped file page, try_to_unmap() will | 555 | except that the scan visits all VMAs that map the page's index/page offset |
472 | attempt to acquire the associated mm_struct's mmap semaphore to mlock the page, | 556 | in the page's mapping's reverse map priority search tree. It also visits |
473 | returning SWAP_MLOCK if this is successful, and SWAP_AGAIN, if not. | 557 | each VMA in the page's mapping's non-linear list, if the list is |
474 | 558 | non-empty. | |
475 | try_to_unmap_file() -- non-linear mappings | 559 | |
476 | 560 | As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file | |
477 | If a page's mapping contains a non-empty non-linear mapping vma list, then | 561 | page, try_to_unmap_file() will attempt to acquire the associated |
478 | try_to_un{map|lock}() must also visit each vma in that list to determine | 562 | mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this |
479 | whether the page is mapped in a VM_LOCKED vma. Again, the scan must visit | 563 | is successful, and SWAP_AGAIN, if not. |
480 | all vmas in the non-linear list to ensure that the pages is not/should not be | 564 | |
481 | mlocked. If a VM_LOCKED vma is found in the list, the scan could terminate. | 565 | (*) try_to_unmap_file() - non-linear mappings |
482 | However, there is no easy way to determine whether the page is actually mapped | 566 | |
483 | in a given vma--either for unmapping or testing whether the VM_LOCKED vma | 567 | If a page's mapping contains a non-empty non-linear mapping VMA list, then |
484 | actually pins the page. | 568 | try_to_un{map|lock}() must also visit each VMA in that list to determine |
485 | 569 | whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit | |
486 | So, try_to_unmap_file() handles non-linear mappings by scanning a certain | 570 | all VMAs in the non-linear list to ensure that the page is not/should not |
487 | number of pages--a "cluster"--in each non-linear vma associated with the page's | 571 | be mlocked. |
488 | mapping, for each file mapped page that vmscan tries to unmap. If this happens | 572 | |
489 | to unmap the page we're trying to unmap, try_to_unmap() will notice this on | 573 | If a VM_LOCKED VMA is found in the list, the scan could terminate. |
490 | return--(page_mapcount(page) == 0)--and return SWAP_SUCCESS. Otherwise, it | 574 | However, there is no easy way to determine whether the page is actually |
491 | will return SWAP_AGAIN, causing vmscan to recirculate this page. We take | 575 | mapped in a given VMA - either for unmapping or testing whether the |
492 | advantage of the cluster scan in try_to_unmap_cluster() as follows: | 576 | VM_LOCKED VMA actually pins the page. |
493 | 577 | ||
494 | For each non-linear vma, try_to_unmap_cluster() attempts to acquire the mmap | 578 | try_to_unmap_file() handles non-linear mappings by scanning a certain |
495 | semaphore of the associated mm_struct for read without blocking. If this | 579 | number of pages - a "cluster" - in each non-linear VMA associated with the |
496 | attempt is successful and the vma is VM_LOCKED, try_to_unmap_cluster() will | 580 | page's mapping, for each file mapped page that vmscan tries to unmap. If |
497 | retain the mmap semaphore for the scan; otherwise it drops it here. Then, | 581 | this happens to unmap the page we're trying to unmap, try_to_unmap() will |
498 | for each page in the cluster, if we're holding the mmap semaphore for a locked | 582 | notice this on return (page_mapcount(page) will be 0) and return |
499 | vma, try_to_unmap_cluster() calls mlock_vma_page() to mlock the page. This | 583 | SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to |
500 | call is a no-op if the page is already locked, but will mlock any pages in | 584 | recirculate this page. We take advantage of the cluster scan in |
501 | the non-linear mapping that happen to be unlocked. If one of the pages so | 585 | try_to_unmap_cluster() as follows: |
502 | mlocked is the page passed in to try_to_unmap(), try_to_unmap_cluster() will | 586 | |
503 | return SWAP_MLOCK, rather than the default SWAP_AGAIN. This will allow vmscan | 587 | For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the |
504 | to cull the page, rather than recirculating it on the inactive list. Again, | 588 | mmap semaphore of the associated mm_struct for read without blocking. |
505 | if try_to_unmap_cluster() cannot acquire the vma's mmap sem, it returns | 589 | |
506 | SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED vma, but | 590 | If this attempt is successful and the VMA is VM_LOCKED, |
507 | couldn't be mlocked. | 591 | try_to_unmap_cluster() will retain the mmap semaphore for the scan; |
508 | 592 | otherwise it drops it here. | |
509 | 593 | ||
510 | Mlocked pages: try_to_munlock() Reverse Map Scan | 594 | Then, for each page in the cluster, if we're holding the mmap semaphore |
511 | 595 | for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to | |
512 | TODO/FIXME: a better name might be page_mlocked()--analogous to the | 596 | mlock the page. This call is a no-op if the page is already locked, |
513 | page_referenced() reverse map walker. | 597 | but will mlock any pages in the non-linear mapping that happen to be |
514 | 598 | unlocked. | |
515 | When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() | 599 | |
516 | System Call Handling" above--tries to munlock a page, it needs to | 600 | If one of the pages so mlocked is the page passed in to try_to_unmap(), |
517 | determine whether or not the page is mapped by any VM_LOCKED vma, without | 601 | try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default |
518 | actually attempting to unmap all ptes from the page. For this purpose, the | 602 | SWAP_AGAIN. This will allow vmscan to cull the page, rather than |
519 | unevictable/mlock infrastructure introduced a variant of try_to_unmap() called | 603 | recirculating it on the inactive list. |
520 | try_to_munlock(). | 604 | |
605 | Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it | ||
606 | returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED | ||
607 | VMA, but couldn't be mlocked. | ||
608 | |||
609 | |||
610 | try_to_munlock() REVERSE MAP SCAN | ||
611 | --------------------------------- | ||
612 | |||
613 | [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the | ||
614 | page_referenced() reverse map walker. | ||
615 | |||
616 | When munlock_vma_page() [see section "munlock()/munlockall() System Call | ||
617 | Handling" above] tries to munlock a page, it needs to determine whether or not | ||
618 | the page is mapped by any VM_LOCKED VMA without actually attempting to unmap | ||
619 | all PTEs from the page. For this purpose, the unevictable/mlock infrastructure | ||
620 | introduced a variant of try_to_unmap() called try_to_munlock(). | ||
521 | 621 | ||
522 | try_to_munlock() calls the same functions as try_to_unmap() for anonymous and | 622 | try_to_munlock() calls the same functions as try_to_unmap() for anonymous and |
523 | mapped file pages with an additional argument specifying unlock versus unmap | 623 | mapped file pages with an additional argument specifying unlock versus unmap |
524 | processing. Again, these functions walk the respective reverse maps looking | 624 | processing. Again, these functions walk the respective reverse maps looking |
525 | for VM_LOCKED vmas. When such a vma is found for anonymous pages and file | 625 | for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file |
526 | pages mapped in linear VMAs, as in the try_to_unmap() case, the functions | 626 | pages mapped in linear VMAs, as in the try_to_unmap() case, the functions |
527 | attempt to acquire the associated mmap semaphore, mlock the page via | 627 | attempt to acquire the associated mmap semaphore, mlock the page via |
528 | mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the | 628 | mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the |
529 | pre-clearing of the page's PG_mlocked done by munlock_vma_page(). | 629 | pre-clearing of the page's PG_mlocked done by munlock_vma_page(). |
530 | 630 | ||
531 | If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap | 631 | If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap |
532 | semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() | 632 | semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to |
533 | to recycle the page on the inactive list and hope that it has better luck | 633 | recycle the page on the inactive list and hope that it has better luck with the |
534 | with the page next time. | 634 | page next time. |
535 | 635 | ||
536 | For file pages mapped into non-linear vmas, the try_to_munlock() logic works | 636 | For file pages mapped into non-linear VMAs, the try_to_munlock() logic works |
537 | slightly differently. On encountering a VM_LOCKED non-linear vma that might | 637 | slightly differently. On encountering a VM_LOCKED non-linear VMA that might |
538 | map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking | 638 | map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the |
539 | the page. munlock_vma_page() will just leave the page unlocked and let | 639 | page. munlock_vma_page() will just leave the page unlocked and let vmscan deal |
540 | vmscan deal with it--the usual fallback position. | 640 | with it - the usual fallback position. |
541 | 641 | ||
542 | Note that try_to_munlock()'s reverse map walk must visit every vma in a pages' | 642 | Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's |
543 | reverse map to determine that a page is NOT mapped into any VM_LOCKED vma. | 643 | reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA. |
544 | However, the scan can terminate when it encounters a VM_LOCKED vma and can | 644 | However, the scan can terminate when it encounters a VM_LOCKED VMA and can |
545 | successfully acquire the vma's mmap semaphore for read and mlock the page. | 645 | successfully acquire the VMA's mmap semaphore for read and mlock the page. |
546 | Although try_to_munlock() can be called many [very many!] times when | 646 | Although try_to_munlock() might be called a great many times when munlocking a |
547 | munlock()ing a large region or tearing down a large address space that has been | 647 | large region or tearing down a large address space that has been mlocked via |
548 | mlocked via mlockall(), overall this is a fairly rare event. | 648 | mlockall(), overall this is a fairly rare event. |
549 | 649 | ||
550 | Mlocked Page: Page Reclaim in shrink_*_list() | 650 | |
551 | 651 | PAGE RECLAIM IN shrink_*_list() | |
552 | shrink_active_list() culls any obviously unevictable pages--i.e., | 652 | ------------------------------- |
553 | !page_evictable(page, NULL)--diverting these to the unevictable lru | 653 | |
554 | list. However, shrink_active_list() only sees unevictable pages that | 654 | shrink_active_list() culls any obviously unevictable pages - i.e. |
555 | made it onto the active/inactive lru lists. Note that these pages do not | 655 | !page_evictable(page, NULL) - diverting these to the unevictable list. |
556 | have PageUnevictable set--otherwise, they would be on the unevictable list and | 656 | However, shrink_active_list() only sees unevictable pages that made it onto the |
557 | shrink_active_list would never see them. | 657 | active/inactive lru lists. Note that these pages do not have PageUnevictable |
658 | set - otherwise they would be on the unevictable list and shrink_active_list | ||
659 | would never see them. | ||
558 | 660 | ||
559 | Some examples of these unevictable pages on the LRU lists are: | 661 | Some examples of these unevictable pages on the LRU lists are: |
560 | 662 | ||
561 | 1) ramfs pages that have been placed on the lru lists when first allocated. | 663 | (1) ramfs pages that have been placed on the LRU lists when first allocated. |
664 | |||
665 | (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to | ||
666 | allocate or fault in the pages in the shared memory region. This happens | ||
667 | when an application accesses the page the first time after SHM_LOCK'ing | ||
668 | the segment. | ||
562 | 669 | ||
563 | 2) SHM_LOCKed shared memory pages. shmctl(SHM_LOCK) does not attempt to | 670 | (3) mlocked pages that could not be isolated from the LRU and moved to the |
564 | allocate or fault in the pages in the shared memory region. This happens | 671 | unevictable list in mlock_vma_page(). |
565 | when an application accesses the page the first time after SHM_LOCKing | ||
566 | the segment. | ||
567 | 672 | ||
568 | 3) Mlocked pages that could not be isolated from the lru and moved to the | 673 | (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't |
569 | unevictable list in mlock_vma_page(). | 674 | acquire the VMA's mmap semaphore to test the flags and set PageMlocked. |
675 | munlock_vma_page() was forced to let the page back on to the normal LRU | ||
676 | list for vmscan to handle. | ||
570 | 677 | ||
571 | 3) Pages mapped into multiple VM_LOCKED vmas, but try_to_munlock() couldn't | 678 | shrink_inactive_list() also diverts any unevictable pages that it finds on the |
572 | acquire the vma's mmap semaphore to test the flags and set PageMlocked. | 679 | inactive lists to the appropriate zone's unevictable list. |
573 | munlock_vma_page() was forced to let the page back on to the normal | ||
574 | LRU list for vmscan to handle. | ||
575 | 680 | ||
576 | shrink_inactive_list() also culls any unevictable pages that it finds on | 681 | shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd |
577 | the inactive lists, again diverting them to the appropriate zone's unevictable | 682 | after shrink_active_list() had moved them to the inactive list, or pages mapped |
578 | lru list. shrink_inactive_list() should only see SHM_LOCKed pages that became | 683 | into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to |
579 | SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or | 684 | recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter, |
580 | pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from | 685 | but will pass on to shrink_page_list(). |
581 | the lru to recheck via try_to_munlock(). shrink_inactive_list() won't notice | ||
582 | the latter, but will pass on to shrink_page_list(). | ||
583 | 686 | ||
584 | shrink_page_list() again culls obviously unevictable pages that it could | 687 | shrink_page_list() again culls obviously unevictable pages that it could |
585 | encounter for similar reasons to shrink_inactive_list(). Pages mapped into | 688 | encounter for similar reasons to shrink_inactive_list(). Pages mapped into |
586 | VM_LOCKED vmas but without PG_mlocked set will make it all the way to | 689 | VM_LOCKED VMAs but without PG_mlocked set will make it all the way to |
587 | try_to_unmap(). shrink_page_list() will divert them to the unevictable list | 690 | try_to_unmap(). shrink_page_list() will divert them to the unevictable list |
588 | when try_to_unmap() returns SWAP_MLOCK, as discussed above. | 691 | when try_to_unmap() returns SWAP_MLOCK, as discussed above. |