diff options
author | Paul Mackerras <paulus@samba.org> | 2009-04-21 23:02:09 -0400 |
---|---|---|
committer | Paul Mackerras <paulus@samba.org> | 2009-04-21 23:02:09 -0400 |
commit | 5bd3ef84d73c2ea7b4babbad060909753c4828d4 (patch) | |
tree | fdf2bafb48ae1ed03175f6c77a7548a181e69ee9 /Documentation | |
parent | 0658c16056660886ea2f35c4f038be70a94b1532 (diff) | |
parent | 6d25b688ecc488753af3c9e6f6a9a575b863cf37 (diff) |
Merge branch 'merge' of git://git.secretlab.ca/git/linux-2.6 into merge
Diffstat (limited to 'Documentation')
43 files changed, 2390 insertions, 929 deletions
diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd index bf9c16b64c34..cf11736acb76 100644 --- a/Documentation/ABI/testing/debugfs-pktcdvd +++ b/Documentation/ABI/testing/debugfs-pktcdvd | |||
@@ -1,4 +1,4 @@ | |||
1 | What: /debug/pktcdvd/pktcdvd[0-7] | 1 | What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7] |
2 | Date: Oct. 2006 | 2 | Date: Oct. 2006 |
3 | KernelVersion: 2.6.20 | 3 | KernelVersion: 2.6.20 |
4 | Contact: Thomas Maier <balagi@justmail.de> | 4 | Contact: Thomas Maier <balagi@justmail.de> |
@@ -10,10 +10,10 @@ debugfs interface | |||
10 | The pktcdvd module (packet writing driver) creates | 10 | The pktcdvd module (packet writing driver) creates |
11 | these files in debugfs: | 11 | these files in debugfs: |
12 | 12 | ||
13 | /debug/pktcdvd/pktcdvd[0-7]/ | 13 | /sys/kernel/debug/pktcdvd/pktcdvd[0-7]/ |
14 | info (0444) Lots of driver statistics and infos. | 14 | info (0444) Lots of driver statistics and infos. |
15 | 15 | ||
16 | Example: | 16 | Example: |
17 | ------- | 17 | ------- |
18 | 18 | ||
19 | cat /debug/pktcdvd/pktcdvd0/info | 19 | cat /sys/kernel/debug/pktcdvd/pktcdvd0/info |
diff --git a/Documentation/DMA-mapping.txt b/Documentation/DMA-mapping.txt index b2a4d6d244d9..01f24e94bdb6 100644 --- a/Documentation/DMA-mapping.txt +++ b/Documentation/DMA-mapping.txt | |||
@@ -136,7 +136,7 @@ exactly why. | |||
136 | The standard 32-bit addressing PCI device would do something like | 136 | The standard 32-bit addressing PCI device would do something like |
137 | this: | 137 | this: |
138 | 138 | ||
139 | if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { | 139 | if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { |
140 | printk(KERN_WARNING | 140 | printk(KERN_WARNING |
141 | "mydev: No suitable DMA available.\n"); | 141 | "mydev: No suitable DMA available.\n"); |
142 | goto ignore_this_device; | 142 | goto ignore_this_device; |
@@ -155,9 +155,9 @@ all 64-bits when accessing streaming DMA: | |||
155 | 155 | ||
156 | int using_dac; | 156 | int using_dac; |
157 | 157 | ||
158 | if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) { | 158 | if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { |
159 | using_dac = 1; | 159 | using_dac = 1; |
160 | } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { | 160 | } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { |
161 | using_dac = 0; | 161 | using_dac = 0; |
162 | } else { | 162 | } else { |
163 | printk(KERN_WARNING | 163 | printk(KERN_WARNING |
@@ -170,14 +170,14 @@ the case would look like this: | |||
170 | 170 | ||
171 | int using_dac, consistent_using_dac; | 171 | int using_dac, consistent_using_dac; |
172 | 172 | ||
173 | if (!pci_set_dma_mask(pdev, DMA_64BIT_MASK)) { | 173 | if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { |
174 | using_dac = 1; | 174 | using_dac = 1; |
175 | consistent_using_dac = 1; | 175 | consistent_using_dac = 1; |
176 | pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK); | 176 | pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); |
177 | } else if (!pci_set_dma_mask(pdev, DMA_32BIT_MASK)) { | 177 | } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) { |
178 | using_dac = 0; | 178 | using_dac = 0; |
179 | consistent_using_dac = 0; | 179 | consistent_using_dac = 0; |
180 | pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); | 180 | pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); |
181 | } else { | 181 | } else { |
182 | printk(KERN_WARNING | 182 | printk(KERN_WARNING |
183 | "mydev: No suitable DMA available.\n"); | 183 | "mydev: No suitable DMA available.\n"); |
@@ -192,7 +192,7 @@ check the return value from pci_set_consistent_dma_mask(). | |||
192 | Finally, if your device can only drive the low 24-bits of | 192 | Finally, if your device can only drive the low 24-bits of |
193 | address during PCI bus mastering you might do something like: | 193 | address during PCI bus mastering you might do something like: |
194 | 194 | ||
195 | if (pci_set_dma_mask(pdev, DMA_24BIT_MASK)) { | 195 | if (pci_set_dma_mask(pdev, DMA_BIT_MASK(24))) { |
196 | printk(KERN_WARNING | 196 | printk(KERN_WARNING |
197 | "mydev: 24-bit DMA addressing not available.\n"); | 197 | "mydev: 24-bit DMA addressing not available.\n"); |
198 | goto ignore_this_device; | 198 | goto ignore_this_device; |
@@ -213,7 +213,7 @@ most specific mask. | |||
213 | 213 | ||
214 | Here is pseudo-code showing how this might be done: | 214 | Here is pseudo-code showing how this might be done: |
215 | 215 | ||
216 | #define PLAYBACK_ADDRESS_BITS DMA_32BIT_MASK | 216 | #define PLAYBACK_ADDRESS_BITS DMA_BIT_MASK(32) |
217 | #define RECORD_ADDRESS_BITS 0x00ffffff | 217 | #define RECORD_ADDRESS_BITS 0x00ffffff |
218 | 218 | ||
219 | struct my_sound_card *card; | 219 | struct my_sound_card *card; |
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index a3a83d38f96f..8918a32c6b3a 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile | |||
@@ -31,7 +31,7 @@ PS_METHOD = $(prefer-db2x) | |||
31 | 31 | ||
32 | ### | 32 | ### |
33 | # The targets that may be used. | 33 | # The targets that may be used. |
34 | PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs | 34 | PHONY += xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs cleandocs |
35 | 35 | ||
36 | BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) | 36 | BOOKS := $(addprefix $(obj)/,$(DOCBOOKS)) |
37 | xmldocs: $(BOOKS) | 37 | xmldocs: $(BOOKS) |
@@ -213,11 +213,12 @@ silent_gen_xml = : | |||
213 | dochelp: | 213 | dochelp: |
214 | @echo ' Linux kernel internal documentation in different formats:' | 214 | @echo ' Linux kernel internal documentation in different formats:' |
215 | @echo ' htmldocs - HTML' | 215 | @echo ' htmldocs - HTML' |
216 | @echo ' installmandocs - install man pages generated by mandocs' | ||
217 | @echo ' mandocs - man pages' | ||
218 | @echo ' pdfdocs - PDF' | 216 | @echo ' pdfdocs - PDF' |
219 | @echo ' psdocs - Postscript' | 217 | @echo ' psdocs - Postscript' |
220 | @echo ' xmldocs - XML DocBook' | 218 | @echo ' xmldocs - XML DocBook' |
219 | @echo ' mandocs - man pages' | ||
220 | @echo ' installmandocs - install man pages generated by mandocs' | ||
221 | @echo ' cleandocs - clean all generated DocBook files' | ||
221 | 222 | ||
222 | ### | 223 | ### |
223 | # Temporary files left by various tools | 224 | # Temporary files left by various tools |
@@ -235,6 +236,10 @@ clean-files := $(DOCBOOKS) \ | |||
235 | 236 | ||
236 | clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man | 237 | clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man |
237 | 238 | ||
239 | cleandocs: | ||
240 | $(Q)rm -f $(call objectify, $(clean-files)) | ||
241 | $(Q)rm -rf $(call objectify, $(clean-dirs)) | ||
242 | |||
238 | # Declare the contents of the .PHONY variable as phony. We keep that | 243 | # Declare the contents of the .PHONY variable as phony. We keep that |
239 | # information in a variable so we can use it in if_changed and friends. | 244 | # information in a variable so we can use it in if_changed and friends. |
240 | 245 | ||
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index 58c194572c76..d6ac5d61820e 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl | |||
@@ -259,7 +259,7 @@ X!Earch/x86/kernel/mca_32.c | |||
259 | !Eblock/blk-tag.c | 259 | !Eblock/blk-tag.c |
260 | !Iblock/blk-tag.c | 260 | !Iblock/blk-tag.c |
261 | !Eblock/blk-integrity.c | 261 | !Eblock/blk-integrity.c |
262 | !Iblock/blktrace.c | 262 | !Ikernel/trace/blktrace.c |
263 | !Iblock/genhd.c | 263 | !Iblock/genhd.c |
264 | !Eblock/genhd.c | 264 | !Eblock/genhd.c |
265 | </chapter> | 265 | </chapter> |
diff --git a/Documentation/DocBook/writing-an-alsa-driver.tmpl b/Documentation/DocBook/writing-an-alsa-driver.tmpl index 46b08fef3744..7a2e0e98986a 100644 --- a/Documentation/DocBook/writing-an-alsa-driver.tmpl +++ b/Documentation/DocBook/writing-an-alsa-driver.tmpl | |||
@@ -1137,8 +1137,8 @@ | |||
1137 | if (err < 0) | 1137 | if (err < 0) |
1138 | return err; | 1138 | return err; |
1139 | /* check PCI availability (28bit DMA) */ | 1139 | /* check PCI availability (28bit DMA) */ |
1140 | if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || | 1140 | if (pci_set_dma_mask(pci, DMA_BIT_MASK(28)) < 0 || |
1141 | pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { | 1141 | pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(28)) < 0) { |
1142 | printk(KERN_ERR "error to set 28bit mask DMA\n"); | 1142 | printk(KERN_ERR "error to set 28bit mask DMA\n"); |
1143 | pci_disable_device(pci); | 1143 | pci_disable_device(pci); |
1144 | return -ENXIO; | 1144 | return -ENXIO; |
@@ -1252,8 +1252,8 @@ | |||
1252 | err = pci_enable_device(pci); | 1252 | err = pci_enable_device(pci); |
1253 | if (err < 0) | 1253 | if (err < 0) |
1254 | return err; | 1254 | return err; |
1255 | if (pci_set_dma_mask(pci, DMA_28BIT_MASK) < 0 || | 1255 | if (pci_set_dma_mask(pci, DMA_BIT_MASK(28)) < 0 || |
1256 | pci_set_consistent_dma_mask(pci, DMA_28BIT_MASK) < 0) { | 1256 | pci_set_consistent_dma_mask(pci, DMA_BIT_MASK(28)) < 0) { |
1257 | printk(KERN_ERR "error to set 28bit mask DMA\n"); | 1257 | printk(KERN_ERR "error to set 28bit mask DMA\n"); |
1258 | pci_disable_device(pci); | 1258 | pci_disable_device(pci); |
1259 | return -ENXIO; | 1259 | return -ENXIO; |
diff --git a/Documentation/blockdev/00-INDEX b/Documentation/blockdev/00-INDEX index 86f054c47013..c08df56dd91b 100644 --- a/Documentation/blockdev/00-INDEX +++ b/Documentation/blockdev/00-INDEX | |||
@@ -8,6 +8,8 @@ cpqarray.txt | |||
8 | - info on using Compaq's SMART2 Intelligent Disk Array Controllers. | 8 | - info on using Compaq's SMART2 Intelligent Disk Array Controllers. |
9 | floppy.txt | 9 | floppy.txt |
10 | - notes and driver options for the floppy disk driver. | 10 | - notes and driver options for the floppy disk driver. |
11 | mflash.txt | ||
12 | - info on mGine m(g)flash driver for linux. | ||
11 | nbd.txt | 13 | nbd.txt |
12 | - info on a TCP implementation of a network block device. | 14 | - info on a TCP implementation of a network block device. |
13 | paride.txt | 15 | paride.txt |
diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt new file mode 100644 index 000000000000..1f610ecf698a --- /dev/null +++ b/Documentation/blockdev/mflash.txt | |||
@@ -0,0 +1,84 @@ | |||
1 | This document describes m[g]flash support in linux. | ||
2 | |||
3 | Contents | ||
4 | 1. Overview | ||
5 | 2. Reserved area configuration | ||
6 | 3. Example of mflash platform driver registration | ||
7 | |||
8 | 1. Overview | ||
9 | |||
10 | Mflash and gflash are embedded flash drives. The only difference is that mflash is an | ||
11 | MCP (Multi Chip Package) device. These two devices operate in exactly the same way, | ||
12 | so in the rest of this document "mflash" represents both mflash and gflash. | ||
13 | |||
14 | Internally, mflash has NAND flash and other hardware logic, and supports | ||
15 | 2 different operation modes (ATA, IO). ATA mode doesn't need any new | ||
16 | driver and currently works well under the standard IDE subsystem. Actually it's | ||
17 | a one-chip SSD. IO mode is an ATA-like custom mode for hosts that don't have | ||
18 | an IDE interface. | ||
19 | |||
20 | The following are brief descriptions of IO mode. | ||
21 | A. IO mode is based on the ATA protocol and uses some custom commands (read confirm, | ||
22 | write confirm). | ||
23 | B. IO mode uses SRAM bus interface. | ||
24 | C. IO mode supports 4kB boot area, so host can boot from mflash. | ||
25 | |||
26 | 2. Reserved area configuration | ||
27 | If the host boots from mflash, it usually needs a raw area for the boot loader image. All of | ||
28 | the mflash's block device operations will take the configured reserved area size as their start offset. | ||
29 | Note that the boot loader's reserved area size and the kernel configuration value | ||
30 | must be the same. | ||
31 | |||
32 | 3. Example of mflash platform driver registration | ||
33 | Getting mflash working is very straightforward. All that is needed is to add the platform | ||
34 | device definitions to the board configuration file. Here is a pseudo-code example. | ||
35 | |||
36 | static struct mg_drv_data mflash_drv_data = { | ||
37 | /* Set to 1 if you want the driver to use polling */ | ||
38 | .use_polling = 0, | ||
39 | /* device attribution */ | ||
40 | .dev_attr = MG_BOOT_DEV | ||
41 | }; | ||
42 | |||
43 | static struct resource mg_mflash_rsc[] = { | ||
44 | /* Base address of mflash */ | ||
45 | [0] = { | ||
46 | .start = 0x08000000, | ||
47 | .end = 0x08000000 + SZ_64K - 1, | ||
48 | .flags = IORESOURCE_MEM | ||
49 | }, | ||
50 | /* mflash interrupt pin */ | ||
51 | [1] = { | ||
52 | .start = IRQ_GPIO(84), | ||
53 | .end = IRQ_GPIO(84), | ||
54 | .flags = IORESOURCE_IRQ | ||
55 | }, | ||
56 | /* mflash reset pin */ | ||
57 | [2] = { | ||
58 | .start = 43, | ||
59 | .end = 43, | ||
60 | .name = MG_RST_PIN, | ||
61 | .flags = IORESOURCE_IO | ||
62 | }, | ||
63 | /* mflash reset-out pin | ||
64 | * If you use mflash as a storage device (i.e. other than MG_BOOT_DEV), | ||
65 | * you should assign this */ | ||
66 | [3] = { | ||
67 | .start = 51, | ||
68 | .end = 51, | ||
69 | .name = MG_RSTOUT_PIN, | ||
70 | .flags = IORESOURCE_IO | ||
71 | } | ||
72 | }; | ||
73 | |||
74 | static struct platform_device mflash_dev = { | ||
75 | .name = MG_DEV_NAME, | ||
76 | .id = -1, | ||
77 | .dev = { | ||
78 | .platform_data = &mflash_drv_data, | ||
79 | }, | ||
80 | .num_resources = ARRAY_SIZE(mg_mflash_rsc), | ||
81 | .resource = mg_mflash_rsc | ||
82 | }; | ||
83 | |||
84 | platform_device_register(&mflash_dev); | ||
diff --git a/Documentation/cgroups/cpuacct.txt b/Documentation/cgroups/cpuacct.txt index bb775fbe43d7..8b930946c52a 100644 --- a/Documentation/cgroups/cpuacct.txt +++ b/Documentation/cgroups/cpuacct.txt | |||
@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell | |||
30 | process (bash) into it. CPU time consumed by this bash and its children | 30 | process (bash) into it. CPU time consumed by this bash and its children |
31 | can be obtained from g1/cpuacct.usage and the same is accumulated in | 31 | can be obtained from g1/cpuacct.usage and the same is accumulated in |
32 | /cgroups/cpuacct.usage also. | 32 | /cgroups/cpuacct.usage also. |
33 | |||
34 | cpuacct.stat file lists a few statistics which further divide the | ||
35 | CPU time obtained by the cgroup into user and system times. Currently | ||
36 | the following statistics are supported: | ||
37 | |||
38 | user: Time spent by tasks of the cgroup in user mode. | ||
39 | system: Time spent by tasks of the cgroup in kernel mode. | ||
40 | |||
41 | user and system are in USER_HZ unit. | ||
42 | |||
43 | cpuacct controller uses percpu_counter interface to collect user and | ||
44 | system times. This has two side effects: | ||
45 | |||
46 | - It is theoretically possible to see wrong values for user and system times. | ||
47 | This is because percpu_counter_read() on 32bit systems isn't safe | ||
48 | against concurrent writes. | ||
49 | - It is possible to see slightly outdated values for user and system times | ||
50 | due to the batch processing nature of percpu_counter. | ||
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index a98a7fe7aabb..1a608877b14e 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt | |||
@@ -6,15 +6,14 @@ used here with the memory controller that is used in hardware. | |||
6 | 6 | ||
7 | Salient features | 7 | Salient features |
8 | 8 | ||
9 | a. Enable control of both RSS (mapped) and Page Cache (unmapped) pages | 9 | a. Enable control of Anonymous, Page Cache (mapped and unmapped) and |
10 | Swap Cache memory pages. | ||
10 | b. The infrastructure allows easy addition of other types of memory to control | 11 | b. The infrastructure allows easy addition of other types of memory to control |
11 | c. Provides *zero overhead* for non memory controller users | 12 | c. Provides *zero overhead* for non memory controller users |
12 | d. Provides a double LRU: global memory pressure causes reclaim from the | 13 | d. Provides a double LRU: global memory pressure causes reclaim from the |
13 | global LRU; a cgroup on hitting a limit, reclaims from the per | 14 | global LRU; a cgroup on hitting a limit, reclaims from the per |
14 | cgroup LRU | 15 | cgroup LRU |
15 | 16 | ||
16 | NOTE: Swap Cache (unmapped) is not accounted now. | ||
17 | |||
18 | Benefits and Purpose of the memory controller | 17 | Benefits and Purpose of the memory controller |
19 | 18 | ||
20 | The memory controller isolates the memory behaviour of a group of tasks | 19 | The memory controller isolates the memory behaviour of a group of tasks |
@@ -290,34 +289,44 @@ will be charged as a new owner of it. | |||
290 | moved to the parent. If you want to avoid that, force_empty will be useful. | 289 | moved to the parent. If you want to avoid that, force_empty will be useful. |
291 | 290 | ||
292 | 5.2 stat file | 291 | 5.2 stat file |
293 | memory.stat file includes following statistics (now) | 292 | |
294 | cache - # of pages from page-cache and shmem. | 293 | memory.stat file includes following statistics |
295 | rss - # of pages from anonymous memory. | 294 | |
296 | pgpgin - # of event of charging | 295 | cache - # of bytes of page cache memory. |
297 | pgpgout - # of event of uncharging | 296 | rss - # of bytes of anonymous and swap cache memory. |
298 | active_anon - # of pages on active lru of anon, shmem. | 297 | pgpgin - # of pages paged in (equivalent to # of charging events). |
299 | inactive_anon - # of pages on active lru of anon, shmem | 298 | pgpgout - # of pages paged out (equivalent to # of uncharging events). |
300 | active_file - # of pages on active lru of file-cache | 299 | active_anon - # of bytes of anonymous and swap cache memory on active |
301 | inactive_file - # of pages on inactive lru of file cache | 300 | lru list. |
302 | unevictable - # of pages cannot be reclaimed.(mlocked etc) | 301 | inactive_anon - # of bytes of anonymous memory and swap cache memory on |
303 | 302 | inactive lru list. | |
304 | Below is depend on CONFIG_DEBUG_VM. | 303 | active_file - # of bytes of file-backed memory on active lru list. |
305 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) | 304 | inactive_file - # of bytes of file-backed memory on inactive lru list. |
306 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) | 305 | unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc). |
307 | recent_rotated_file - VM internal parameter. (see mm/vmscan.c) | 306 | |
308 | recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) | 307 | The following additional stats are dependent on CONFIG_DEBUG_VM. |
309 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) | 308 | |
310 | 309 | inactive_ratio - VM internal parameter. (see mm/page_alloc.c) | |
311 | Memo: | 310 | recent_rotated_anon - VM internal parameter. (see mm/vmscan.c) |
311 | recent_rotated_file - VM internal parameter. (see mm/vmscan.c) | ||
312 | recent_scanned_anon - VM internal parameter. (see mm/vmscan.c) | ||
313 | recent_scanned_file - VM internal parameter. (see mm/vmscan.c) | ||
314 | |||
315 | Memo: | ||
312 | recent_rotated means recent frequency of lru rotation. | 316 | recent_rotated means recent frequency of lru rotation. |
313 | recent_scanned means recent # of scans to lru. | 317 | recent_scanned means recent # of scans to lru. |
314 | These are shown for better debugging; please see the code for their meanings. | 318 | These are shown for better debugging; please see the code for their meanings. |
315 | 319 | ||
320 | Note: | ||
321 | Only anonymous and swap cache memory is listed as part of 'rss' stat. | ||
322 | This should not be confused with the true 'resident set size' or the | ||
323 | amount of physical memory used by the cgroup. Per-cgroup rss | ||
324 | accounting is not done yet. | ||
316 | 325 | ||
317 | 5.3 swappiness | 326 | 5.3 swappiness |
318 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. | 327 | Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. |
319 | 328 | ||
320 | Following cgroup's swappiness can't be changed. | 329 | Following cgroups' swappiness can't be changed. |
321 | - root cgroup (uses /proc/sys/vm/swappiness). | 330 | - root cgroup (uses /proc/sys/vm/swappiness). |
322 | - a cgroup which uses hierarchy and it has child cgroup. | 331 | - a cgroup which uses hierarchy and it has child cgroup. |
323 | - a cgroup which uses hierarchy and not the root of hierarchy. | 332 | - a cgroup which uses hierarchy and not the root of hierarchy. |
diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt index f196ac1d7d25..95b24d766eab 100644 --- a/Documentation/cgroups/resource_counter.txt +++ b/Documentation/cgroups/resource_counter.txt | |||
@@ -47,13 +47,18 @@ to work with it. | |||
47 | 47 | ||
48 | 2. Basic accounting routines | 48 | 2. Basic accounting routines |
49 | 49 | ||
50 | a. void res_counter_init(struct res_counter *rc) | 50 | a. void res_counter_init(struct res_counter *rc, |
51 | struct res_counter *rc_parent) | ||
51 | 52 | ||
52 | Initializes the resource counter. As usual, should be the first | 53 | Initializes the resource counter. As usual, should be the first |
53 | routine called for a new counter. | 54 | routine called for a new counter. |
54 | 55 | ||
55 | b. int res_counter_charge[_locked] | 56 | The struct res_counter *rc_parent can be used to define a hierarchical |
56 | (struct res_counter *rc, unsigned long val) | 57 | child -> parent relationship directly in the res_counter structure, |
58 | NULL can be used to define no relationship. | ||
59 | |||
60 | c. int res_counter_charge(struct res_counter *rc, unsigned long val, | ||
61 | struct res_counter **limit_fail_at) | ||
57 | 62 | ||
58 | When a resource is about to be allocated it has to be accounted | 63 | When a resource is about to be allocated it has to be accounted |
59 | with the appropriate resource counter (controller should determine | 64 | with the appropriate resource counter (controller should determine |
@@ -67,15 +72,25 @@ to work with it. | |||
67 | * if the charging is performed first, then it should be uncharged | 72 | * if the charging is performed first, then it should be uncharged |
68 | on error path (if the one is called). | 73 | on error path (if the one is called). |
69 | 74 | ||
70 | c. void res_counter_uncharge[_locked] | 75 | If the charging fails and a hierarchical dependency exists, the |
76 | limit_fail_at parameter is set to the particular res_counter element | ||
77 | where the charging failed. | ||
78 | |||
79 | d. int res_counter_charge_locked | ||
80 | (struct res_counter *rc, unsigned long val) | ||
81 | |||
82 | The same as res_counter_charge(), but it must not acquire/release the | ||
83 | res_counter->lock internally (it must be called with res_counter->lock | ||
84 | held). | ||
85 | |||
86 | e. void res_counter_uncharge[_locked] | ||
71 | (struct res_counter *rc, unsigned long val) | 87 | (struct res_counter *rc, unsigned long val) |
72 | 88 | ||
73 | When a resource is released (freed) it should be de-accounted | 89 | When a resource is released (freed) it should be de-accounted |
74 | from the resource counter it was accounted to. This is called | 90 | from the resource counter it was accounted to. This is called |
75 | "uncharging". | 91 | "uncharging". |
76 | 92 | ||
77 | The _locked routines imply that the res_counter->lock is taken. | 93 | The _locked routines imply that the res_counter->lock is taken. |
78 | |||
79 | 94 | ||
80 | 2.1 Other accounting routines | 95 | 2.1 Other accounting routines |
81 | 96 | ||
diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 327de1624759..53d64d382343 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt | |||
@@ -3,7 +3,7 @@ | |||
3 | 3 | ||
4 | Maintained by Alan Cox <device@lanana.org> | 4 | Maintained by Alan Cox <device@lanana.org> |
5 | 5 | ||
6 | Last revised: 29 November 2006 | 6 | Last revised: 6th April 2009 |
7 | 7 | ||
8 | This list is the Linux Device List, the official registry of allocated | 8 | This list is the Linux Device List, the official registry of allocated |
9 | device numbers and /dev directory nodes for the Linux operating | 9 | device numbers and /dev directory nodes for the Linux operating |
@@ -2797,6 +2797,10 @@ Your cooperation is appreciated. | |||
2797 | 206 = /dev/ttySC1 SC26xx serial port 1 | 2797 | 206 = /dev/ttySC1 SC26xx serial port 1 |
2798 | 207 = /dev/ttySC2 SC26xx serial port 2 | 2798 | 207 = /dev/ttySC2 SC26xx serial port 2 |
2799 | 208 = /dev/ttySC3 SC26xx serial port 3 | 2799 | 208 = /dev/ttySC3 SC26xx serial port 3 |
2800 | 209 = /dev/ttyMAX0 MAX3100 serial port 0 | ||
2801 | 210 = /dev/ttyMAX1 MAX3100 serial port 1 | ||
2802 | 211 = /dev/ttyMAX2 MAX3100 serial port 2 | ||
2803 | 212 = /dev/ttyMAX3 MAX3100 serial port 3 | ||
2800 | 2804 | ||
2801 | 205 char Low-density serial ports (alternate device) | 2805 | 205 char Low-density serial ports (alternate device) |
2802 | 0 = /dev/culu0 Callout device for ttyLU0 | 2806 | 0 = /dev/culu0 Callout device for ttyLU0 |
diff --git a/Documentation/fb/uvesafb.txt b/Documentation/fb/uvesafb.txt index 7ac3c4078ff9..eefdd91d298a 100644 --- a/Documentation/fb/uvesafb.txt +++ b/Documentation/fb/uvesafb.txt | |||
@@ -59,7 +59,8 @@ Accepted options: | |||
59 | ypan Enable display panning using the VESA protected mode | 59 | ypan Enable display panning using the VESA protected mode |
60 | interface. The visible screen is just a window of the | 60 | interface. The visible screen is just a window of the |
61 | video memory, console scrolling is done by changing the | 61 | video memory, console scrolling is done by changing the |
62 | start of the window. Available on x86 only. | 62 | start of the window. This option is available on x86 |
63 | only and is the default option on that architecture. | ||
63 | 64 | ||
64 | ywrap Same as ypan, but assumes your gfx board can wrap-around | 65 | ywrap Same as ypan, but assumes your gfx board can wrap-around |
65 | the video memory (i.e. starts reading from top if it | 66 | the video memory (i.e. starts reading from top if it |
@@ -67,7 +68,7 @@ ywrap Same as ypan, but assumes your gfx board can wrap-around | |||
67 | Available on x86 only. | 68 | Available on x86 only. |
68 | 69 | ||
69 | redraw Scroll by redrawing the affected part of the screen, this | 70 | redraw Scroll by redrawing the affected part of the screen, this |
70 | is the safe (and slow) default. | 71 | is the default on non-x86. |
71 | 72 | ||
72 | (If you're using uvesafb as a module, the above three options are | 73 | (If you're using uvesafb as a module, the above three options are |
73 | used as a parameter of the scroll option, e.g. scroll=ypan.) | 74 | used as a parameter of the scroll option, e.g. scroll=ypan.) |
@@ -182,7 +183,7 @@ from the Video BIOS if you set pixclock to 0 in fb_var_screeninfo. | |||
182 | 183 | ||
183 | -- | 184 | -- |
184 | Michal Januszewski <spock@gentoo.org> | 185 | Michal Januszewski <spock@gentoo.org> |
185 | Last updated: 2007-06-16 | 186 | Last updated: 2009-03-30 |
186 | 187 | ||
187 | Documentation of the uvesafb options is loosely based on vesafb.txt. | 188 | Documentation of the uvesafb options is loosely based on vesafb.txt. |
188 | 189 | ||
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 39246fc11257..de491a3e2313 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt | |||
@@ -354,7 +354,8 @@ Who: Krzysztof Piotr Oledzki <ole@ans.pl> | |||
354 | 354 | ||
355 | --------------------------- | 355 | --------------------------- |
356 | 356 | ||
357 | What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client() | 357 | What: i2c_attach_client(), i2c_detach_client(), i2c_driver->detach_client(), |
358 | i2c_adapter->client_register(), i2c_adapter->client_unregister | ||
358 | When: 2.6.30 | 359 | When: 2.6.30 |
359 | Check: i2c_attach_client i2c_detach_client | 360 | Check: i2c_attach_client i2c_detach_client |
360 | Why: Deprecated by the new (standard) device driver binding model. Use | 361 | Why: Deprecated by the new (standard) device driver binding model. Use |
@@ -427,3 +428,12 @@ Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to | |||
427 | After a reasonable transition period, we will remove the legacy | 428 | After a reasonable transition period, we will remove the legacy |
428 | fakephp interface. | 429 | fakephp interface. |
429 | Who: Alex Chiang <achiang@hp.com> | 430 | Who: Alex Chiang <achiang@hp.com> |
431 | |||
432 | --------------------------- | ||
433 | |||
434 | What: i2c-voodoo3 driver | ||
435 | When: October 2009 | ||
436 | Why: Superseded by tdfxfb. I2C/DDC support used to live in a separate | ||
437 | driver but this caused driver conflicts. | ||
438 | Who: Jean Delvare <khali@linux-fr.org> | ||
439 | Krzysztof Helt <krzysztof.h1@wp.pl> | ||
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 52cd611277a3..8dd6db76171d 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX | |||
@@ -68,6 +68,8 @@ ncpfs.txt | |||
68 | - info on Novell Netware(tm) filesystem using NCP protocol. | 68 | - info on Novell Netware(tm) filesystem using NCP protocol. |
69 | nfsroot.txt | 69 | nfsroot.txt |
70 | - short guide on setting up a diskless box with NFS root filesystem. | 70 | - short guide on setting up a diskless box with NFS root filesystem. |
71 | nilfs2.txt | ||
72 | - info and mount options for the NILFS2 filesystem. | ||
71 | ntfs.txt | 73 | ntfs.txt |
72 | - info and mount options for the NTFS filesystem (Windows NT). | 74 | - info and mount options for the NTFS filesystem (Windows NT). |
73 | ocfs2.txt | 75 | ocfs2.txt |
diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/knfsd-stats.txt new file mode 100644 index 000000000000..64ced5149d37 --- /dev/null +++ b/Documentation/filesystems/knfsd-stats.txt | |||
@@ -0,0 +1,159 @@ | |||
1 | |||
2 | Kernel NFS Server Statistics | ||
3 | ============================ | ||
4 | |||
5 | This document describes the format and semantics of the statistics | ||
6 | which the kernel NFS server makes available to userspace. These | ||
7 | statistics are available in several text form pseudo files, each of | ||
8 | which is described separately below. | ||
9 | |||
10 | In most cases you don't need to know these formats, as the nfsstat(8) | ||
11 | program from the nfs-utils distribution provides a helpful command-line | ||
12 | interface for extracting and printing them. | ||
13 | |||
14 | All the files described here are formatted as a sequence of text lines, | ||
15 | separated by newline '\n' characters. Lines beginning with a hash | ||
16 | '#' character are comments intended for humans and should be ignored | ||
17 | by parsing routines. All other lines contain a sequence of fields | ||
18 | separated by whitespace. | ||
19 | |||
20 | /proc/fs/nfsd/pool_stats | ||
21 | ------------------------ | ||
22 | |||
23 | This file is available in kernels from 2.6.30 onwards, if the | ||
24 | /proc/fs/nfsd filesystem is mounted (it almost always should be). | ||
25 | |||
26 | The first line is a comment which describes the fields present in | ||
27 | all the other lines. The other lines present the following data as | ||
28 | a sequence of unsigned decimal numeric fields. One line is shown | ||
29 | for each NFS thread pool. | ||
30 | |||
31 | All counters are 64 bits wide and wrap naturally. There is no way | ||
32 | to zero these counters; instead, applications should do their own | ||
33 | rate conversion. | ||
34 | |||
35 | pool | ||
36 | The id number of the NFS thread pool to which this line applies. | ||
37 | This number does not change. | ||
38 | |||
39 | Thread pool ids are a contiguous set of small integers starting | ||
40 | at zero. The maximum value depends on the thread pool mode, but | ||
41 | currently cannot be larger than the number of CPUs in the system. | ||
42 | Note that in the default case there will be a single thread pool | ||
43 | which contains all the nfsd threads and all the CPUs in the system, | ||
44 | and thus this file will have a single line with a pool id of "0". | ||
45 | |||
46 | packets-arrived | ||
47 | Counts how many NFS packets have arrived. More precisely, this | ||
48 | is the number of times that the network stack has notified the | ||
49 | sunrpc server layer that new data may be available on a transport | ||
50 | (e.g. a TCP or UDP socket or an NFS/RDMA endpoint). | ||
51 | |||
52 | Depending on the NFS workload patterns and various network stack | ||
53 | effects (such as Large Receive Offload) which can combine packets | ||
54 | on the wire, this may be either more or less than the number | ||
55 | of NFS calls received (which statistic is available elsewhere). | ||
56 | However this is a more accurate and less workload-dependent measure | ||
57 | of how much CPU load is being placed on the sunrpc server layer | ||
58 | due to NFS network traffic. | ||
59 | |||
60 | sockets-enqueued | ||
61 | Counts how many times an NFS transport is enqueued to wait for | ||
62 | an nfsd thread to service it, i.e. no nfsd thread was considered | ||
63 | available. | ||
64 | |||
65 | The circumstance this statistic tracks indicates that there was NFS | ||
66 | network-facing work to be done but it couldn't be done immediately, | ||
67 | thus introducing a small delay in servicing NFS calls. The ideal | ||
68 | rate of change for this counter is zero; significantly non-zero | ||
69 | values may indicate a performance limitation. | ||
70 | |||
71 | This can happen either because there are too few nfsd threads in the | ||
72 | thread pool for the NFS workload (the workload is thread-limited), | ||
73 | or because the NFS workload needs more CPU time than is available in | ||
74 | the thread pool (the workload is CPU-limited). In the former case, | ||
75 | configuring more nfsd threads will probably improve the performance | ||
76 | of the NFS workload. In the latter case, the sunrpc server layer is | ||
77 | already choosing not to wake idle nfsd threads because there are too | ||
78 | many nfsd threads which want to run but cannot, so configuring more | ||
79 | nfsd threads will make no difference whatsoever. The overloads-avoided | ||
80 | statistic (see below) can be used to distinguish these cases. | ||
81 | |||
82 | threads-woken | ||
83 | Counts how many times an idle nfsd thread is woken to try to | ||
84 | receive some data from an NFS transport. | ||
85 | |||
86 | This statistic tracks the circumstance where incoming | ||
87 | network-facing NFS work is being handled quickly, which is a good | ||
88 | thing. The ideal rate of change for this counter will be close | ||
89 | to but less than the rate of change of the packets-arrived counter. | ||
90 | |||
91 | overloads-avoided | ||
92 | Counts how many times the sunrpc server layer chose not to wake an | ||
93 | nfsd thread, despite the presence of idle nfsd threads, because | ||
94 | too many nfsd threads had been recently woken but could not get | ||
95 | enough CPU time to actually run. | ||
96 | |||
97 | This statistic counts a circumstance where the sunrpc layer | ||
98 | heuristically avoids overloading the CPU scheduler with too many | ||
99 | runnable nfsd threads. The ideal rate of change for this counter | ||
100 | is zero. Significant non-zero values indicate that the workload | ||
101 | is CPU limited. Usually this is associated with heavy CPU usage | ||
102 | on all the CPUs in the nfsd thread pool. | ||
103 | |||
104 | If a sustained large overloads-avoided rate is detected on a pool, | ||
105 | the top(1) utility should be used to check for the following | ||
106 | pattern of CPU usage on all the CPUs associated with the given | ||
107 | nfsd thread pool. | ||
108 | |||
109 | - %us ~= 0 (as you're *NOT* running applications on your NFS server) | ||
110 | |||
111 | - %wa ~= 0 | ||
112 | |||
113 | - %id ~= 0 | ||
114 | |||
115 | - %sy + %hi + %si ~= 100 | ||
116 | |||
117 | If this pattern is seen, configuring more nfsd threads will *not* | ||
118 | improve the performance of the workload. If this pattern is not | ||
119 | seen, then something more subtle is wrong. | ||
120 | |||
121 | threads-timedout | ||
122 | Counts how many times an nfsd thread triggered an idle timeout, | ||
123 | i.e. was not woken to handle any incoming network packets for | ||
124 | some time. | ||
125 | |||
126 | This statistic counts a circumstance where there are more nfsd | ||
127 | threads configured than can be used by the NFS workload. This is | ||
128 | a clue that the number of nfsd threads can be reduced without | ||
129 | affecting performance. Unfortunately, it's only a clue and not | ||
130 | a strong indication, for a couple of reasons: | ||
131 | |||
132 | - Currently the rate at which the counter is incremented is quite | ||
133 | slow; the idle timeout is 60 minutes. Unless the NFS workload | ||
134 | remains constant for hours at a time, this counter is unlikely | ||
135 | to be providing information that is still useful. | ||
136 | |||
137 | - It is usually a wise policy to provide some slack, | ||
138 | i.e. configure a few more nfsds than are currently needed, | ||
139 | to allow for future spikes in load. | ||
140 | |||
141 | |||
142 | Note that incoming packets on NFS transports will be dealt with in | ||
143 | one of three ways. An nfsd thread can be woken (threads-woken counts | ||
144 | this case), or the transport can be enqueued for later attention | ||
145 | (sockets-enqueued counts this case), or the packet can be temporarily | ||
146 | deferred because the transport is currently being used by an nfsd | ||
147 | thread. This last case is not very interesting and is not explicitly | ||
148 | counted, but can be inferred from the other counters thus: | ||
149 | |||
150 | packets-deferred = packets-arrived - ( sockets-enqueued + threads-woken ) | ||
151 | |||
152 | |||
153 | More | ||
154 | ---- | ||
155 | Descriptions of the other statistics files should go here. | ||
156 | |||
157 | |||
158 | Greg Banks <gnb@sgi.com> | ||
159 | 26 Mar 2009 | ||
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt new file mode 100644 index 000000000000..05d81cbcb2e1 --- /dev/null +++ b/Documentation/filesystems/nfs41-server.txt | |||
@@ -0,0 +1,161 @@ | |||
1 | NFSv4.1 Server Implementation | ||
2 | |||
3 | Server support for minorversion 1 can be controlled using the | ||
4 | /proc/fs/nfsd/versions control file. The string output returned | ||
5 | by reading this file will contain either "+4.1" or "-4.1" | ||
6 | correspondingly. | ||
7 | |||
8 | Currently, server support for minorversion 1 is disabled by default. | ||
9 | It can be enabled at run time by writing the string "+4.1" to | ||
10 | the /proc/fs/nfsd/versions control file. Note that to write this | ||
11 | control file, the nfsd service must be taken down. Use your user-mode | ||
12 | nfs-utils to set this up; see rpc.nfsd(8) | ||
13 | |||
14 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based | ||
15 | on the latest NFSv4.1 Internet Draft: | ||
16 | http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 | ||
17 | |||
18 | From the many new features in NFSv4.1 the current implementation | ||
19 | focuses on the mandatory-to-implement NFSv4.1 Sessions, providing | ||
20 | "exactly once" semantics and better control and throttling of the | ||
21 | resources allocated for each client. | ||
22 | |||
23 | Other NFSv4.1 features, Parallel NFS operations in particular, | ||
24 | are still under development out of tree. | ||
25 | See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design | ||
26 | for more information. | ||
27 | |||
28 | The table below, taken from the NFSv4.1 document, lists | ||
29 | the operations that are mandatory to implement (REQ), optional | ||
30 | (OPT), and NFSv4.0 operations that must not be implemented (MNI) | ||
31 | in minor version 1. The first column indicates the operations that | ||
32 | are not supported yet by the linux server implementation. | ||
33 | |||
34 | The OPTIONAL features identified and their abbreviations are as follows: | ||
35 | pNFS Parallel NFS | ||
36 | FDELG File Delegations | ||
37 | DDELG Directory Delegations | ||
38 | |||
39 | The following abbreviations indicate the linux server implementation status. | ||
40 | I Implemented NFSv4.1 operations. | ||
41 | NS Not Supported. | ||
42 | NS* unimplemented optional feature. | ||
43 | P pNFS features implemented out of tree. | ||
44 | PNS pNFS features that are not supported yet (out of tree). | ||
45 | |||
46 | Operations | ||
47 | |||
48 | +----------------------+------------+--------------+----------------+ | ||
49 | | Operation | REQ, REC, | Feature | Definition | | ||
50 | | | OPT, or | (REQ, REC, | | | ||
51 | | | MNI | or OPT) | | | ||
52 | +----------------------+------------+--------------+----------------+ | ||
53 | | ACCESS | REQ | | Section 18.1 | | ||
54 | NS | BACKCHANNEL_CTL | REQ | | Section 18.33 | | ||
55 | NS | BIND_CONN_TO_SESSION | REQ | | Section 18.34 | | ||
56 | | CLOSE | REQ | | Section 18.2 | | ||
57 | | COMMIT | REQ | | Section 18.3 | | ||
58 | | CREATE | REQ | | Section 18.4 | | ||
59 | I | CREATE_SESSION | REQ | | Section 18.36 | | ||
60 | NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 | | ||
61 | | DELEGRETURN | OPT | FDELG, | Section 18.6 | | ||
62 | | | | DDELG, pNFS | | | ||
63 | | | | (REQ) | | | ||
64 | NS | DESTROY_CLIENTID | REQ | | Section 18.50 | | ||
65 | I | DESTROY_SESSION | REQ | | Section 18.37 | | ||
66 | I | EXCHANGE_ID | REQ | | Section 18.35 | | ||
67 | NS | FREE_STATEID | REQ | | Section 18.38 | | ||
68 | | GETATTR | REQ | | Section 18.7 | | ||
69 | P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | | ||
70 | P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | | ||
71 | | GETFH | REQ | | Section 18.8 | | ||
72 | NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 | | ||
73 | P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | | ||
74 | P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | | ||
75 | P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | | ||
76 | | LINK | OPT | | Section 18.9 | | ||
77 | | LOCK | REQ | | Section 18.10 | | ||
78 | | LOCKT | REQ | | Section 18.11 | | ||
79 | | LOCKU | REQ | | Section 18.12 | | ||
80 | | LOOKUP | REQ | | Section 18.13 | | ||
81 | | LOOKUPP | REQ | | Section 18.14 | | ||
82 | | NVERIFY | REQ | | Section 18.15 | | ||
83 | | OPEN | REQ | | Section 18.16 | | ||
84 | NS*| OPENATTR | OPT | | Section 18.17 | | ||
85 | | OPEN_CONFIRM | MNI | | N/A | | ||
86 | | OPEN_DOWNGRADE | REQ | | Section 18.18 | | ||
87 | | PUTFH | REQ | | Section 18.19 | | ||
88 | | PUTPUBFH | REQ | | Section 18.20 | | ||
89 | | PUTROOTFH | REQ | | Section 18.21 | | ||
90 | | READ | REQ | | Section 18.22 | | ||
91 | | READDIR | REQ | | Section 18.23 | | ||
92 | | READLINK | OPT | | Section 18.24 | | ||
93 | NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | | ||
94 | | RELEASE_LOCKOWNER | MNI | | N/A | | ||
95 | | REMOVE | REQ | | Section 18.25 | | ||
96 | | RENAME | REQ | | Section 18.26 | | ||
97 | | RENEW | MNI | | N/A | | ||
98 | | RESTOREFH | REQ | | Section 18.27 | | ||
99 | | SAVEFH | REQ | | Section 18.28 | | ||
100 | | SECINFO | REQ | | Section 18.29 | | ||
101 | NS | SECINFO_NO_NAME | REC | pNFS files | Section 18.45, | | ||
102 | | | | layout (REQ) | Section 13.12 | | ||
103 | I | SEQUENCE | REQ | | Section 18.46 | | ||
104 | | SETATTR | REQ | | Section 18.30 | | ||
105 | | SETCLIENTID | MNI | | N/A | | ||
106 | | SETCLIENTID_CONFIRM | MNI | | N/A | | ||
107 | NS | SET_SSV | REQ | | Section 18.47 | | ||
108 | NS | TEST_STATEID | REQ | | Section 18.48 | | ||
109 | | VERIFY | REQ | | Section 18.31 | | ||
110 | NS*| WANT_DELEGATION | OPT | FDELG (OPT) | Section 18.49 | | ||
111 | | WRITE | REQ | | Section 18.32 | | ||
112 | |||
113 | Callback Operations | ||
114 | |||
115 | +-------------------------+-----------+-------------+---------------+ | ||
116 | | Operation | REQ, REC, | Feature | Definition | | ||
117 | | | OPT, or | (REQ, REC, | | | ||
118 | | | MNI | or OPT) | | | ||
119 | +-------------------------+-----------+-------------+---------------+ | ||
120 | | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 | | ||
121 | P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | | ||
122 | NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 | | ||
123 | P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | | ||
124 | NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 | | ||
125 | NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 | | ||
126 | | CB_RECALL | OPT | FDELG, | Section 20.2 | | ||
127 | | | | DDELG, pNFS | | | ||
128 | | | | (REQ) | | | ||
129 | NS*| CB_RECALL_ANY | OPT | FDELG, | Section 20.6 | | ||
130 | | | | DDELG, pNFS | | | ||
131 | | | | (REQ) | | | ||
132 | NS | CB_RECALL_SLOT | REQ | | Section 20.8 | | ||
133 | NS*| CB_RECALLABLE_OBJ_AVAIL | OPT | DDELG, pNFS | Section 20.7 | | ||
134 | | | | (REQ) | | | ||
135 | I | CB_SEQUENCE | OPT | FDELG, | Section 20.9 | | ||
136 | | | | DDELG, pNFS | | | ||
137 | | | | (REQ) | | | ||
138 | NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 | | ||
139 | | | | DDELG, pNFS | | | ||
140 | | | | (REQ) | | | ||
141 | +-------------------------+-----------+-------------+---------------+ | ||
142 | |||
143 | Implementation notes: | ||
144 | |||
145 | EXCHANGE_ID: | ||
146 | * only SP4_NONE state protection supported | ||
147 | * implementation ids are ignored | ||
148 | |||
149 | CREATE_SESSION: | ||
150 | * backchannel attributes are ignored | ||
151 | * backchannel security parameters are ignored | ||
152 | |||
153 | SEQUENCE: | ||
154 | * no support for dynamic slot table renegotiation (optional) | ||
155 | |||
156 | nfsv4.1 COMPOUND rules: | ||
157 | The following cases aren't supported yet: | ||
158 | * Enforcing of NFS4ERR_NOT_ONLY_OP for: BIND_CONN_TO_SESSION, CREATE_SESSION, | ||
159 | DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID. | ||
160 | * DESTROY_SESSION MUST be the final operation in the COMPOUND request. | ||
161 | |||
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt new file mode 100644 index 000000000000..55c4300abfcb --- /dev/null +++ b/Documentation/filesystems/nilfs2.txt | |||
@@ -0,0 +1,200 @@ | |||
1 | NILFS2 | ||
2 | ------ | ||
3 | |||
4 | NILFS2 is a log-structured file system (LFS) supporting continuous | ||
5 | snapshotting. In addition to versioning capability of the entire file | ||
6 | system, users can even restore files mistakenly overwritten or | ||
7 | destroyed just a few seconds ago. Since NILFS2 can keep consistency | ||
8 | like conventional LFS, it achieves quick recovery after system | ||
9 | crashes. | ||
10 | |||
11 | NILFS2 creates a number of checkpoints every few seconds or per | ||
12 | synchronous write basis (unless there is no change). Users can select | ||
13 | significant versions among continuously created checkpoints, and can | ||
14 | change them into snapshots which will be preserved until they are | ||
15 | changed back to checkpoints. | ||
16 | |||
17 | There is no limit on the number of snapshots until the volume gets | ||
18 | full. Each snapshot is mountable as a read-only file system | ||
19 | concurrently with its writable mount, and this feature is convenient | ||
20 | for online backup. | ||
21 | |||
22 | The userland tools are included in nilfs-utils package, which is | ||
23 | available from the following download page. At least "mkfs.nilfs2", | ||
24 | "mount.nilfs2", "umount.nilfs2", and "nilfs_cleanerd" (so called | ||
25 | cleaner or garbage collector) are required. Details on the tools are | ||
26 | described in the man pages included in the package. | ||
27 | |||
28 | Project web page: http://www.nilfs.org/en/ | ||
29 | Download page: http://www.nilfs.org/en/download.html | ||
30 | Git tree web page: http://www.nilfs.org/git/ | ||
31 | NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users | ||
32 | |||
33 | Caveats | ||
34 | ======= | ||
35 | |||
36 | Features which NILFS2 does not support yet: | ||
37 | |||
38 | - atime | ||
39 | - extended attributes | ||
40 | - POSIX ACLs | ||
41 | - quotas | ||
42 | - writable snapshots | ||
43 | - remote backup (CDP) | ||
44 | - data integrity | ||
45 | - defragmentation | ||
46 | |||
47 | Mount options | ||
48 | ============= | ||
49 | |||
50 | NILFS2 supports the following mount options: | ||
51 | (*) == default | ||
52 | |||
53 | barrier=on(*) This enables/disables barriers. barrier=off disables | ||
54 | it, barrier=on enables it. | ||
55 | errors=continue(*) Keep going on a filesystem error. | ||
56 | errors=remount-ro Remount the filesystem read-only on an error. | ||
57 | errors=panic Panic and halt the machine if an error occurs. | ||
58 | cp=n Specify the checkpoint-number of the snapshot to be | ||
59 | mounted. Checkpoints and snapshots are listed by lscp | ||
60 | user command. Only the checkpoints marked as snapshot | ||
61 | are mountable with this option. Snapshot is read-only, | ||
62 | so a read-only mount option must be specified together. | ||
63 | order=relaxed(*) Apply relaxed order semantics that allows modified data | ||
64 | blocks to be written to disk without making a | ||
65 | checkpoint if no metadata update is going on. This mode | ||
66 | is equivalent to the ordered data mode of the ext3 | ||
67 | filesystem except that the updates on data blocks still | ||
68 | conserve atomicity. This will improve synchronous | ||
69 | write performance for overwriting. | ||
70 | order=strict Apply strict in-order semantics that preserves sequence | ||
71 | of all file operations including overwriting of data | ||
72 | blocks. That means, it is guaranteed that no | ||
73 | overtaking of events occurs in the recovered file | ||
74 | system after a crash. | ||
75 | |||
76 | NILFS2 usage | ||
77 | ============ | ||
78 | |||
79 | To use nilfs2 as a local file system, simply: | ||
80 | |||
81 | # mkfs -t nilfs2 /dev/block_device | ||
82 | # mount -t nilfs2 /dev/block_device /dir | ||
83 | |||
84 | This will also invoke the cleaner through the mount helper program | ||
85 | (mount.nilfs2). | ||
86 | |||
87 | Checkpoints and snapshots are managed by the following commands. | ||
88 | Their manpages are included in the nilfs-utils package above. | ||
89 | |||
90 | lscp list checkpoints or snapshots. | ||
91 | mkcp make a checkpoint or a snapshot. | ||
92 | chcp change an existing checkpoint to a snapshot or vice versa. | ||
93 | rmcp invalidate specified checkpoint(s). | ||
94 | |||
95 | To mount a snapshot, | ||
96 | |||
97 | # mount -t nilfs2 -r -o cp=<cno> /dev/block_device /snap_dir | ||
98 | |||
99 | where <cno> is the checkpoint number of the snapshot. | ||
100 | |||
101 | To unmount the NILFS2 mount point or snapshot, simply: | ||
102 | |||
103 | # umount /dir | ||
104 | |||
105 | Then, the cleaner daemon is automatically shut down by the umount | ||
106 | helper program (umount.nilfs2). | ||
107 | |||
108 | Disk format | ||
109 | =========== | ||
110 | |||
111 | A nilfs2 volume is equally divided into a number of segments except | ||
112 | for the super block (SB) and segment #0. A segment is the container | ||
113 | of logs. Each log is composed of summary information blocks, payload | ||
114 | blocks, and an optional super root block (SR): | ||
115 | |||
116 | ______________________________________________________ | ||
117 | | |SB| | Segment | Segment | Segment | ... | Segment | | | ||
118 | |_|__|_|____0____|____1____|____2____|_____|____N____|_| | ||
119 | 0 +1K +4K +8M +16M +24M +(8MB x N) | ||
120 | . . (Typical offsets for 4KB-block) | ||
121 | . . | ||
122 | .______________________. | ||
123 | | log | log |... | log | | ||
124 | |__1__|__2__|____|__m__| | ||
125 | . . | ||
126 | . . | ||
127 | . . | ||
128 | .______________________________. | ||
129 | | Summary | Payload blocks |SR| | ||
130 | |_blocks__|_________________|__| | ||
131 | |||
132 | The payload blocks are organized per file, and each file consists of | ||
133 | data blocks and B-tree node blocks: | ||
134 | |||
135 | |<--- File-A --->|<--- File-B --->| | ||
136 | _______________________________________________________________ | ||
137 | | Data blocks | B-tree blocks | Data blocks | B-tree blocks | ... | ||
138 | _|_____________|_______________|_____________|_______________|_ | ||
139 | |||
140 | |||
141 | Since only the modified blocks are written in the log, it may have | ||
142 | files without data blocks or B-tree node blocks. | ||
143 | |||
144 | The organization of the blocks is recorded in the summary information | ||
145 | blocks, which contains a header structure (nilfs_segment_summary), per | ||
146 | file structures (nilfs_finfo), and per block structures (nilfs_binfo): | ||
147 | |||
148 | _________________________________________________________________________ | ||
149 | | Summary | finfo | binfo | ... | binfo | finfo | binfo | ... | binfo |... | ||
150 | |_blocks__|___A___|_(A,1)_|_____|(A,Na)_|___B___|_(B,1)_|_____|(B,Nb)_|___ | ||
151 | |||
152 | |||
153 | The logs include regular files, directory files, symbolic link files | ||
154 | and several meta data files. The meta data files are the files used | ||
155 | to maintain file system meta data. The current version of NILFS2 uses | ||
156 | the following meta data files: | ||
157 | |||
158 | 1) Inode file (ifile) -- Stores on-disk inodes | ||
159 | 2) Checkpoint file (cpfile) -- Stores checkpoints | ||
160 | 3) Segment usage file (sufile) -- Stores allocation state of segments | ||
161 | 4) Data address translation file -- Maps virtual block numbers to usual | ||
162 | (DAT) block numbers. This file serves to | ||
163 | make on-disk blocks relocatable. | ||
164 | |||
165 | The following figure shows a typical organization of the logs: | ||
166 | |||
167 | _________________________________________________________________________ | ||
168 | | Summary | regular file | file | ... | ifile | cpfile | sufile | DAT |SR| | ||
169 | |_blocks__|_or_directory_|_______|_____|_______|________|________|_____|__| | ||
170 | |||
171 | |||
172 | To stride over segment boundaries, this sequence of files may be split | ||
173 | into multiple logs. The sequence of logs that should be treated as | ||
174 | logically one log, is delimited with flags marked in the segment | ||
175 | summary. The recovery code of nilfs2 looks at this boundary information | ||
176 | to ensure atomicity of updates. | ||
177 | |||
178 | The super root block is inserted for every checkpoint. It includes | ||
179 | three special inodes, inodes for the DAT, cpfile, and sufile. Inodes | ||
180 | of regular files, directories, symlinks and other special files, are | ||
181 | included in the ifile. The inode of ifile itself is included in the | ||
182 | corresponding checkpoint entry in the cpfile. Thus, the hierarchy | ||
183 | among NILFS2 files can be depicted as follows: | ||
184 | |||
185 | Super block (SB) | ||
186 | | | ||
187 | v | ||
188 | Super root block (the latest cno=xx) | ||
189 | |-- DAT | ||
190 | |-- sufile | ||
191 | `-- cpfile | ||
192 | |-- ifile (cno=c1) | ||
193 | |-- ifile (cno=c2) ---- file (ino=i1) | ||
194 | : : |-- file (ino=i2) | ||
195 | `-- ifile (cno=xx) |-- file (ino=i3) | ||
196 | : : | ||
197 | `-- file (ino=yy) | ||
198 | ( regular file, directory, or symlink ) | ||
199 | |||
200 | For detail on the format of each file, please see include/linux/nilfs2_fs.h. | ||
diff --git a/Documentation/hwmon/g760a b/Documentation/hwmon/g760a new file mode 100644 index 000000000000..e032eeb75629 --- /dev/null +++ b/Documentation/hwmon/g760a | |||
@@ -0,0 +1,36 @@ | |||
1 | Kernel driver g760a | ||
2 | =================== | ||
3 | |||
4 | Supported chips: | ||
5 | * Global Mixed-mode Technology Inc. G760A | ||
6 | Prefix: 'g760a' | ||
7 | Datasheet: Publicly available at the GMT website | ||
8 | http://www.gmt.com.tw/datasheet/g760a.pdf | ||
9 | |||
10 | Author: Herbert Valerio Riedel <hvr@gnu.org> | ||
11 | |||
12 | Description | ||
13 | ----------- | ||
14 | |||
15 | The GMT G760A Fan Speed PWM Controller is connected directly to a fan | ||
16 | and performs closed-loop control of the fan speed. | ||
17 | |||
18 | The fan speed is programmed by setting the period via 'pwm1' of two | ||
19 | consecutive speed pulses. The period is defined in terms of clock | ||
20 | cycle counts of an assumed 32kHz clock source. | ||
21 | |||
22 | Setting a period of 0 stops the fan; setting the period to 255 sets | ||
23 | fan to maximum speed. | ||
24 | |||
25 | The measured fan rotation speed returned via 'fan1_input' is derived | ||
26 | from the measured speed pulse period by assuming again a 32kHz clock | ||
27 | source and a 2 pulse-per-revolution fan. | ||
28 | |||
29 | The 'alarms' file provides access to the two alarm bits provided by | ||
30 | the G760A chip's status register: Bit 0 is set when the actual fan | ||
31 | speed differs more than 20% with respect to the programmed fan speed; | ||
32 | bit 1 is set when fan speed is below 1920 RPM. | ||
33 | |||
34 | The g760a driver will not update its values more frequently than every | ||
35 | other second; reading them more often will do no harm, but will return | ||
36 | 'old' values. | ||
diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt index 864ff3283780..6d40f00b358c 100644 --- a/Documentation/infiniband/ipoib.txt +++ b/Documentation/infiniband/ipoib.txt | |||
@@ -24,6 +24,49 @@ Partitions and P_Keys | |||
24 | The P_Key for any interface is given by the "pkey" file, and the | 24 | The P_Key for any interface is given by the "pkey" file, and the |
25 | main interface for a subinterface is in "parent." | 25 | main interface for a subinterface is in "parent." |
26 | 26 | ||
27 | Datagram vs Connected modes | ||
28 | |||
29 | The IPoIB driver supports two modes of operation: datagram and | ||
30 | connected. The mode is set and read through an interface's | ||
31 | /sys/class/net/<intf name>/mode file. | ||
32 | |||
33 | In datagram mode, the IB UD (Unreliable Datagram) transport is used | ||
34 | and so the interface MTU is equal to the IB L2 MTU minus the | ||
35 | IPoIB encapsulation header (4 bytes). For example, in a typical IB | ||
36 | fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes. | ||
37 | |||
38 | In connected mode, the IB RC (Reliable Connected) transport is used. | ||
39 | Connected mode takes advantage of the connected nature of the | ||
40 | IB transport and allows an MTU up to the maximal IP packet size of | ||
41 | 64K, which reduces the number of IP packets needed for handling | ||
42 | large UDP datagrams, TCP segments, etc and increases the performance | ||
43 | for large messages. | ||
44 | |||
45 | In connected mode, the interface's UD QP is still used for multicast | ||
46 | and communication with peers that don't support connected mode. In | ||
47 | this case, RX emulation of ICMP PMTU packets is used to cause the | ||
48 | networking stack to use the smaller UD MTU for these neighbours. | ||
49 | |||
50 | Stateless offloads | ||
51 | |||
52 | If the IB HW supports IPoIB stateless offloads, IPoIB advertises | ||
53 | TCP/IP checksum and/or Large Send (LSO) offloading capability to the | ||
54 | network stack. | ||
55 | |||
56 | Large Receive (LRO) offloading is also implemented and may be turned | ||
57 | on/off using ethtool calls. Currently LRO is supported only for | ||
58 | checksum offload capable devices. | ||
59 | |||
60 | Stateless offloads are supported only in datagram mode. | ||
61 | |||
62 | Interrupt moderation | ||
63 | |||
64 | If the underlying IB device supports CQ event moderation, one can | ||
65 | use ethtool to set interrupt mitigation parameters and thus reduce | ||
66 | the overhead incurred by handling interrupts. The main code path of | ||
67 | IPoIB doesn't use events for TX completion signaling so only RX | ||
68 | moderation is supported. | ||
69 | |||
27 | Debugging Information | 70 | Debugging Information |
28 | 71 | ||
29 | By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set | 72 | By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set |
@@ -55,3 +98,5 @@ References | |||
55 | http://ietf.org/rfc/rfc4391.txt | 98 | http://ietf.org/rfc/rfc4391.txt |
56 | IP over InfiniBand (IPoIB) Architecture (RFC 4392) | 99 | IP over InfiniBand (IPoIB) Architecture (RFC 4392) |
57 | http://ietf.org/rfc/rfc4392.txt | 100 | http://ietf.org/rfc/rfc4392.txt |
101 | IP over InfiniBand: Connected Mode (RFC 4755) | ||
102 | http://ietf.org/rfc/rfc4755.txt | ||
diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt new file mode 100644 index 000000000000..435102a26d96 --- /dev/null +++ b/Documentation/input/rotary-encoder.txt | |||
@@ -0,0 +1,101 @@ | |||
1 | rotary-encoder - a generic driver for GPIO connected devices | ||
2 | Daniel Mack <daniel@caiaq.de>, Feb 2009 | ||
3 | |||
4 | 0. Function | ||
5 | ----------- | ||
6 | |||
7 | Rotary encoders are devices which are connected to the CPU or other | ||
8 | peripherals with two wires. The outputs are phase-shifted by 90 degrees | ||
9 | and by triggering on falling and rising edges, the turn direction can | ||
10 | be determined. | ||
11 | |||
12 | The phase diagram of these two outputs looks like this: | ||
13 | |||
14 | _____ _____ _____ | ||
15 | | | | | | | | ||
16 | Channel A ____| |_____| |_____| |____ | ||
17 | |||
18 | : : : : : : : : : : : : | ||
19 | __ _____ _____ _____ | ||
20 | | | | | | | | | ||
21 | Channel B |_____| |_____| |_____| |__ | ||
22 | |||
23 | : : : : : : : : : : : : | ||
24 | Event a b c d a b c d a b c d | ||
25 | |||
26 | |<-------->| | ||
27 | one step | ||
28 | |||
29 | |||
30 | For more information, please see | ||
31 | http://en.wikipedia.org/wiki/Rotary_encoder | ||
32 | |||
33 | |||
34 | 1. Events / state machine | ||
35 | ------------------------- | ||
36 | |||
37 | a) Rising edge on channel A, channel B in low state | ||
38 | This state is used to recognize a clockwise turn | ||
39 | |||
40 | b) Rising edge on channel B, channel A in high state | ||
41 | When entering this state, the encoder is put into 'armed' state, | ||
42 | meaning that it has seen half the way of a one-step transition. | ||
43 | |||
44 | c) Falling edge on channel A, channel B in high state | ||
45 | This state is used to recognize a counter-clockwise turn | ||
46 | |||
47 | d) Falling edge on channel B, channel A in low state | ||
48 | Parking position. If the encoder enters this state, a full transition | ||
49 | should have happened, unless it flipped back on half the way. The | ||
50 | 'armed' state tells us about that. | ||
51 | |||
52 | 2. Platform requirements | ||
53 | ------------------------ | ||
54 | |||
55 | As there is no hardware dependent call in this driver, the platform it is | ||
56 | used with must support gpiolib. Another requirement is that IRQs must be | ||
57 | able to fire on both edges. | ||
58 | |||
59 | |||
60 | 3. Board integration | ||
61 | -------------------- | ||
62 | |||
63 | To use this driver in your system, register a platform_device with the | ||
64 | name 'rotary-encoder' and associate the IRQs and some specific platform | ||
65 | data with it. | ||
66 | |||
67 | struct rotary_encoder_platform_data is declared in | ||
68 | include/linux/rotary_encoder.h and needs to be filled with the number of | ||
69 | steps the encoder has and can carry information about externally inverted | ||
70 | signals (because of used inverting buffer or other reasons). | ||
71 | |||
72 | Because GPIO to IRQ mapping is platform specific, this information must | ||
73 | be given separately to the driver. See the example below. | ||
74 | |||
75 | ---------<snip>--------- | ||
76 | |||
77 | /* board support file example */ | ||
78 | |||
79 | #include <linux/input.h> | ||
80 | #include <linux/rotary_encoder.h> | ||
81 | |||
82 | #define GPIO_ROTARY_A 1 | ||
83 | #define GPIO_ROTARY_B 2 | ||
84 | |||
85 | static struct rotary_encoder_platform_data my_rotary_encoder_info = { | ||
86 | .steps = 24, | ||
87 | .axis = ABS_X, | ||
88 | .gpio_a = GPIO_ROTARY_A, | ||
89 | .gpio_b = GPIO_ROTARY_B, | ||
90 | .inverted_a = 0, | ||
91 | .inverted_b = 0, | ||
92 | }; | ||
93 | |||
94 | static struct platform_device rotary_encoder_device = { | ||
95 | .name = "rotary-encoder", | ||
96 | .id = 0, | ||
97 | .dev = { | ||
98 | .platform_data = &my_rotary_encoder_info, | ||
99 | } | ||
100 | }; | ||
101 | |||
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset index 55b2852904a4..02c0e9341dd8 100644 --- a/Documentation/isdn/README.gigaset +++ b/Documentation/isdn/README.gigaset | |||
@@ -61,24 +61,28 @@ GigaSet 307x Device Driver | |||
61 | --------------------- | 61 | --------------------- |
62 | 2.1. Modules | 62 | 2.1. Modules |
63 | ------- | 63 | ------- |
64 | To get the device working, you have to load the proper kernel module. You | 64 | For the devices to work, the proper kernel modules have to be loaded. |
65 | can do this using | 65 | This normally happens automatically when the system detects the USB |
66 | modprobe modulename | 66 | device (base, M105) or when the line discipline is attached (M101). It |
67 | where modulename is ser_gigaset (M101), usb_gigaset (M105), or | 67 | can also be triggered manually using the modprobe(8) command, for example |
68 | bas_gigaset (direct USB connection to the base). | 68 | for troubleshooting or to pass module parameters. |
69 | 69 | ||
70 | The module ser_gigaset provides a serial line discipline N_GIGASET_M101 | 70 | The module ser_gigaset provides a serial line discipline N_GIGASET_M101 |
71 | which drives the device through the regular serial line driver. To use it, | 71 | which drives the device through the regular serial line driver. It must |
72 | run the Gigaset M101 daemon "gigasetm101d" (also available from | 72 | be attached to the serial line to which the M101 is connected with the |
73 | http://sourceforge.net/projects/gigaset307x/) with the device file of the | 73 | ldattach(8) command (requires util-linux-ng release 2.14 or later), for |
74 | RS232 port to the M101 as an argument, for example: | 74 | example: |
75 | gigasetm101d /dev/ttyS1 | 75 | ldattach GIGASET_M101 /dev/ttyS1 |
76 | This will open the device file, set its line discipline to N_GIGASET_M101, | 76 | This will open the device file, attach the line discipline to it, and |
77 | and then sleep in the background, keeping the device open so that the | 77 | then sleep in the background, keeping the device open so that the line |
78 | line discipline remains active. To deactivate it, kill the daemon, for | 78 | discipline remains active. To deactivate it, kill the daemon, for example |
79 | example with | 79 | with |
80 | killall gigasetm101d | 80 | killall ldattach |
81 | before disconnecting the device. | 81 | before disconnecting the device. To have this happen automatically at |
82 | system startup/shutdown on an LSB compatible system, create and activate | ||
83 | an appropriate LSB startup script /etc/init.d/gigaset. (The init name | ||
84 | 'gigaset' is officially assigned to this project by LANANA.) | ||
85 | Alternatively, just add the 'ldattach' command line to /etc/rc.local. | ||
82 | 86 | ||
83 | 2.2. Device nodes for user space programs | 87 | 2.2. Device nodes for user space programs |
84 | ------------------------------------ | 88 | ------------------------------------ |
@@ -194,10 +198,11 @@ GigaSet 307x Device Driver | |||
194 | operation (for wireless access to the base), but are needed for access | 198 | operation (for wireless access to the base), but are needed for access |
195 | to the M105's own configuration mode (registration to the base, baudrate | 199 | to the M105's own configuration mode (registration to the base, baudrate |
196 | and line format settings, device status queries) via the gigacontr | 200 | and line format settings, device status queries) via the gigacontr |
197 | utility. Their use is disabled in the driver by default for safety | 201 | utility. Their use is controlled by the kernel configuration option |
198 | reasons but can be enabled by setting the kernel configuration option | 202 | "Support for undocumented USB requests" (CONFIG_GIGASET_UNDOCREQ). If you |
199 | "Support for undocumented USB requests" (GIGASET_UNDOCREQ) to "Y" and | 203 | encounter error code -ENOTTY when trying to use some features of the |
200 | recompiling. | 204 | M105, try setting that option to "y" via 'make {x,menu}config' and |
205 | recompiling the driver. | ||
201 | 206 | ||
202 | 207 | ||
203 | 3. Troubleshooting | 208 | 3. Troubleshooting |
@@ -228,6 +233,13 @@ GigaSet 307x Device Driver | |||
228 | Solution: | 233 | Solution: |
229 | Select Unimodem mode for all DECT data adapters. (see section 2.4.) | 234 | Select Unimodem mode for all DECT data adapters. (see section 2.4.) |
230 | 235 | ||
236 | Problem: | ||
237 | You want to configure your USB DECT data adapter (M105) but gigacontr | ||
238 | reports an error: "/dev/ttyGU0: Inappropriate ioctl for device". | ||
239 | Solution: | ||
240 | Recompile the usb_gigaset driver with the kernel configuration option | ||
241 | CONFIG_GIGASET_UNDOCREQ set to 'y'. (see section 2.6.) | ||
242 | |||
231 | 3.2. Telling the driver to provide more information | 243 | 3.2. Telling the driver to provide more information |
232 | ---------------------------------------------- | 244 | ---------------------------------------------- |
233 | Building the driver with the "Gigaset debugging" kernel configuration | 245 | Building the driver with the "Gigaset debugging" kernel configuration |
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index 51104f9194a5..d4b05672f9f7 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt | |||
@@ -40,10 +40,16 @@ This document describes the Linux kernel Makefiles. | |||
40 | --- 6.7 Custom kbuild commands | 40 | --- 6.7 Custom kbuild commands |
41 | --- 6.8 Preprocessing linker scripts | 41 | --- 6.8 Preprocessing linker scripts |
42 | 42 | ||
43 | === 7 Kbuild Variables | 43 | === 7 Kbuild syntax for exported headers |
44 | === 8 Makefile language | 44 | --- 7.1 header-y |
45 | === 9 Credits | 45 | --- 7.2 objhdr-y |
46 | === 10 TODO | 46 | --- 7.3 destination-y |
47 | --- 7.4 unifdef-y (deprecated) | ||
48 | |||
49 | === 8 Kbuild Variables | ||
50 | === 9 Makefile language | ||
51 | === 10 Credits | ||
52 | === 11 TODO | ||
47 | 53 | ||
48 | === 1 Overview | 54 | === 1 Overview |
49 | 55 | ||
@@ -1143,8 +1149,69 @@ When kbuild executes, the following steps are followed (roughly): | |||
1143 | The kbuild infrastructure for *lds file are used in several | 1149 | The kbuild infrastructure for *lds file are used in several |
1144 | architecture-specific files. | 1150 | architecture-specific files. |
1145 | 1151 | ||
1152 | === 7 Kbuild syntax for exported headers | ||
1153 | |||
1154 | The kernel includes a set of headers that is exported to userspace. | ||
1155 | Many headers can be exported as-is but other headers require a | ||
1156 | minimal pre-processing before they are ready for user-space. | ||
1157 | The pre-processing does: | ||
1158 | - drop kernel specific annotations | ||
1159 | - drop include of compiler.h | ||
1160 | - drop all sections that are kernel internal (guarded by ifdef __KERNEL__) | ||
1161 | |||
1162 | Each relevant directory contains a file named "Kbuild" which specifies the | ||
1163 | headers to be exported. | ||
1164 | See the following sections for the syntax of the Kbuild file. | ||
1165 | |||
1166 | --- 7.1 header-y | ||
1167 | |||
1168 | header-y specifies header files to be exported. | ||
1169 | |||
1170 | Example: | ||
1171 | #include/linux/Kbuild | ||
1172 | header-y += usb/ | ||
1173 | header-y += aio_abi.h | ||
1174 | |||
1175 | The convention is to list one file per line and | ||
1176 | preferably in alphabetic order. | ||
1177 | |||
1178 | header-y also specifies which subdirectories to visit. | ||
1179 | A subdirectory is identified by a trailing '/' which | ||
1180 | can be seen in the example above for the usb subdirectory. | ||
1181 | |||
1182 | Subdirectories are visited before their parent directories. | ||
1183 | |||
1184 | --- 7.2 objhdr-y | ||
1185 | |||
1186 | objhdr-y specifies generated files to be exported. | ||
1187 | Generated files are special as they need to be looked | ||
1188 | up in another directory when doing 'make O=...' builds. | ||
1189 | |||
1190 | Example: | ||
1191 | #include/linux/Kbuild | ||
1192 | objhdr-y += version.h | ||
1193 | |||
1194 | --- 7.3 destination-y | ||
1195 | |||
1196 | When an architecture has a set of exported headers that need to be | ||
1197 | exported to a different directory, destination-y is used. | ||
1198 | destination-y specifies the destination directory for all exported | ||
1199 | headers in the file where it is present. | ||
1200 | |||
1201 | Example: | ||
1202 | #arch/xtensa/platforms/s6105/include/platform/Kbuild | ||
1203 | destination-y := include/linux | ||
1204 | |||
1205 | In the example above all exported headers in the Kbuild file | ||
1206 | will be located in the directory "include/linux" when exported. | ||
1207 | |||
1208 | |||
1209 | --- 7.4 unifdef-y (deprecated) | ||
1210 | |||
1211 | unifdef-y is deprecated. A direct replacement is header-y. | ||
1212 | |||
1146 | 1213 | ||
1147 | === 7 Kbuild Variables | 1214 | === 8 Kbuild Variables |
1148 | 1215 | ||
1149 | The top Makefile exports the following variables: | 1216 | The top Makefile exports the following variables: |
1150 | 1217 | ||
@@ -1206,7 +1273,7 @@ The top Makefile exports the following variables: | |||
1206 | INSTALL_MOD_STRIP will used as the option(s) to the strip command. | 1273 | INSTALL_MOD_STRIP will used as the option(s) to the strip command. |
1207 | 1274 | ||
1208 | 1275 | ||
1209 | === 8 Makefile language | 1276 | === 9 Makefile language |
1210 | 1277 | ||
1211 | The kernel Makefiles are designed to be run with GNU Make. The Makefiles | 1278 | The kernel Makefiles are designed to be run with GNU Make. The Makefiles |
1212 | use only the documented features of GNU Make, but they do use many | 1279 | use only the documented features of GNU Make, but they do use many |
@@ -1225,14 +1292,14 @@ time the left-hand side is used. | |||
1225 | There are some cases where "=" is appropriate. Usually, though, ":=" | 1292 | There are some cases where "=" is appropriate. Usually, though, ":=" |
1226 | is the right choice. | 1293 | is the right choice. |
1227 | 1294 | ||
1228 | === 9 Credits | 1295 | === 10 Credits |
1229 | 1296 | ||
1230 | Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net> | 1297 | Original version made by Michael Elizabeth Chastain, <mailto:mec@shout.net> |
1231 | Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de> | 1298 | Updates by Kai Germaschewski <kai@tp1.ruhr-uni-bochum.de> |
1232 | Updates by Sam Ravnborg <sam@ravnborg.org> | 1299 | Updates by Sam Ravnborg <sam@ravnborg.org> |
1233 | Language QA by Jan Engelhardt <jengelh@gmx.de> | 1300 | Language QA by Jan Engelhardt <jengelh@gmx.de> |
1234 | 1301 | ||
1235 | === 10 TODO | 1302 | === 11 TODO |
1236 | 1303 | ||
1237 | - Describe how kbuild supports shipped files with _shipped. | 1304 | - Describe how kbuild supports shipped files with _shipped. |
1238 | - Generating offset header files. | 1305 | - Generating offset header files. |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 2895ce29dea5..6172e4360f60 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -153,60 +153,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
153 | 1,0: use 1st APIC table | 153 | 1,0: use 1st APIC table |
154 | default: 0 | 154 | default: 0 |
155 | 155 | ||
156 | acpi_sleep= [HW,ACPI] Sleep options | ||
157 | Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, | ||
158 | old_ordering, s4_nonvs } | ||
159 | See Documentation/power/video.txt for information on | ||
160 | s3_bios and s3_mode. | ||
161 | s3_beep is for debugging; it makes the PC's speaker beep | ||
162 | as soon as the kernel's real-mode entry point is called. | ||
163 | s4_nohwsig prevents ACPI hardware signature from being | ||
164 | used during resume from hibernation. | ||
165 | old_ordering causes the ACPI 1.0 ordering of the _PTS | ||
166 | control method, with respect to putting devices into | ||
167 | low power states, to be enforced (the ACPI 2.0 ordering | ||
168 | of _PTS is used by default). | ||
169 | s4_nonvs prevents the kernel from saving/restoring the | ||
170 | ACPI NVS memory during hibernation. | ||
171 | |||
172 | acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode | ||
173 | Format: { level | edge | high | low } | ||
174 | |||
175 | acpi_irq_balance [HW,ACPI] | ||
176 | ACPI will balance active IRQs | ||
177 | default in APIC mode | ||
178 | |||
179 | acpi_irq_nobalance [HW,ACPI] | ||
180 | ACPI will not move active IRQs (default) | ||
181 | default in PIC mode | ||
182 | |||
183 | acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for | ||
184 | use by PCI | ||
185 | Format: <irq>,<irq>... | ||
186 | |||
187 | acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA | ||
188 | Format: <irq>,<irq>... | ||
189 | |||
190 | acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT | ||
191 | |||
192 | acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS | ||
193 | Format: To spoof as Windows 98: ="Microsoft Windows" | ||
194 | |||
195 | acpi_osi= [HW,ACPI] Modify list of supported OS interface strings | ||
196 | acpi_osi="string1" # add string1 -- only one string | ||
197 | acpi_osi="!string2" # remove built-in string2 | ||
198 | acpi_osi= # disable all strings | ||
199 | |||
200 | acpi_serialize [HW,ACPI] force serialization of AML methods | ||
201 | |||
202 | acpi_skip_timer_override [HW,ACPI] | ||
203 | Recognize and ignore IRQ0/pin2 Interrupt Override. | ||
204 | For broken nForce2 BIOS resulting in XT-PIC timer. | ||
205 | acpi_use_timer_override [HW,ACPI] | ||
206 | Use timer override. For some broken Nvidia NF5 boards | ||
207 | that require a timer override, but don't have | ||
208 | HPET | ||
209 | |||
210 | acpi_backlight= [HW,ACPI] | 156 | acpi_backlight= [HW,ACPI] |
211 | acpi_backlight=vendor | 157 | acpi_backlight=vendor |
212 | acpi_backlight=video | 158 | acpi_backlight=video |
@@ -214,11 +160,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
214 | (e.g. thinkpad_acpi, sony_acpi, etc.) instead | 160 | (e.g. thinkpad_acpi, sony_acpi, etc.) instead |
215 | of the ACPI video.ko driver. | 161 | of the ACPI video.ko driver. |
216 | 162 | ||
217 | acpi_display_output= [HW,ACPI] | ||
218 | acpi_display_output=vendor | ||
219 | acpi_display_output=video | ||
220 | See above. | ||
221 | |||
222 | acpi.debug_layer= [HW,ACPI,ACPI_DEBUG] | 163 | acpi.debug_layer= [HW,ACPI,ACPI_DEBUG] |
223 | acpi.debug_level= [HW,ACPI,ACPI_DEBUG] | 164 | acpi.debug_level= [HW,ACPI,ACPI_DEBUG] |
224 | Format: <int> | 165 | Format: <int> |
@@ -247,6 +188,41 @@ and is between 256 and 4096 characters. It is defined in the file | |||
247 | unusable. The "log_buf_len" parameter may be useful | 188 | unusable. The "log_buf_len" parameter may be useful |
248 | if you need to capture more output. | 189 | if you need to capture more output. |
249 | 190 | ||
191 | acpi_display_output= [HW,ACPI] | ||
192 | acpi_display_output=vendor | ||
193 | acpi_display_output=video | ||
194 | See above. | ||
195 | |||
196 | acpi_irq_balance [HW,ACPI] | ||
197 | ACPI will balance active IRQs | ||
198 | default in APIC mode | ||
199 | |||
200 | acpi_irq_nobalance [HW,ACPI] | ||
201 | ACPI will not move active IRQs (default) | ||
202 | default in PIC mode | ||
203 | |||
204 | acpi_irq_isa= [HW,ACPI] If irq_balance, mark listed IRQs used by ISA | ||
205 | Format: <irq>,<irq>... | ||
206 | |||
207 | acpi_irq_pci= [HW,ACPI] If irq_balance, clear listed IRQs for | ||
208 | use by PCI | ||
209 | Format: <irq>,<irq>... | ||
210 | |||
211 | acpi_no_auto_ssdt [HW,ACPI] Disable automatic loading of SSDT | ||
212 | |||
213 | acpi_os_name= [HW,ACPI] Tell ACPI BIOS the name of the OS | ||
214 | Format: To spoof as Windows 98: ="Microsoft Windows" | ||
215 | |||
216 | acpi_osi= [HW,ACPI] Modify list of supported OS interface strings | ||
217 | acpi_osi="string1" # add string1 -- only one string | ||
218 | acpi_osi="!string2" # remove built-in string2 | ||
219 | acpi_osi= # disable all strings | ||
220 | |||
221 | acpi_pm_good [X86-32,X86-64] | ||
222 | Override the pmtimer bug detection: force the kernel | ||
223 | to assume that this machine's pmtimer latches its value | ||
224 | and always returns good values. | ||
225 | |||
250 | acpi.power_nocheck= [HW,ACPI] | 226 | acpi.power_nocheck= [HW,ACPI] |
251 | Format: 1/0 enable/disable the check of power state. | 227 | Format: 1/0 enable/disable the check of power state. |
252 | On some bogus BIOS the _PSC object/_STA object of | 228 | On some bogus BIOS the _PSC object/_STA object of |
@@ -255,11 +231,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
255 | power state again in power transition. | 231 | power state again in power transition. |
256 | 1 : disable the power state check | 232 | 1 : disable the power state check |
257 | 233 | ||
258 | acpi_pm_good [X86-32,X86-64] | ||
259 | Override the pmtimer bug detection: force the kernel | ||
260 | to assume that this machine's pmtimer latches its value | ||
261 | and always returns good values. | ||
262 | |||
263 | acpi_enforce_resources= [ACPI] | 234 | acpi_enforce_resources= [ACPI] |
264 | { strict | lax | no } | 235 | { strict | lax | no } |
265 | Check for resource conflicts between native drivers | 236 | Check for resource conflicts between native drivers |
@@ -276,22 +247,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
276 | no: ACPI OperationRegions are not marked as reserved, | 247 | no: ACPI OperationRegions are not marked as reserved, |
277 | no further checks are performed. | 248 | no further checks are performed. |
278 | 249 | ||
279 | agp= [AGP] | ||
280 | { off | try_unsupported } | ||
281 | off: disable AGP support | ||
282 | try_unsupported: try to drive unsupported chipsets | ||
283 | (may crash computer or cause data corruption) | ||
284 | |||
285 | enable_timer_pin_1 [i386,x86-64] | ||
286 | Enable PIN 1 of APIC timer | ||
287 | Can be useful to work around chipset bugs | ||
288 | (in particular on some ATI chipsets). | ||
289 | The kernel tries to set a reasonable default. | ||
290 | |||
291 | disable_timer_pin_1 [i386,x86-64] | ||
292 | Disable PIN 1 of APIC timer | ||
293 | Can be useful to work around chipset bugs. | ||
294 | |||
295 | ad1848= [HW,OSS] | 250 | ad1848= [HW,OSS] |
296 | Format: <io>,<irq>,<dma>,<dma2>,<type> | 251 | Format: <io>,<irq>,<dma>,<dma2>,<type> |
297 | 252 | ||
@@ -305,6 +260,12 @@ and is between 256 and 4096 characters. It is defined in the file | |||
305 | Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> | 260 | Format: <io>,<irq>,<dma>,<mss_io>,<mpu_io>,<mpu_irq> |
306 | See also header of sound/oss/aedsp16.c. | 261 | See also header of sound/oss/aedsp16.c. |
307 | 262 | ||
263 | agp= [AGP] | ||
264 | { off | try_unsupported } | ||
265 | off: disable AGP support | ||
266 | try_unsupported: try to drive unsupported chipsets | ||
267 | (may crash computer or cause data corruption) | ||
268 | |||
308 | aha152x= [HW,SCSI] | 269 | aha152x= [HW,SCSI] |
309 | See Documentation/scsi/aha152x.txt. | 270 | See Documentation/scsi/aha152x.txt. |
310 | 271 | ||
@@ -432,12 +393,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
432 | possible to determine what the correct size should be. | 393 | possible to determine what the correct size should be. |
433 | This option provides an override for these situations. | 394 | This option provides an override for these situations. |
434 | 395 | ||
435 | security= [SECURITY] Choose a security module to enable at boot. | ||
436 | If this boot parameter is not specified, only the first | ||
437 | security module asking for security registration will be | ||
438 | loaded. An invalid security module name will be treated | ||
439 | as if no module has been chosen. | ||
440 | |||
441 | capability.disable= | 396 | capability.disable= |
442 | [SECURITY] Disable capabilities. This would normally | 397 | [SECURITY] Disable capabilities. This would normally |
443 | be used only if an alternative security model is to be | 398 | be used only if an alternative security model is to be |
@@ -509,24 +464,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
509 | Range: 0 - 8192 | 464 | Range: 0 - 8192 |
510 | Default: 64 | 465 | Default: 64 |
511 | 466 | ||
512 | dma_debug=off If the kernel is compiled with DMA_API_DEBUG support | ||
513 | this option disables the debugging code at boot. | ||
514 | |||
515 | dma_debug_entries=<number> | ||
516 | This option allows to tune the number of preallocated | ||
517 | entries for DMA-API debugging code. One entry is | ||
518 | required per DMA-API allocation. Use this if the | ||
519 | DMA-API debugging code disables itself because the | ||
520 | architectural default is too low. | ||
521 | |||
522 | hpet= [X86-32,HPET] option to control HPET usage | ||
523 | Format: { enable (default) | disable | force | | ||
524 | verbose } | ||
525 | disable: disable HPET and use PIT instead | ||
526 | force: allow force enabled of undocumented chips (ICH4, | ||
527 | VIA, nVidia) | ||
528 | verbose: show contents of HPET registers during setup | ||
529 | |||
530 | com20020= [HW,NET] ARCnet - COM20020 chipset | 467 | com20020= [HW,NET] ARCnet - COM20020 chipset |
531 | Format: | 468 | Format: |
532 | <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]] | 469 | <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]] |
@@ -570,23 +507,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
570 | console=brl,ttyS0 | 507 | console=brl,ttyS0 |
571 | For now, only VisioBraille is supported. | 508 | For now, only VisioBraille is supported. |
572 | 509 | ||
573 | earlycon= [KNL] Output early console device and options. | ||
574 | uart[8250],io,<addr>[,options] | ||
575 | uart[8250],mmio,<addr>[,options] | ||
576 | Start an early, polled-mode console on the 8250/16550 | ||
577 | UART at the specified I/O port or MMIO address. | ||
578 | The options are the same as for ttyS, above. | ||
579 | |||
580 | no_console_suspend | ||
581 | [HW] Never suspend the console | ||
582 | Disable suspending of consoles during suspend and | ||
583 | hibernate operations. Once disabled, debugging | ||
584 | messages can reach various consoles while the rest | ||
585 | of the system is being put to sleep (ie, while | ||
586 | debugging driver suspend/resume hooks). This may | ||
587 | not work reliably with all consoles, but is known | ||
588 | to work with serial and VGA consoles. | ||
589 | |||
590 | coredump_filter= | 510 | coredump_filter= |
591 | [KNL] Change the default value for | 511 | [KNL] Change the default value for |
592 | /proc/<pid>/coredump_filter. | 512 | /proc/<pid>/coredump_filter. |
@@ -643,30 +563,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
643 | Format: <area>[,<node>] | 563 | Format: <area>[,<node>] |
644 | See also Documentation/networking/decnet.txt. | 564 | See also Documentation/networking/decnet.txt. |
645 | 565 | ||
646 | vt.default_blu= [VT] | 566 | default_hugepagesz= |
647 | Format: <blue0>,<blue1>,<blue2>,...,<blue15> | 567 | [same as hugepagesz=] The size of the default |
648 | Change the default blue palette of the console. | 568 | HugeTLB page size. This is the size represented by |
649 | This is a 16-member array composed of values | 569 | the legacy /proc/ hugepages APIs, used for SHM, and |
650 | ranging from 0-255. | 570 | default size when mounting hugetlbfs filesystems. |
651 | 571 | Defaults to the default architecture's huge page size | |
652 | vt.default_grn= [VT] | 572 | if not specified. |
653 | Format: <green0>,<green1>,<green2>,...,<green15> | ||
654 | Change the default green palette of the console. | ||
655 | This is a 16-member array composed of values | ||
656 | ranging from 0-255. | ||
657 | |||
658 | vt.default_red= [VT] | ||
659 | Format: <red0>,<red1>,<red2>,...,<red15> | ||
660 | Change the default red palette of the console. | ||
661 | This is a 16-member array composed of values | ||
662 | ranging from 0-255. | ||
663 | |||
664 | vt.default_utf8= | ||
665 | [VT] | ||
666 | Format=<0|1> | ||
667 | Set system-wide default UTF-8 mode for all tty's. | ||
668 | Default is 1, i.e. UTF-8 mode is enabled for all | ||
669 | newly opened terminals. | ||
670 | 573 | ||
671 | dhash_entries= [KNL] | 574 | dhash_entries= [KNL] |
672 | Set number of hash buckets for dentry cache. | 575 | Set number of hash buckets for dentry cache. |
@@ -679,27 +582,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
679 | Documentation/serial/digiepca.txt. | 582 | Documentation/serial/digiepca.txt. |
680 | 583 | ||
681 | disable_mtrr_cleanup [X86] | 584 | disable_mtrr_cleanup [X86] |
682 | enable_mtrr_cleanup [X86] | ||
683 | The kernel tries to adjust MTRR layout from continuous | 585 | The kernel tries to adjust MTRR layout from continuous |
684 | to discrete, to make X server driver able to add WB | 586 | to discrete, to make X server driver able to add WB |
685 | entry later. This parameter enables/disables that. | 587 | entry later. This parameter disables that. |
686 | |||
687 | mtrr_chunk_size=nn[KMG] [X86] | ||
688 | used for mtrr cleanup. It is largest continous chunk | ||
689 | that could hold holes aka. UC entries. | ||
690 | |||
691 | mtrr_gran_size=nn[KMG] [X86] | ||
692 | Used for mtrr cleanup. It is granularity of mtrr block. | ||
693 | Default is 1. | ||
694 | Large value could prevent small alignment from | ||
695 | using up MTRRs. | ||
696 | |||
697 | mtrr_spare_reg_nr=n [X86] | ||
698 | Format: <integer> | ||
699 | Range: 0,7 : spare reg number | ||
700 | Default : 1 | ||
701 | Used for mtrr cleanup. It is spare mtrr entries number. | ||
702 | Set to 2 or more if your graphical card needs more. | ||
703 | 588 | ||
704 | disable_mtrr_trim [X86, Intel and AMD only] | 589 | disable_mtrr_trim [X86, Intel and AMD only] |
705 | By default the kernel will trim any uncacheable | 590 | By default the kernel will trim any uncacheable |
@@ -707,12 +592,38 @@ and is between 256 and 4096 characters. It is defined in the file | |||
707 | MTRR settings. This parameter disables that behavior, | 592 | MTRR settings. This parameter disables that behavior, |
708 | possibly causing your machine to run very slowly. | 593 | possibly causing your machine to run very slowly. |
709 | 594 | ||
595 | disable_timer_pin_1 [i386,x86-64] | ||
596 | Disable PIN 1 of APIC timer | ||
597 | Can be useful to work around chipset bugs. | ||
598 | |||
710 | dmasound= [HW,OSS] Sound subsystem buffers | 599 | dmasound= [HW,OSS] Sound subsystem buffers |
711 | 600 | ||
601 | dma_debug=off If the kernel is compiled with DMA_API_DEBUG support, | ||
602 | this option disables the debugging code at boot. | ||
603 | |||
604 | dma_debug_entries=<number> | ||
605 | This option allows to tune the number of preallocated | ||
606 | entries for DMA-API debugging code. One entry is | ||
607 | required per DMA-API allocation. Use this if the | ||
608 | DMA-API debugging code disables itself because the | ||
609 | architectural default is too low. | ||
610 | |||
712 | dscc4.setup= [NET] | 611 | dscc4.setup= [NET] |
713 | 612 | ||
714 | dtc3181e= [HW,SCSI] | 613 | dtc3181e= [HW,SCSI] |
715 | 614 | ||
615 | dynamic_printk Enables pr_debug()/dev_dbg() calls if | ||
616 | CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled. | ||
617 | These can also be switched on/off via | ||
618 | <debugfs>/dynamic_printk/modules | ||
619 | |||
620 | earlycon= [KNL] Output early console device and options. | ||
621 | uart[8250],io,<addr>[,options] | ||
622 | uart[8250],mmio,<addr>[,options] | ||
623 | Start an early, polled-mode console on the 8250/16550 | ||
624 | UART at the specified I/O port or MMIO address. | ||
625 | The options are the same as for ttyS, above. | ||
626 | |||
716 | earlyprintk= [X86-32,X86-64,SH,BLACKFIN] | 627 | earlyprintk= [X86-32,X86-64,SH,BLACKFIN] |
717 | earlyprintk=vga | 628 | earlyprintk=vga |
718 | earlyprintk=serial[,ttySn[,baudrate]] | 629 | earlyprintk=serial[,ttySn[,baudrate]] |
@@ -754,6 +665,17 @@ and is between 256 and 4096 characters. It is defined in the file | |||
754 | pass this option to capture kernel. | 665 | pass this option to capture kernel. |
755 | See Documentation/kdump/kdump.txt for details. | 666 | See Documentation/kdump/kdump.txt for details. |
756 | 667 | ||
668 | enable_mtrr_cleanup [X86] | ||
669 | The kernel tries to adjust MTRR layout from continuous | ||
670 | to discrete, to make X server driver able to add WB | ||
671 | entry later. This parameter enables that. | ||
672 | |||
673 | enable_timer_pin_1 [i386,x86-64] | ||
674 | Enable PIN 1 of APIC timer | ||
675 | Can be useful to work around chipset bugs | ||
676 | (in particular on some ATI chipsets). | ||
677 | The kernel tries to set a reasonable default. | ||
678 | |||
757 | enforcing [SELINUX] Set initial enforcing status. | 679 | enforcing [SELINUX] Set initial enforcing status. |
758 | Format: {"0" | "1"} | 680 | Format: {"0" | "1"} |
759 | See security/selinux/Kconfig help text. | 681 | See security/selinux/Kconfig help text. |
@@ -841,6 +763,16 @@ and is between 256 and 4096 characters. It is defined in the file | |||
841 | hisax= [HW,ISDN] | 763 | hisax= [HW,ISDN] |
842 | See Documentation/isdn/README.HiSax. | 764 | See Documentation/isdn/README.HiSax. |
843 | 765 | ||
766 | hlt [BUGS=ARM,SH] | ||
767 | |||
768 | hpet= [X86-32,HPET] option to control HPET usage | ||
769 | Format: { enable (default) | disable | force | | ||
770 | verbose } | ||
771 | disable: disable HPET and use PIT instead | ||
772 | force: allow force-enabling of undocumented chips (ICH4, | ||
773 | VIA, nVidia) | ||
774 | verbose: show contents of HPET registers during setup | ||
775 | |||
844 | hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. | 776 | hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. |
845 | hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages. | 777 | hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages. |
846 | On x86-64 and powerpc, this option can be specified | 778 | On x86-64 and powerpc, this option can be specified |
@@ -850,15 +782,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
850 | (when the CPU supports the "pdpe1gb" cpuinfo flag) | 782 | (when the CPU supports the "pdpe1gb" cpuinfo flag) |
851 | Note that 1GB pages can only be allocated at boot time | 783 | Note that 1GB pages can only be allocated at boot time |
852 | using hugepages= and not freed afterwards. | 784 | using hugepages= and not freed afterwards. |
853 | default_hugepagesz= | ||
854 | [same as hugepagesz=] The size of the default | ||
855 | HugeTLB page size. This is the size represented by | ||
856 | the legacy /proc/ hugepages APIs, used for SHM, and | ||
857 | default size when mounting hugetlbfs filesystems. | ||
858 | Defaults to the default architecture's huge page size | ||
859 | if not specified. | ||
860 | |||
861 | hlt [BUGS=ARM,SH] | ||
862 | 785 | ||
863 | hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) | 786 | hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC) |
864 | terminal devices. Valid values: 0..8 | 787 | terminal devices. Valid values: 0..8 |
@@ -919,6 +842,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
919 | idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed | 842 | idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed |
920 | See Documentation/ide/ide.txt. | 843 | See Documentation/ide/ide.txt. |
921 | 844 | ||
845 | ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem | ||
846 | Claim all unknown PCI IDE storage controllers. | ||
847 | |||
922 | idle= [X86] | 848 | idle= [X86] |
923 | Format: idle=poll, idle=mwait, idle=halt, idle=nomwait | 849 | Format: idle=poll, idle=mwait, idle=halt, idle=nomwait |
924 | Poll forces a polling idle loop that can slightly | 850 | Poll forces a polling idle loop that can slightly |
@@ -934,9 +860,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
934 | In such case C2/C3 won't be used again. | 860 | In such case C2/C3 won't be used again. |
935 | idle=nomwait: Disable mwait for CPU C-states | 861 | idle=nomwait: Disable mwait for CPU C-states |
936 | 862 | ||
937 | ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem | ||
938 | Claim all unknown PCI IDE storage controllers. | ||
939 | |||
940 | ignore_loglevel [KNL] | 863 | ignore_loglevel [KNL] |
941 | Ignore loglevel setting - this will print /all/ | 864 | Ignore loglevel setting - this will print /all/ |
942 | kernel messages to the console. Useful for debugging. | 865 | kernel messages to the console. Useful for debugging. |
@@ -970,25 +893,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
970 | inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver | 893 | inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver |
971 | Format: <irq> | 894 | Format: <irq> |
972 | 895 | ||
973 | inttest= [IA64] | ||
974 | |||
975 | iomem= Disable strict checking of access to MMIO memory | ||
976 | strict regions from userspace. | ||
977 | relaxed | ||
978 | |||
979 | iommu= [x86] | ||
980 | off | ||
981 | force | ||
982 | noforce | ||
983 | biomerge | ||
984 | panic | ||
985 | nopanic | ||
986 | merge | ||
987 | nomerge | ||
988 | forcesac | ||
989 | soft | ||
990 | |||
991 | |||
992 | intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option | 896 | intel_iommu= [DMAR] Intel IOMMU driver (DMAR) option |
993 | on | 897 | on |
994 | Enable intel iommu driver. | 898 | Enable intel iommu driver. |
@@ -1012,6 +916,28 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1012 | result in a hardware IOTLB flush operation as opposed | 916 | result in a hardware IOTLB flush operation as opposed |
1013 | to batching them for performance. | 917 | to batching them for performance. |
1014 | 918 | ||
919 | inttest= [IA64] | ||
920 | |||
921 | iomem= Disable strict checking of access to MMIO memory | ||
922 | strict regions from userspace. | ||
923 | relaxed | ||
924 | |||
925 | iommu= [x86] | ||
926 | off | ||
927 | force | ||
928 | noforce | ||
929 | biomerge | ||
930 | panic | ||
931 | nopanic | ||
932 | merge | ||
933 | nomerge | ||
934 | forcesac | ||
935 | soft | ||
936 | |||
937 | io7= [HW] IO7 for Marvel based alpha systems | ||
938 | See comment before marvel_specify_io7 in | ||
939 | arch/alpha/kernel/core_marvel.c. | ||
940 | |||
1015 | io_delay= [X86-32,X86-64] I/O delay method | 941 | io_delay= [X86-32,X86-64] I/O delay method |
1016 | 0x80 | 942 | 0x80 |
1017 | Standard port 0x80 based delay | 943 | Standard port 0x80 based delay |
@@ -1022,10 +948,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1022 | none | 948 | none |
1023 | No delay | 949 | No delay |
1024 | 950 | ||
1025 | io7= [HW] IO7 for Marvel based alpha systems | ||
1026 | See comment before marvel_specify_io7 in | ||
1027 | arch/alpha/kernel/core_marvel.c. | ||
1028 | |||
1029 | ip= [IP_PNP] | 951 | ip= [IP_PNP] |
1030 | See Documentation/filesystems/nfsroot.txt. | 952 | See Documentation/filesystems/nfsroot.txt. |
1031 | 953 | ||
@@ -1036,12 +958,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1036 | ips= [HW,SCSI] Adaptec / IBM ServeRAID controller | 958 | ips= [HW,SCSI] Adaptec / IBM ServeRAID controller |
1037 | See header of drivers/scsi/ips.c. | 959 | See header of drivers/scsi/ips.c. |
1038 | 960 | ||
1039 | ports= [IP_VS_FTP] IPVS ftp helper module | ||
1040 | Default is 21. | ||
1041 | Up to 8 (IP_VS_APP_MAX_PORTS) ports | ||
1042 | may be specified. | ||
1043 | Format: <port>,<port>.... | ||
1044 | |||
1045 | irqfixup [HW] | 961 | irqfixup [HW] |
1046 | When an interrupt is not handled search all handlers | 962 | When an interrupt is not handled search all handlers |
1047 | for it. Intended to get systems with badly broken | 963 | for it. Intended to get systems with badly broken |
@@ -1082,6 +998,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1082 | js= [HW,JOY] Analog joystick | 998 | js= [HW,JOY] Analog joystick |
1083 | See Documentation/input/joystick.txt. | 999 | See Documentation/input/joystick.txt. |
1084 | 1000 | ||
1001 | keepinitrd [HW,ARM] | ||
1002 | |||
1085 | kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter | 1003 | kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter |
1086 | specifies the amount of memory usable by the kernel | 1004 | specifies the amount of memory usable by the kernel |
1087 | for non-movable allocations. The requested amount is | 1005 | for non-movable allocations. The requested amount is |
@@ -1107,21 +1025,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1107 | higher than default (KMEMTRACE_N_SUBBUFS in code) if | 1025 | higher than default (KMEMTRACE_N_SUBBUFS in code) if |
1108 | you experience buffer overruns. | 1026 | you experience buffer overruns. |
1109 | 1027 | ||
1110 | movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter | ||
1111 | is similar to kernelcore except it specifies the | ||
1112 | amount of memory used for migratable allocations. | ||
1113 | If both kernelcore and movablecore is specified, | ||
1114 | then kernelcore will be at *least* the specified | ||
1115 | value but may be more. If movablecore on its own | ||
1116 | is specified, the administrator must be careful | ||
1117 | that the amount of memory usable for all allocations | ||
1118 | is not too small. | ||
1119 | |||
1120 | keepinitrd [HW,ARM] | ||
1121 | |||
1122 | kstack=N [X86-32,X86-64] Print N words from the kernel stack | ||
1123 | in oops dumps. | ||
1124 | |||
1125 | kgdboc= [HW] kgdb over consoles. | 1028 | kgdboc= [HW] kgdb over consoles. |
1126 | Requires a tty driver that supports console polling. | 1029 | Requires a tty driver that supports console polling. |
1127 | (only serial supported for now) | 1030 | (only serial supported for now) |
@@ -1131,6 +1034,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1131 | Configure the RouterBoard 532 series on-chip | 1034 | Configure the RouterBoard 532 series on-chip |
1132 | Ethernet adapter MAC address. | 1035 | Ethernet adapter MAC address. |
1133 | 1036 | ||
1037 | kstack=N [X86-32,X86-64] Print N words from the kernel stack | ||
1038 | in oops dumps. | ||
1039 | |||
1134 | l2cr= [PPC] | 1040 | l2cr= [PPC] |
1135 | 1041 | ||
1136 | l3cr= [PPC] | 1042 | l3cr= [PPC] |
@@ -1276,9 +1182,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1276 | (machvec) in a generic kernel. | 1182 | (machvec) in a generic kernel. |
1277 | Example: machvec=hpzx1_swiotlb | 1183 | Example: machvec=hpzx1_swiotlb |
1278 | 1184 | ||
1279 | max_loop= [LOOP] Maximum number of loopback devices that can | 1185 | max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater |
1280 | be mounted | 1186 | than or equal to this physical address is ignored. |
1281 | Format: <1-256> | ||
1282 | 1187 | ||
1283 | maxcpus= [SMP] Maximum number of processors that an SMP kernel | 1188 | maxcpus= [SMP] Maximum number of processors that an SMP kernel |
1284 | should make use of. maxcpus=n : n >= 0 limits the | 1189 | should make use of. maxcpus=n : n >= 0 limits the |
@@ -1286,8 +1191,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1286 | it is equivalent to "nosmp", which also disables | 1191 | it is equivalent to "nosmp", which also disables |
1287 | the IO APIC. | 1192 | the IO APIC. |
1288 | 1193 | ||
1289 | max_addr=nn[KMG] [KNL,BOOT,ia64] All physical memory greater than | 1194 | max_loop= [LOOP] Maximum number of loopback devices that can |
1290 | or equal to this physical address is ignored. | 1195 | be mounted |
1196 | Format: <1-256> | ||
1291 | 1197 | ||
1292 | max_luns= [SCSI] Maximum number of LUNs to probe. | 1198 | max_luns= [SCSI] Maximum number of LUNs to probe. |
1293 | Should be between 1 and 2^32-1. | 1199 | Should be between 1 and 2^32-1. |
@@ -1414,6 +1320,16 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1414 | mousedev.yres= [MOUSE] Vertical screen resolution, used for devices | 1320 | mousedev.yres= [MOUSE] Vertical screen resolution, used for devices |
1415 | reporting absolute coordinates, such as tablets | 1321 | reporting absolute coordinates, such as tablets |
1416 | 1322 | ||
1323 | movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter | ||
1324 | is similar to kernelcore except it specifies the | ||
1325 | amount of memory used for migratable allocations. | ||
1326 | If both kernelcore and movablecore are specified, | ||
1327 | then kernelcore will be at *least* the specified | ||
1328 | value but may be more. If movablecore on its own | ||
1329 | is specified, the administrator must be careful | ||
1330 | that the amount of memory usable for all allocations | ||
1331 | is not too small. | ||
1332 | |||
1417 | mpu401= [HW,OSS] | 1333 | mpu401= [HW,OSS] |
1418 | Format: <io>,<irq> | 1334 | Format: <io>,<irq> |
1419 | 1335 | ||
@@ -1435,6 +1351,23 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1435 | [HW] Make the MicroTouch USB driver use raw coordinates | 1351 | [HW] Make the MicroTouch USB driver use raw coordinates |
1436 | ('y', default) or cooked coordinates ('n') | 1352 | ('y', default) or cooked coordinates ('n') |
1437 | 1353 | ||
1354 | mtrr_chunk_size=nn[KMG] [X86] | ||
1355 | Used for mtrr cleanup. It is the largest continuous chunk | ||
1356 | that could hold holes aka. UC entries. | ||
1357 | |||
1358 | mtrr_gran_size=nn[KMG] [X86] | ||
1359 | Used for mtrr cleanup. It is the granularity of the mtrr block. | ||
1360 | Default is 1. | ||
1361 | Large value could prevent small alignment from | ||
1362 | using up MTRRs. | ||
1363 | |||
1364 | mtrr_spare_reg_nr=n [X86] | ||
1365 | Format: <integer> | ||
1366 | Range: 0,7 : spare reg number | ||
1367 | Default : 1 | ||
1368 | Used for mtrr cleanup. It is the number of spare mtrr entries. | ||
1369 | Set to 2 or more if your graphical card needs more. | ||
1370 | |||
1438 | n2= [NET] SDL Inc. RISCom/N2 synchronous serial card | 1371 | n2= [NET] SDL Inc. RISCom/N2 synchronous serial card |
1439 | 1372 | ||
1440 | NCR_D700= [HW,SCSI] | 1373 | NCR_D700= [HW,SCSI] |
@@ -1495,11 +1428,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1495 | 0 - turn nmi_watchdog off | 1428 | 0 - turn nmi_watchdog off |
1496 | 1 - use the IO-APIC timer for the NMI watchdog | 1429 | 1 - use the IO-APIC timer for the NMI watchdog |
1497 | 2 - use the local APIC for the NMI watchdog using | 1430 | 2 - use the local APIC for the NMI watchdog using |
1498 | a performance counter. Note: This will use one performance | 1431 | a performance counter. Note: This will use one |
1499 | counter and the local APIC's performance vector. | 1432 | performance counter and the local APIC's performance |
1500 | When panic is specified panic when an NMI watchdog timeout occurs. | 1433 | vector. |
1501 | This is useful when you use a panic=... timeout and need the box | 1434 | When panic is specified, panic when an NMI watchdog |
1502 | quickly up again. | 1435 | timeout occurs. |
1436 | This is useful when you use a panic=... timeout and | ||
1437 | need the box quickly up again. | ||
1503 | Instead of 1 and 2 it is possible to use the following | 1438 | Instead of 1 and 2 it is possible to use the following |
1504 | symbolic names: lapic and ioapic | 1439 | symbolic names: lapic and ioapic |
1505 | Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic | 1440 | Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic |
@@ -1508,6 +1443,16 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1508 | emulation library even if a 387 maths coprocessor | 1443 | emulation library even if a 387 maths coprocessor |
1509 | is present. | 1444 | is present. |
1510 | 1445 | ||
1446 | no_console_suspend | ||
1447 | [HW] Never suspend the console | ||
1448 | Disable suspending of consoles during suspend and | ||
1449 | hibernate operations. Once disabled, debugging | ||
1450 | messages can reach various consoles while the rest | ||
1451 | of the system is being put to sleep (ie, while | ||
1452 | debugging driver suspend/resume hooks). This may | ||
1453 | not work reliably with all consoles, but is known | ||
1454 | to work with serial and VGA consoles. | ||
1455 | |||
1511 | noaliencache [MM, NUMA, SLAB] Disables the allocation of alien | 1456 | noaliencache [MM, NUMA, SLAB] Disables the allocation of alien |
1512 | caches in the slab allocator. Saves per-node memory, | 1457 | caches in the slab allocator. Saves per-node memory, |
1513 | but will impact performance. | 1458 | but will impact performance. |
@@ -1522,6 +1467,8 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1522 | 1467 | ||
1523 | nocache [ARM] | 1468 | nocache [ARM] |
1524 | 1469 | ||
1470 | noclflush [BUGS=X86] Don't use the CLFLUSH instruction | ||
1471 | |||
1525 | nodelayacct [KNL] Disable per-task delay accounting | 1472 | nodelayacct [KNL] Disable per-task delay accounting |
1526 | 1473 | ||
1527 | nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects. | 1474 | nodisconnect [HW,SCSI,M68K] Disables SCSI disconnects. |
@@ -1550,8 +1497,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1550 | register save and restore. The kernel will only save | 1497 | register save and restore. The kernel will only save |
1551 | legacy floating-point registers on task switch. | 1498 | legacy floating-point registers on task switch. |
1552 | 1499 | ||
1553 | noclflush [BUGS=X86] Don't use the CLFLUSH instruction | ||
1554 | |||
1555 | nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or | 1500 | nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or |
1556 | wfi(ARM) instruction doesn't work correctly and not to | 1501 | wfi(ARM) instruction doesn't work correctly and not to |
1557 | use it. This is also useful when using JTAG debugger. | 1502 | use it. This is also useful when using JTAG debugger. |
@@ -1596,12 +1541,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1596 | 1541 | ||
1597 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. | 1542 | nolapic_timer [X86-32,APIC] Do not use the local APIC timer. |
1598 | 1543 | ||
1599 | nox2apic [X86-64,APIC] Do not enable x2APIC mode. | ||
1600 | |||
1601 | x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of | ||
1602 | default x2apic cluster mode on platforms | ||
1603 | supporting x2apic. | ||
1604 | |||
1605 | noltlbs [PPC] Do not use large page/tlb entries for kernel | 1544 | noltlbs [PPC] Do not use large page/tlb entries for kernel |
1606 | lowmem mapping on PPC40x. | 1545 | lowmem mapping on PPC40x. |
1607 | 1546 | ||
@@ -1612,6 +1551,9 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1612 | nomfgpt [X86-32] Disable Multi-Function General Purpose | 1551 | nomfgpt [X86-32] Disable Multi-Function General Purpose |
1613 | Timer usage (for AMD Geode machines). | 1552 | Timer usage (for AMD Geode machines). |
1614 | 1553 | ||
1554 | norandmaps Don't use address space randomization. Equivalent to | ||
1555 | echo 0 > /proc/sys/kernel/randomize_va_space | ||
1556 | |||
1615 | noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops | 1557 | noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops |
1616 | 1558 | ||
1617 | noreplace-smp [X86-32,SMP] Don't replace SMP instructions | 1559 | noreplace-smp [X86-32,SMP] Don't replace SMP instructions |
@@ -1650,13 +1592,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1650 | purges which is reported from either PAL_VM_SUMMARY or | 1592 | purges which is reported from either PAL_VM_SUMMARY or |
1651 | SAL PALO. | 1593 | SAL PALO. |
1652 | 1594 | ||
1595 | nr_uarts= [SERIAL] maximum number of UARTs to be registered. | ||
1596 | |||
1653 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. | 1597 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. |
1654 | one of ['zone', 'node', 'default'] can be specified | 1598 | one of ['zone', 'node', 'default'] can be specified |
1655 | This can be set from sysctl after boot. | 1599 | This can be set from sysctl after boot. |
1656 | See Documentation/sysctl/vm.txt for details. | 1600 | See Documentation/sysctl/vm.txt for details. |
1657 | 1601 | ||
1658 | nr_uarts= [SERIAL] maximum number of UARTs to be registered. | ||
1659 | |||
1660 | ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. | 1602 | ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. |
1661 | See Documentation/debugging-via-ohci1394.txt for more | 1603 | See Documentation/debugging-via-ohci1394.txt for more |
1662 | info. | 1604 | info. |
@@ -1905,6 +1847,14 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1905 | printk.time= Show timing data prefixed to each printk message line | 1847 | printk.time= Show timing data prefixed to each printk message line |
1906 | Format: <bool> (1/Y/y=enable, 0/N/n=disable) | 1848 | Format: <bool> (1/Y/y=enable, 0/N/n=disable) |
1907 | 1849 | ||
1850 | processor.max_cstate= [HW,ACPI] | ||
1851 | Limit processor to maximum C-state | ||
1852 | max_cstate=9 overrides any DMI blacklist limit. | ||
1853 | |||
1854 | processor.nocst [HW,ACPI] | ||
1855 | Ignore the _CST method to determine C-states, | ||
1856 | instead using the legacy FADT method | ||
1857 | |||
1908 | profile= [KNL] Enable kernel profiling via /proc/profile | 1858 | profile= [KNL] Enable kernel profiling via /proc/profile |
1909 | Format: [schedule,]<number> | 1859 | Format: [schedule,]<number> |
1910 | Param: "schedule" - profile schedule points. | 1860 | Param: "schedule" - profile schedule points. |
@@ -1914,14 +1864,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
1914 | Requires CONFIG_SCHEDSTATS | 1864 | Requires CONFIG_SCHEDSTATS |
1915 | Param: "kvm" - profile VM exits. | 1865 | Param: "kvm" - profile VM exits. |
1916 | 1866 | ||
1917 | processor.max_cstate= [HW,ACPI] | ||
1918 | Limit processor to maximum C-state | ||
1919 | max_cstate=9 overrides any DMI blacklist limit. | ||
1920 | |||
1921 | processor.nocst [HW,ACPI] | ||
1922 | Ignore the _CST method to determine C-states, | ||
1923 | instead using the legacy FADT method | ||
1924 | |||
1925 | prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk | 1867 | prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk |
1926 | before loading. | 1868 | before loading. |
1927 | See Documentation/blockdev/ramdisk.txt. | 1869 | See Documentation/blockdev/ramdisk.txt. |
@@ -2075,7 +2017,13 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2075 | allowing boot to proceed. none ignores them, expecting | 2017 | allowing boot to proceed. none ignores them, expecting |
2076 | user space to do the scan. | 2018 | user space to do the scan. |
2077 | 2019 | ||
2078 | selinux [SELINUX] Disable or enable SELinux at boot time. | 2020 | security= [SECURITY] Choose a security module to enable at boot. |
2021 | If this boot parameter is not specified, only the first | ||
2022 | security module asking for security registration will be | ||
2023 | loaded. An invalid security module name will be treated | ||
2024 | as if no module has been chosen. | ||
2025 | |||
2026 | selinux= [SELINUX] Disable or enable SELinux at boot time. | ||
2079 | Format: { "0" | "1" } | 2027 | Format: { "0" | "1" } |
2080 | See security/selinux/Kconfig help text. | 2028 | See security/selinux/Kconfig help text. |
2081 | 0 -- disable. | 2029 | 0 -- disable. |
@@ -2499,9 +2447,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2499 | medium is write-protected). | 2447 | medium is write-protected). |
2500 | Example: quirks=0419:aaf5:rl,0421:0433:rc | 2448 | Example: quirks=0419:aaf5:rl,0421:0433:rc |
2501 | 2449 | ||
2502 | add_efi_memmap [EFI; x86-32,X86-64] Include EFI memory map in | ||
2503 | kernel's map of available physical RAM. | ||
2504 | |||
2505 | vdso= [X86-32,SH,x86-64] | 2450 | vdso= [X86-32,SH,x86-64] |
2506 | vdso=2: enable compat VDSO (default with COMPAT_VDSO) | 2451 | vdso=2: enable compat VDSO (default with COMPAT_VDSO) |
2507 | vdso=1: enable VDSO (default) | 2452 | vdso=1: enable VDSO (default) |
@@ -2540,6 +2485,31 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2540 | vmpoff= [KNL,S390] Perform z/VM CP command after power off. | 2485 | vmpoff= [KNL,S390] Perform z/VM CP command after power off. |
2541 | Format: <command> | 2486 | Format: <command> |
2542 | 2487 | ||
2488 | vt.default_blu= [VT] | ||
2489 | Format: <blue0>,<blue1>,<blue2>,...,<blue15> | ||
2490 | Change the default blue palette of the console. | ||
2491 | This is a 16-member array composed of values | ||
2492 | ranging from 0-255. | ||
2493 | |||
2494 | vt.default_grn= [VT] | ||
2495 | Format: <green0>,<green1>,<green2>,...,<green15> | ||
2496 | Change the default green palette of the console. | ||
2497 | This is a 16-member array composed of values | ||
2498 | ranging from 0-255. | ||
2499 | |||
2500 | vt.default_red= [VT] | ||
2501 | Format: <red0>,<red1>,<red2>,...,<red15> | ||
2502 | Change the default red palette of the console. | ||
2503 | This is a 16-member array composed of values | ||
2504 | ranging from 0-255. | ||
2505 | |||
2506 | vt.default_utf8= | ||
2507 | [VT] | ||
2508 | Format: <0|1> | ||
2509 | Set system-wide default UTF-8 mode for all tty's. | ||
2510 | Default is 1, i.e. UTF-8 mode is enabled for all | ||
2511 | newly opened terminals. | ||
2512 | |||
2543 | waveartist= [HW,OSS] | 2513 | waveartist= [HW,OSS] |
2544 | Format: <io>,<irq>,<dma>,<dma2> | 2514 | Format: <io>,<irq>,<dma>,<dma2> |
2545 | 2515 | ||
@@ -2552,6 +2522,10 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2552 | wdt= [WDT] Watchdog | 2522 | wdt= [WDT] Watchdog |
2553 | See Documentation/watchdog/wdt.txt. | 2523 | See Documentation/watchdog/wdt.txt. |
2554 | 2524 | ||
2525 | x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of | ||
2526 | default x2apic cluster mode on platforms | ||
2527 | supporting x2apic. | ||
2528 | |||
2555 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. | 2529 | xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. |
2556 | xd_geo= See header of drivers/block/xd.c. | 2530 | xd_geo= See header of drivers/block/xd.c. |
2557 | 2531 | ||
@@ -2559,9 +2533,6 @@ and is between 256 and 4096 characters. It is defined in the file | |||
2559 | Format: | 2533 | Format: |
2560 | <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] | 2534 | <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]] |
2561 | 2535 | ||
2562 | norandmaps Don't use address space randomization. Equivalent to | ||
2563 | echo 0 > /proc/sys/kernel/randomize_va_space | ||
2564 | |||
2565 | ______________________________________________________________________ | 2536 | ______________________________________________________________________ |
2566 | 2537 | ||
2567 | TODO: | 2538 | TODO: |
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 48b3de90eb1e..1e7a769a10f9 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt | |||
@@ -212,7 +212,9 @@ hit, Kprobes calls kp->pre_handler. After the probed instruction | |||
212 | is single-stepped, Kprobe calls kp->post_handler. If a fault | 212 | is single-stepped, Kprobe calls kp->post_handler. If a fault |
213 | occurs during execution of kp->pre_handler or kp->post_handler, | 213 | occurs during execution of kp->pre_handler or kp->post_handler, |
214 | or during single-stepping of the probed instruction, Kprobes calls | 214 | or during single-stepping of the probed instruction, Kprobes calls |
215 | kp->fault_handler. Any or all handlers can be NULL. | 215 | kp->fault_handler. Any or all handlers can be NULL. If kp->flags |
216 | is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled, | ||
217 | so its handlers aren't hit until enable_kprobe(kp) is called. | ||
216 | 218 | ||
217 | NOTE: | 219 | NOTE: |
218 | 1. With the introduction of the "symbol_name" field to struct kprobe, | 220 | 1. With the introduction of the "symbol_name" field to struct kprobe, |
@@ -363,6 +365,26 @@ probes) in the specified array, they clear the addr field of those | |||
363 | incorrect probes. However, other probes in the array are | 365 | incorrect probes. However, other probes in the array are |
364 | unregistered correctly. | 366 | unregistered correctly. |
365 | 367 | ||
368 | 4.7 disable_*probe | ||
369 | |||
370 | #include <linux/kprobes.h> | ||
371 | int disable_kprobe(struct kprobe *kp); | ||
372 | int disable_kretprobe(struct kretprobe *rp); | ||
373 | int disable_jprobe(struct jprobe *jp); | ||
374 | |||
375 | Temporarily disables the specified *probe. You can enable it again by using | ||
376 | enable_*probe(). You must specify the probe which has been registered. | ||
377 | |||
378 | 4.8 enable_*probe | ||
379 | |||
380 | #include <linux/kprobes.h> | ||
381 | int enable_kprobe(struct kprobe *kp); | ||
382 | int enable_kretprobe(struct kretprobe *rp); | ||
383 | int enable_jprobe(struct jprobe *jp); | ||
384 | |||
385 | Enables *probe which has been disabled by disable_*probe(). You must specify | ||
386 | the probe which has been registered. | ||
387 | |||
366 | 5. Kprobes Features and Limitations | 388 | 5. Kprobes Features and Limitations |
367 | 389 | ||
368 | Kprobes allows multiple probes at the same address. Currently, | 390 | Kprobes allows multiple probes at the same address. Currently, |
@@ -500,10 +522,14 @@ the probe. If the probed function belongs to a module, the module name | |||
500 | is also specified. Following columns show probe status. If the probe is on | 522 | is also specified. Following columns show probe status. If the probe is on |
501 | a virtual address that is no longer valid (module init sections, module | 523 | a virtual address that is no longer valid (module init sections, module |
502 | virtual addresses that correspond to modules that've been unloaded), | 524 | virtual addresses that correspond to modules that've been unloaded), |
503 | such probes are marked with [GONE]. | 525 | such probes are marked with [GONE]. If the probe is temporarily disabled, |
526 | such probes are marked with [DISABLED]. | ||
504 | 527 | ||
505 | /debug/kprobes/enabled: Turn kprobes ON/OFF | 528 | /debug/kprobes/enabled: Turn kprobes ON/OFF forcibly. |
506 | 529 | ||
507 | Provides a knob to globally turn registered kprobes ON or OFF. By default, | 530 | Provides a knob to globally and forcibly turn registered kprobes ON or OFF. |
508 | all kprobes are enabled. By echoing "0" to this file, all registered probes | 531 | By default, all kprobes are enabled. By echoing "0" to this file, all |
509 | will be disarmed, till such time a "1" is echoed to this file. | 532 | registered probes will be disarmed, till such time a "1" is echoed to this |
533 | file. Note that this knob just disarms and arms all kprobes and doesn't | ||
534 | change each probe's disabling state. This means that disabled kprobes (marked | ||
535 | [DISABLED]) will not be enabled if you turn ON all kprobes by this knob. | ||
diff --git a/Documentation/powerpc/booting-without-of.txt b/Documentation/powerpc/booting-without-of.txt index 0ab0230cbcb0..d16b7a1c3793 100644 --- a/Documentation/powerpc/booting-without-of.txt +++ b/Documentation/powerpc/booting-without-of.txt | |||
@@ -43,12 +43,11 @@ Table of Contents | |||
43 | 2) Representing devices without a current OF specification | 43 | 2) Representing devices without a current OF specification |
44 | a) PHY nodes | 44 | a) PHY nodes |
45 | b) Interrupt controllers | 45 | b) Interrupt controllers |
46 | c) CFI or JEDEC memory-mapped NOR flash | 46 | c) 4xx/Axon EMAC ethernet nodes |
47 | d) 4xx/Axon EMAC ethernet nodes | 47 | d) Xilinx IP cores |
48 | e) Xilinx IP cores | 48 | e) USB EHCI controllers |
49 | f) USB EHCI controllers | 49 | f) MDIO on GPIOs |
50 | g) MDIO on GPIOs | 50 | g) SPI busses |
51 | h) SPI busses | ||
52 | 51 | ||
53 | VII - Marvell Discovery mv64[345]6x System Controller chips | 52 | VII - Marvell Discovery mv64[345]6x System Controller chips |
54 | 1) The /system-controller node | 53 | 1) The /system-controller node |
@@ -999,7 +998,7 @@ compatibility. | |||
999 | translation of SOC addresses for memory mapped SOC registers. | 998 | translation of SOC addresses for memory mapped SOC registers. |
1000 | - bus-frequency: Contains the bus frequency for the SOC node. | 999 | - bus-frequency: Contains the bus frequency for the SOC node. |
1001 | Typically, the value of this field is filled in by the boot | 1000 | Typically, the value of this field is filled in by the boot |
1002 | loader. | 1001 | loader. |
1003 | 1002 | ||
1004 | 1003 | ||
1005 | Recommended properties: | 1004 | Recommended properties: |
@@ -1287,71 +1286,7 @@ platforms are moved over to use the flattened-device-tree model. | |||
1287 | device_type = "open-pic"; | 1286 | device_type = "open-pic"; |
1288 | }; | 1287 | }; |
1289 | 1288 | ||
1290 | c) CFI or JEDEC memory-mapped NOR flash | 1289 | c) 4xx/Axon EMAC ethernet nodes |
1291 | |||
1292 | Flash chips (Memory Technology Devices) are often used for solid state | ||
1293 | file systems on embedded devices. | ||
1294 | |||
1295 | - compatible : should contain the specific model of flash chip(s) | ||
1296 | used, if known, followed by either "cfi-flash" or "jedec-flash" | ||
1297 | - reg : Address range of the flash chip | ||
1298 | - bank-width : Width (in bytes) of the flash bank. Equal to the | ||
1299 | device width times the number of interleaved chips. | ||
1300 | - device-width : (optional) Width of a single flash chip. If | ||
1301 | omitted, assumed to be equal to 'bank-width'. | ||
1302 | - #address-cells, #size-cells : Must be present if the flash has | ||
1303 | sub-nodes representing partitions (see below). In this case | ||
1304 | both #address-cells and #size-cells must be equal to 1. | ||
1305 | |||
1306 | For JEDEC compatible devices, the following additional properties | ||
1307 | are defined: | ||
1308 | |||
1309 | - vendor-id : Contains the flash chip's vendor id (1 byte). | ||
1310 | - device-id : Contains the flash chip's device id (1 byte). | ||
1311 | |||
1312 | In addition to the information on the flash bank itself, the | ||
1313 | device tree may optionally contain additional information | ||
1314 | describing partitions of the flash address space. This can be | ||
1315 | used on platforms which have strong conventions about which | ||
1316 | portions of the flash are used for what purposes, but which don't | ||
1317 | use an on-flash partition table such as RedBoot. | ||
1318 | |||
1319 | Each partition is represented as a sub-node of the flash device. | ||
1320 | Each node's name represents the name of the corresponding | ||
1321 | partition of the flash device. | ||
1322 | |||
1323 | Flash partitions | ||
1324 | - reg : The partition's offset and size within the flash bank. | ||
1325 | - label : (optional) The label / name for this flash partition. | ||
1326 | If omitted, the label is taken from the node name (excluding | ||
1327 | the unit address). | ||
1328 | - read-only : (optional) This parameter, if present, is a hint to | ||
1329 | Linux that this flash partition should only be mounted | ||
1330 | read-only. This is usually used for flash partitions | ||
1331 | containing early-boot firmware images or data which should not | ||
1332 | be clobbered. | ||
1333 | |||
1334 | Example: | ||
1335 | |||
1336 | flash@ff000000 { | ||
1337 | compatible = "amd,am29lv128ml", "cfi-flash"; | ||
1338 | reg = <ff000000 01000000>; | ||
1339 | bank-width = <4>; | ||
1340 | device-width = <1>; | ||
1341 | #address-cells = <1>; | ||
1342 | #size-cells = <1>; | ||
1343 | fs@0 { | ||
1344 | label = "fs"; | ||
1345 | reg = <0 f80000>; | ||
1346 | }; | ||
1347 | firmware@f80000 { | ||
1348 | label ="firmware"; | ||
1349 | reg = <f80000 80000>; | ||
1350 | read-only; | ||
1351 | }; | ||
1352 | }; | ||
1353 | |||
1354 | d) 4xx/Axon EMAC ethernet nodes | ||
1355 | 1290 | ||
1356 | The EMAC ethernet controller in IBM and AMCC 4xx chips, and also | 1291 | The EMAC ethernet controller in IBM and AMCC 4xx chips, and also |
1357 | the Axon bridge. To operate this needs to interact with a ths | 1292 | the Axon bridge. To operate this needs to interact with a ths |
@@ -1499,7 +1434,7 @@ platforms are moved over to use the flattened-device-tree model. | |||
1499 | available. | 1434 | available. |
1500 | For Axon: 0x0000012a | 1435 | For Axon: 0x0000012a |
1501 | 1436 | ||
1502 | e) Xilinx IP cores | 1437 | d) Xilinx IP cores |
1503 | 1438 | ||
1504 | The Xilinx EDK toolchain ships with a set of IP cores (devices) for use | 1439 | The Xilinx EDK toolchain ships with a set of IP cores (devices) for use |
1505 | in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range | 1440 | in Xilinx Spartan and Virtex FPGAs. The devices cover the whole range |
@@ -1761,7 +1696,7 @@ platforms are moved over to use the flattened-device-tree model. | |||
1761 | listed above, nodes for these devices should include a phy-handle | 1696 | listed above, nodes for these devices should include a phy-handle |
1762 | property, and may include other common network device properties | 1697 | property, and may include other common network device properties |
1763 | like local-mac-address. | 1698 | like local-mac-address. |
1764 | 1699 | ||
1765 | iv) Xilinx Uartlite | 1700 | iv) Xilinx Uartlite |
1766 | 1701 | ||
1767 | Xilinx uartlite devices are simple fixed speed serial ports. | 1702 | Xilinx uartlite devices are simple fixed speed serial ports. |
@@ -1793,7 +1728,7 @@ platforms are moved over to use the flattened-device-tree model. | |||
1793 | - reg-offset : A value of 3 is required | 1728 | - reg-offset : A value of 3 is required |
1794 | - reg-shift : A value of 2 is required | 1729 | - reg-shift : A value of 2 is required |
1795 | 1730 | ||
1796 | f) USB EHCI controllers | 1731 | e) USB EHCI controllers |
1797 | 1732 | ||
1798 | Required properties: | 1733 | Required properties: |
1799 | - compatible : should be "usb-ehci". | 1734 | - compatible : should be "usb-ehci". |
@@ -1819,7 +1754,7 @@ platforms are moved over to use the flattened-device-tree model. | |||
1819 | big-endian; | 1754 | big-endian; |
1820 | }; | 1755 | }; |
1821 | 1756 | ||
1822 | g) MDIO on GPIOs | 1757 | f) MDIO on GPIOs |
1823 | 1758 | ||
1824 | Currently defined compatibles: | 1759 | Currently defined compatibles: |
1825 | - virtual,gpio-mdio | 1760 | - virtual,gpio-mdio |
@@ -1839,7 +1774,7 @@ platforms are moved over to use the flattened-device-tree model. | |||
1839 | &qe_pio_c 6>; | 1774 | &qe_pio_c 6>; |
1840 | }; | 1775 | }; |
1841 | 1776 | ||
1842 | h) SPI (Serial Peripheral Interface) busses | 1777 | g) SPI (Serial Peripheral Interface) busses |
1843 | 1778 | ||
1844 | SPI busses can be described with a node for the SPI master device | 1779 | SPI busses can be described with a node for the SPI master device |
1845 | and a set of child nodes for each SPI slave on the bus. For this | 1780 | and a set of child nodes for each SPI slave on the bus. For this |
diff --git a/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt index 84a04d5eb8e6..a48b2cadc7f0 100644 --- a/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt +++ b/Documentation/powerpc/dts-bindings/fsl/upm-nand.txt | |||
@@ -5,9 +5,21 @@ Required properties: | |||
5 | - reg : should specify localbus chip select and size used for the chip. | 5 | - reg : should specify localbus chip select and size used for the chip. |
6 | - fsl,upm-addr-offset : UPM pattern offset for the address latch. | 6 | - fsl,upm-addr-offset : UPM pattern offset for the address latch. |
7 | - fsl,upm-cmd-offset : UPM pattern offset for the command latch. | 7 | - fsl,upm-cmd-offset : UPM pattern offset for the command latch. |
8 | - gpios : may specify optional GPIO connected to the Ready-Not-Busy pin. | ||
9 | 8 | ||
10 | Example: | 9 | Optional properties: |
10 | - fsl,upm-wait-flags : add chip-dependent short delays after running the | ||
11 | UPM pattern (0x1), after writing a data byte (0x2) or after | ||
12 | writing out a buffer (0x4). | ||
13 | - fsl,upm-addr-line-cs-offsets : address offsets for multi-chip support. | ||
14 | The corresponding address lines are used to select the chip. | ||
15 | - gpios : may specify optional GPIOs connected to the Ready-Not-Busy pins | ||
16 | (R/B#). For multi-chip devices, "n" GPIO definitions are required | ||
17 | according to the number of chips. | ||
18 | - chip-delay : chip dependent delay for transferring data from array to | ||
19 | read registers (tR). Required if property "gpios" is not used | ||
20 | (R/B# pins not connected). | ||
21 | |||
22 | Examples: | ||
11 | 23 | ||
12 | upm@1,0 { | 24 | upm@1,0 { |
13 | compatible = "fsl,upm-nand"; | 25 | compatible = "fsl,upm-nand"; |
@@ -26,3 +38,26 @@ upm@1,0 { | |||
26 | }; | 38 | }; |
27 | }; | 39 | }; |
28 | }; | 40 | }; |
41 | |||
42 | upm@3,0 { | ||
43 | #address-cells = <0>; | ||
44 | #size-cells = <0>; | ||
45 | compatible = "tqc,tqm8548-upm-nand", "fsl,upm-nand"; | ||
46 | reg = <3 0x0 0x800>; | ||
47 | fsl,upm-addr-offset = <0x10>; | ||
48 | fsl,upm-cmd-offset = <0x08>; | ||
49 | /* Multi-chip NAND device */ | ||
50 | fsl,upm-addr-line-cs-offsets = <0x0 0x200>; | ||
51 | fsl,upm-wait-flags = <0x5>; | ||
52 | chip-delay = <25>; // in micro-seconds | ||
53 | |||
54 | nand@0 { | ||
55 | #address-cells = <1>; | ||
56 | #size-cells = <1>; | ||
57 | |||
58 | partition@0 { | ||
59 | label = "fs"; | ||
60 | reg = <0x00000000 0x10000000>; | ||
61 | }; | ||
62 | }; | ||
63 | }; | ||
diff --git a/Documentation/powerpc/dts-bindings/gpio/led.txt b/Documentation/powerpc/dts-bindings/gpio/led.txt index ff51f4c0fa9d..4fe14deedc0a 100644 --- a/Documentation/powerpc/dts-bindings/gpio/led.txt +++ b/Documentation/powerpc/dts-bindings/gpio/led.txt | |||
@@ -1,15 +1,43 @@ | |||
1 | LED connected to GPIO | 1 | LEDs connected to GPIO lines |
2 | 2 | ||
3 | Required properties: | 3 | Required properties: |
4 | - compatible : should be "gpio-led". | 4 | - compatible : should be "gpio-leds". |
5 | - label : (optional) the label for this LED. If omitted, the label is | 5 | |
6 | Each LED is represented as a sub-node of the gpio-leds device. Each | ||
7 | node's name represents the name of the corresponding LED. | ||
8 | |||
9 | LED sub-node properties: | ||
10 | - gpios : Should specify the LED's GPIO, see "Specifying GPIO information | ||
11 | for devices" in Documentation/powerpc/booting-without-of.txt. Active | ||
12 | low LEDs should be indicated using flags in the GPIO specifier. | ||
13 | - label : (optional) The label for this LED. If omitted, the label is | ||
6 | taken from the node name (excluding the unit address). | 14 | taken from the node name (excluding the unit address). |
7 | - gpios : should specify LED GPIO. | 15 | - linux,default-trigger : (optional) This parameter, if present, is a |
16 | string defining the trigger assigned to the LED. Current triggers are: | ||
17 | "backlight" - LED will act as a back-light, controlled by the framebuffer | ||
18 | system | ||
19 | "default-on" - LED will turn on | ||
20 | "heartbeat" - LED "double" flashes at a load average based rate | ||
21 | "ide-disk" - LED indicates disk activity | ||
22 | "timer" - LED flashes at a fixed, configurable rate | ||
8 | 23 | ||
9 | Example: | 24 | Examples: |
10 | 25 | ||
11 | led@0 { | 26 | leds { |
12 | compatible = "gpio-led"; | 27 | compatible = "gpio-leds"; |
13 | label = "hdd"; | 28 | hdd { |
14 | gpios = <&mcu_pio 0 1>; | 29 | label = "IDE Activity"; |
30 | gpios = <&mcu_pio 0 1>; /* Active low */ | ||
31 | linux,default-trigger = "ide-disk"; | ||
32 | }; | ||
15 | }; | 33 | }; |
34 | |||
35 | run-control { | ||
36 | compatible = "gpio-leds"; | ||
37 | red { | ||
38 | gpios = <&mpc8572 6 0>; | ||
39 | }; | ||
40 | green { | ||
41 | gpios = <&mpc8572 7 0>; | ||
42 | }; | ||
43 | }; | ||
diff --git a/Documentation/powerpc/dts-bindings/mtd-physmap.txt b/Documentation/powerpc/dts-bindings/mtd-physmap.txt new file mode 100644 index 000000000000..667c9bde8699 --- /dev/null +++ b/Documentation/powerpc/dts-bindings/mtd-physmap.txt | |||
@@ -0,0 +1,80 @@ | |||
1 | CFI or JEDEC memory-mapped NOR flash | ||
2 | |||
3 | Flash chips (Memory Technology Devices) are often used for solid state | ||
4 | file systems on embedded devices. | ||
5 | |||
6 | - compatible : should contain the specific model of flash chip(s) | ||
7 | used, if known, followed by either "cfi-flash" or "jedec-flash" | ||
8 | - reg : Address range(s) of the flash chip(s) | ||
9 | It's possible to (optionally) define multiple "reg" tuples so that | ||
10 | non-identical NOR chips can be described in one flash node. | ||
11 | - bank-width : Width (in bytes) of the flash bank. Equal to the | ||
12 | device width times the number of interleaved chips. | ||
13 | - device-width : (optional) Width of a single flash chip. If | ||
14 | omitted, assumed to be equal to 'bank-width'. | ||
15 | - #address-cells, #size-cells : Must be present if the flash has | ||
16 | sub-nodes representing partitions (see below). In this case | ||
17 | both #address-cells and #size-cells must be equal to 1. | ||
18 | |||
19 | For JEDEC compatible devices, the following additional properties | ||
20 | are defined: | ||
21 | |||
22 | - vendor-id : Contains the flash chip's vendor id (1 byte). | ||
23 | - device-id : Contains the flash chip's device id (1 byte). | ||
24 | |||
25 | In addition to the information on the flash bank itself, the | ||
26 | device tree may optionally contain additional information | ||
27 | describing partitions of the flash address space. This can be | ||
28 | used on platforms which have strong conventions about which | ||
29 | portions of the flash are used for what purposes, but which don't | ||
30 | use an on-flash partition table such as RedBoot. | ||
31 | |||
32 | Each partition is represented as a sub-node of the flash device. | ||
33 | Each node's name represents the name of the corresponding | ||
34 | partition of the flash device. | ||
35 | |||
36 | Flash partitions | ||
37 | - reg : The partition's offset and size within the flash bank. | ||
38 | - label : (optional) The label / name for this flash partition. | ||
39 | If omitted, the label is taken from the node name (excluding | ||
40 | the unit address). | ||
41 | - read-only : (optional) This parameter, if present, is a hint to | ||
42 | Linux that this flash partition should only be mounted | ||
43 | read-only. This is usually used for flash partitions | ||
44 | containing early-boot firmware images or data which should not | ||
45 | be clobbered. | ||
46 | |||
47 | Example: | ||
48 | |||
49 | flash@ff000000 { | ||
50 | compatible = "amd,am29lv128ml", "cfi-flash"; | ||
51 | reg = <ff000000 01000000>; | ||
52 | bank-width = <4>; | ||
53 | device-width = <1>; | ||
54 | #address-cells = <1>; | ||
55 | #size-cells = <1>; | ||
56 | fs@0 { | ||
57 | label = "fs"; | ||
58 | reg = <0 f80000>; | ||
59 | }; | ||
60 | firmware@f80000 { | ||
61 | label ="firmware"; | ||
62 | reg = <f80000 80000>; | ||
63 | read-only; | ||
64 | }; | ||
65 | }; | ||
66 | |||
67 | Here is an example with multiple "reg" tuples: | ||
68 | |||
69 | flash@f0000000,0 { | ||
70 | #address-cells = <1>; | ||
71 | #size-cells = <1>; | ||
72 | compatible = "intel,PC48F4400P0VB", "cfi-flash"; | ||
73 | reg = <0 0x00000000 0x02000000 | ||
74 | 0 0x02000000 0x02000000>; | ||
75 | bank-width = <2>; | ||
76 | partition@0 { | ||
77 | label = "test-part1"; | ||
78 | reg = <0 0x04000000>; | ||
79 | }; | ||
80 | }; | ||
diff --git a/Documentation/scsi/aacraid.txt b/Documentation/scsi/aacraid.txt index ddace3afc83b..30f643f611b2 100644 --- a/Documentation/scsi/aacraid.txt +++ b/Documentation/scsi/aacraid.txt | |||
@@ -60,17 +60,9 @@ Supported Cards/Chipsets | |||
60 | 9005:0285:9005:02d5 Adaptec ASR-2405 (Voodoo40 Lite) | 60 | 9005:0285:9005:02d5 Adaptec ASR-2405 (Voodoo40 Lite) |
61 | 9005:0285:9005:02d6 Adaptec ASR-2445 (Voodoo44 Lite) | 61 | 9005:0285:9005:02d6 Adaptec ASR-2445 (Voodoo44 Lite) |
62 | 9005:0285:9005:02d7 Adaptec ASR-2805 (Voodoo80 Lite) | 62 | 9005:0285:9005:02d7 Adaptec ASR-2805 (Voodoo80 Lite) |
63 | 9005:0285:9005:02d8 Adaptec 5405G (Voodoo40 PM) | 63 | 9005:0285:9005:02d8 Adaptec 5405Z (Voodoo40 BLBU) |
64 | 9005:0285:9005:02d9 Adaptec 5445G (Voodoo44 PM) | 64 | 9005:0285:9005:02d9 Adaptec 5445Z (Voodoo44 BLBU) |
65 | 9005:0285:9005:02da Adaptec 5805G (Voodoo80 PM) | 65 | 9005:0285:9005:02da Adaptec 5805Z (Voodoo80 BLBU) |
66 | 9005:0285:9005:02db Adaptec 5085G (Voodoo08 PM) | ||
67 | 9005:0285:9005:02dc Adaptec 51245G (Voodoo124 PM) | ||
68 | 9005:0285:9005:02dd Adaptec 51645G (Voodoo164 PM) | ||
69 | 9005:0285:9005:02de Adaptec 52445G (Voodoo244 PM) | ||
70 | 9005:0285:9005:02df Adaptec ASR-2045G (Voodoo04 Lite PM) | ||
71 | 9005:0285:9005:02e0 Adaptec ASR-2405G (Voodoo40 Lite PM) | ||
72 | 9005:0285:9005:02e1 Adaptec ASR-2445G (Voodoo44 Lite PM) | ||
73 | 9005:0285:9005:02e2 Adaptec ASR-2805G (Voodoo80 Lite PM) | ||
74 | 1011:0046:9005:0364 Adaptec 5400S (Mustang) | 66 | 1011:0046:9005:0364 Adaptec 5400S (Mustang) |
75 | 1011:0046:9005:0365 Adaptec 5400S (Mustang) | 67 | 1011:0046:9005:0365 Adaptec 5400S (Mustang) |
76 | 9005:0287:9005:0800 Adaptec Themisto (Jupiter) | 68 | 9005:0287:9005:0800 Adaptec Themisto (Jupiter) |
@@ -140,6 +132,7 @@ Deanna Bonds (non-DASD support, PAE fibs and 64 bit, | |||
140 | where fibs that go to the hardware are consistently called hw_fibs and | 132 | where fibs that go to the hardware are consistently called hw_fibs and |
141 | not just fibs like the name of the driver tracking structure) | 133 | not just fibs like the name of the driver tracking structure) |
142 | Mark Salyzyn <Mark_Salyzyn@adaptec.com> Fixed panic issues and added some new product ids for upcoming hbas. Performance tuning, card failover and bug mitigations. | 134 | Mark Salyzyn <Mark_Salyzyn@adaptec.com> Fixed panic issues and added some new product ids for upcoming hbas. Performance tuning, card failover and bug mitigations. |
135 | Achim Leubner <Achim_Leubner@adaptec.com> | ||
143 | 136 | ||
144 | Original Driver | 137 | Original Driver |
145 | ------------------------- | 138 | ------------------------- |
diff --git a/Documentation/sound/alsa/soc/jack.txt b/Documentation/sound/alsa/soc/jack.txt new file mode 100644 index 000000000000..fcf82a417293 --- /dev/null +++ b/Documentation/sound/alsa/soc/jack.txt | |||
@@ -0,0 +1,71 @@ | |||
1 | ASoC jack detection | ||
2 | =================== | ||
3 | |||
4 | ALSA has a standard API for representing physical jacks to user space, | ||
5 | the kernel side of which can be seen in include/sound/jack.h. ASoC | ||
6 | provides a version of this API adding two additional features: | ||
7 | |||
8 | - It allows more than one jack detection method to work together on one | ||
9 | user visible jack. In embedded systems it is common for multiple | ||
10 | to be present on a single jack but handled by separate bits of | ||
11 | hardware. | ||
12 | |||
13 | - Integration with DAPM, allowing DAPM endpoints to be updated | ||
14 | automatically based on the detected jack status (eg, turning off the | ||
15 | headphone outputs if no headphones are present). | ||
16 | |||
17 | This is done by splitting the jacks up into three things working | ||
18 | together: the jack itself represented by a struct snd_soc_jack, sets of | ||
19 | snd_soc_jack_pins representing DAPM endpoints to update and blocks of | ||
20 | code providing jack reporting mechanisms. | ||
21 | |||
22 | For example, a system may have a stereo headset jack with two reporting | ||
23 | mechanisms, one for the headphone and one for the microphone. Some | ||
24 | systems won't be able to use their speaker output while a headphone is | ||
25 | connected and so will want to make sure to update both speaker and | ||
26 | headphone when the headphone jack status changes. | ||
27 | |||
28 | The jack - struct snd_soc_jack | ||
29 | ============================== | ||
30 | |||
31 | This represents a physical jack on the system and is what is visible to | ||
32 | user space. The jack itself is completely passive, it is set up by the | ||
33 | machine driver and updated by jack detection methods. | ||
34 | |||
35 | Jacks are created by the machine driver calling snd_soc_jack_new(). | ||
36 | |||
37 | snd_soc_jack_pin | ||
38 | ================ | ||
39 | |||
40 | These represent a DAPM pin to update depending on some of the status | ||
41 | bits supported by the jack. Each snd_soc_jack has zero or more of these | ||
42 | which are updated automatically. They are created by the machine driver | ||
43 | and associated with the jack using snd_soc_jack_add_pins(). The status | ||
44 | of the endpoint may be configured to be the opposite of the jack status if | ||
45 | required (eg, enabling a built in microphone if a microphone is not | ||
46 | connected via a jack). | ||
47 | |||
48 | Jack detection methods | ||
49 | ====================== | ||
50 | |||
51 | Actual jack detection is done by code which is able to monitor some | ||
52 | input to the system and update a jack by calling snd_soc_jack_report(), | ||
53 | specifying a subset of bits to update. The jack detection code should | ||
54 | be set up by the machine driver, taking configuration for the jack to | ||
55 | update and the set of things to report when the jack is connected. | ||
56 | |||
57 | Often this is done based on the status of a GPIO - a handler for this is | ||
58 | provided by the snd_soc_jack_add_gpio() function. Other methods are | ||
59 | also available, for example integrated into CODECs. One example of | ||
60 | CODEC integrated jack detection can be seen in the WM8350 driver. | ||
61 | |||
62 | Each jack may have multiple reporting mechanisms, though it will need at | ||
63 | least one to be useful. | ||
64 | |||
65 | Machine drivers | ||
66 | =============== | ||
67 | |||
68 | These are all hooked together by the machine driver depending on the | ||
69 | system hardware. The machine driver will set up the snd_soc_jack and | ||
70 | the list of pins to update then set up one or more jack detection | ||
71 | mechanisms to update that jack based on their current status. | ||
diff --git a/Documentation/sparse.txt b/Documentation/sparse.txt index 42f43fa59f24..34c76a55bc04 100644 --- a/Documentation/sparse.txt +++ b/Documentation/sparse.txt | |||
@@ -42,6 +42,14 @@ sure that bitwise types don't get mixed up (little-endian vs big-endian | |||
42 | vs cpu-endian vs whatever), and there the constant "0" really _is_ | 42 | vs cpu-endian vs whatever), and there the constant "0" really _is_ |
43 | special. | 43 | special. |
44 | 44 | ||
45 | __bitwise__ - to be used for relatively compact stuff (gfp_t, etc.) that | ||
46 | is mostly warning-free and is supposed to stay that way. Warnings will | ||
47 | be generated without __CHECK_ENDIAN__. | ||
48 | |||
49 | __bitwise - noisy stuff; in particular, __le*/__be* are that. We really | ||
50 | don't want to drown in noise unless we'd explicitly asked for it. | ||
51 | |||
52 | |||
45 | Getting sparse | 53 | Getting sparse |
46 | ~~~~~~~~~~~~~~ | 54 | ~~~~~~~~~~~~~~ |
47 | 55 | ||
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index a34d55b65441..df38ef046f8d 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt | |||
@@ -95,7 +95,7 @@ of struct cmsghdr structures with appended data. | |||
95 | 95 | ||
96 | There is only one file in this directory. | 96 | There is only one file in this directory. |
97 | unix_dgram_qlen limits the max number of datagrams queued in Unix domain | 97 | unix_dgram_qlen limits the max number of datagrams queued in Unix domain |
98 | socket's buffer. It will not take effect unless PF_UNIX flag is spicified. | 98 | socket's buffer. It will not take effect unless PF_UNIX flag is specified. |
99 | 99 | ||
100 | 100 | ||
101 | 3. /proc/sys/net/ipv4 - IPV4 settings | 101 | 3. /proc/sys/net/ipv4 - IPV4 settings |
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3197fc83bc51..97c4b3284329 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -39,6 +39,8 @@ Currently, these files are in /proc/sys/vm: | |||
39 | - nr_hugepages | 39 | - nr_hugepages |
40 | - nr_overcommit_hugepages | 40 | - nr_overcommit_hugepages |
41 | - nr_pdflush_threads | 41 | - nr_pdflush_threads |
42 | - nr_pdflush_threads_min | ||
43 | - nr_pdflush_threads_max | ||
42 | - nr_trim_pages (only if CONFIG_MMU=n) | 44 | - nr_trim_pages (only if CONFIG_MMU=n) |
43 | - numa_zonelist_order | 45 | - numa_zonelist_order |
44 | - oom_dump_tasks | 46 | - oom_dump_tasks |
@@ -463,6 +465,32 @@ The default value is 0. | |||
463 | 465 | ||
464 | ============================================================== | 466 | ============================================================== |
465 | 467 | ||
468 | nr_pdflush_threads_min | ||
469 | |||
470 | This value controls the minimum number of pdflush threads. | ||
471 | |||
472 | At boot time, the kernel will create and maintain 'nr_pdflush_threads_min' | ||
473 | threads for the kernel's lifetime. | ||
474 | |||
475 | The default value is 2. The minimum value you can specify is 1, and | ||
476 | the maximum value is the current setting of 'nr_pdflush_threads_max'. | ||
477 | |||
478 | See 'nr_pdflush_threads_max' below for more information. | ||
479 | |||
480 | ============================================================== | ||
481 | |||
482 | nr_pdflush_threads_max | ||
483 | |||
484 | This value controls the maximum number of pdflush threads that can be | ||
485 | created. The pdflush algorithm will create a new pdflush thread (up to | ||
486 | this maximum) if no pdflush threads have been available for >= 1 second. | ||
487 | |||
488 | The default value is 8. The minimum value you can specify is the | ||
489 | current value of 'nr_pdflush_threads_min' and the | ||
490 | maximum is 1000. | ||
491 | |||
492 | ============================================================== | ||
493 | |||
466 | overcommit_memory: | 494 | overcommit_memory: |
467 | 495 | ||
468 | This value contains a flag that enables memory overcommitment. | 496 | This value contains a flag that enables memory overcommitment. |
diff --git a/Documentation/tomoyo.txt b/Documentation/tomoyo.txt new file mode 100644 index 000000000000..b3a232cae7f8 --- /dev/null +++ b/Documentation/tomoyo.txt | |||
@@ -0,0 +1,55 @@ | |||
1 | --- What is TOMOYO? --- | ||
2 | |||
3 | TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel. | ||
4 | |||
5 | LiveCD-based tutorials are available at | ||
6 | http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/ubuntu8.04-live/ | ||
7 | http://tomoyo.sourceforge.jp/en/1.6.x/1st-step/centos5-live/ . | ||
8 | Though these tutorials use non-LSM version of TOMOYO, they are useful for you | ||
9 | to know what TOMOYO is. | ||
10 | |||
11 | --- How to enable TOMOYO? --- | ||
12 | |||
13 | Build the kernel with CONFIG_SECURITY_TOMOYO=y and pass "security=tomoyo" on | ||
14 | kernel's command line. | ||
15 | |||
16 | Please see http://tomoyo.sourceforge.jp/en/2.2.x/ for details. | ||
17 | |||
18 | --- Where is documentation? --- | ||
19 | |||
20 | User <-> Kernel interface documentation is available at | ||
21 | http://tomoyo.sourceforge.jp/en/2.2.x/policy-reference.html . | ||
22 | |||
23 | Materials we prepared for seminars and symposiums are available at | ||
24 | http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 . | ||
25 | Below lists are chosen from three aspects. | ||
26 | |||
27 | What is TOMOYO? | ||
28 | TOMOYO Linux Overview | ||
29 | http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf | ||
30 | TOMOYO Linux: pragmatic and manageable security for Linux | ||
31 | http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf | ||
32 | TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box | ||
33 | http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf | ||
34 | |||
35 | What can TOMOYO do? | ||
36 | Deep inside TOMOYO Linux | ||
37 | http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf | ||
38 | The role of "pathname based access control" in security. | ||
39 | http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf | ||
40 | |||
41 | History of TOMOYO? | ||
42 | Realities of Mainlining | ||
43 | http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf | ||
44 | |||
45 | --- What is future plan? --- | ||
46 | |||
47 | We believe that inode based security and name based security are complementary | ||
48 | and both should be used together. But unfortunately, so far, we cannot enable | ||
49 | multiple LSM modules at the same time. We feel sorry that you have to give up | ||
50 | SELinux/SMACK/AppArmor etc. when you want to use TOMOYO. | ||
51 | |||
52 | We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM | ||
53 | version of TOMOYO, available at http://tomoyo.sourceforge.jp/en/1.6.x/ . | ||
54 | LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning | ||
55 | to port non-LSM version's functionalities to LSM versions. | ||
diff --git a/Documentation/ftrace.txt b/Documentation/trace/ftrace.txt index fd9a3e693813..fd9a3e693813 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/trace/ftrace.txt | |||
diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/trace/kmemtrace.txt index a956d9b7f943..a956d9b7f943 100644 --- a/Documentation/vm/kmemtrace.txt +++ b/Documentation/trace/kmemtrace.txt | |||
diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/trace/mmiotrace.txt index 5731c67abc55..5731c67abc55 100644 --- a/Documentation/tracers/mmiotrace.txt +++ b/Documentation/trace/mmiotrace.txt | |||
diff --git a/Documentation/tracepoints.txt b/Documentation/trace/tracepoints.txt index c0e1ceed75a4..c0e1ceed75a4 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/trace/tracepoints.txt | |||
diff --git a/Documentation/video4linux/pxa_camera.txt b/Documentation/video4linux/pxa_camera.txt new file mode 100644 index 000000000000..b1137f9a53eb --- /dev/null +++ b/Documentation/video4linux/pxa_camera.txt | |||
@@ -0,0 +1,125 @@ | |||
1 | PXA-Camera Host Driver | ||
2 | ====================== | ||
3 | |||
4 | Constraints | ||
5 | ----------- | ||
6 | a) Image size for YUV422P format | ||
7 | All YUV422P images are enforced to have width x height % 16 = 0. | ||
8 | This is due to DMA constraints, which transfers only planes of 8 byte | ||
9 | multiples. | ||
10 | |||
11 | |||
12 | Global video workflow | ||
13 | --------------------- | ||
14 | a) QCI stopped | ||
15 | Initially, the QCI interface is stopped. | ||
16 | When a buffer is queued (pxa_videobuf_ops->buf_queue), the QCI starts. | ||
17 | |||
18 | b) QCI started | ||
19 | More buffers can be queued while the QCI is started without halting the | ||
20 | capture. The new buffers are "appended" at the tail of the DMA chain, and | ||
21 | smoothly captured one frame after the other. | ||
22 | |||
23 | Once a buffer is filled in the QCI interface, it is marked as "DONE" and | ||
24 | removed from the active buffers list. It can be then requeued or dequeued by | ||
25 | userland application. | ||
26 | |||
27 | Once the last buffer is filled in, the QCI interface stops. | ||
28 | |||
29 | |||
30 | DMA usage | ||
31 | --------- | ||
32 | a) DMA flow | ||
33 | - first buffer queued for capture | ||
34 | Once a first buffer is queued for capture, the QCI is started, but data | ||
35 | transfer is not started. On "End Of Frame" interrupt, the irq handler | ||
36 | starts the DMA chain. | ||
37 | - capture of one videobuffer | ||
38 | The DMA chain starts transferring data into videobuffer RAM pages. | ||
39 | When all pages are transferred, the DMA irq is raised on "ENDINTR" status | ||
40 | - finishing one videobuffer | ||
41 | The DMA irq handler marks the videobuffer as "done", and removes it from | ||
42 | the active running queue | ||
43 | Meanwhile, the next videobuffer (if there is one), is transferred by DMA | ||
44 | - finishing the last videobuffer | ||
45 | On the DMA irq of the last videobuffer, the QCI is stopped. | ||
46 | |||
47 | b) DMA prepared buffer will have this structure | ||
48 | |||
49 | +------------+-----+---------------+-----------------+ | ||
50 | | desc-sg[0] | ... | desc-sg[last] | finisher/linker | | ||
51 | +------------+-----+---------------+-----------------+ | ||
52 | |||
53 | This structure is pointed to by dma->sg_cpu. | ||
54 | The descriptors are used as follows : | ||
55 | - desc-sg[i]: i-th descriptor, transferring the i-th sg | ||
56 | element to the video buffer scatter gather | ||
57 | - finisher: has ddadr=DDADR_STOP, dcmd=ENDIRQEN | ||
58 | - linker: has ddadr= desc-sg[0] of next video buffer, dcmd=0 | ||
59 | |||
60 | For the next schema, let's assume d0=desc-sg[0] .. dN=desc-sg[N], | ||
61 | "f" stands for finisher and "l" for linker. | ||
62 | A typical running chain is : | ||
63 | |||
64 | Videobuffer 1 Videobuffer 2 | ||
65 | +---------+----+---+ +----+----+----+---+ | ||
66 | | d0 | .. | dN | l | | d0 | .. | dN | f | | ||
67 | +---------+----+-|-+ ^----+----+----+---+ | ||
68 | | | | ||
69 | +----+ | ||
70 | |||
71 | After the chaining is finished, the chain looks like : | ||
72 | |||
73 | Videobuffer 1 Videobuffer 2 Videobuffer 3 | ||
74 | +---------+----+---+ +----+----+----+---+ +----+----+----+---+ | ||
75 | | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f | | ||
76 | +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+ | ||
77 | | | | | | ||
78 | +----+ +----+ | ||
79 | new_link | ||
80 | |||
81 | c) DMA hot chaining timeslice issue | ||
82 | |||
83 | As DMA chaining is done while DMA _is_ running, the linking may be done | ||
84 | while the DMA jumps from one Videobuffer to another. On the schema, that | ||
85 | would be a problem if the following sequence is encountered : | ||
86 | |||
87 | - DMA chain is Videobuffer1 + Videobuffer2 | ||
88 | - pxa_videobuf_queue() is called to queue Videobuffer3 | ||
89 | - DMA controller finishes Videobuffer2, and DMA stops | ||
90 | => | ||
91 | Videobuffer 1 Videobuffer 2 | ||
92 | +---------+----+---+ +----+----+----+---+ | ||
93 | | d0 | .. | dN | l | | d0 | .. | dN | f | | ||
94 | +---------+----+-|-+ ^----+----+----+-^-+ | ||
95 | | | | | ||
96 | +----+ +-- DMA DDADR loads DDADR_STOP | ||
97 | |||
98 | - pxa_dma_add_tail_buf() is called, the Videobuffer2 "finisher" is | ||
99 | replaced by a "linker" to Videobuffer3 (creation of new_link) | ||
100 | - pxa_videobuf_queue() finishes | ||
101 | - the DMA irq handler is called, which terminates Videobuffer2 | ||
102 | - Videobuffer3 capture is not scheduled on DMA chain (as it stopped !!!) | ||
103 | |||
104 | Videobuffer 1 Videobuffer 2 Videobuffer 3 | ||
105 | +---------+----+---+ +----+----+----+---+ +----+----+----+---+ | ||
106 | | d0 | .. | dN | l | | d0 | .. | dN | l | | d0 | .. | dN | f | | ||
107 | +---------+----+-|-+ ^----+----+----+-|-+ ^----+----+----+---+ | ||
108 | | | | | | ||
109 | +----+ +----+ | ||
110 | new_link | ||
111 | DMA DDADR still is DDADR_STOP | ||
112 | |||
113 | - pxa_camera_check_link_miss() is called | ||
114 | This checks if the DMA is finished and a buffer is still on the | ||
115 | pcdev->capture list. If that's the case, the capture will be restarted, | ||
116 | and Videobuffer3 is scheduled on DMA chain. | ||
117 | - the DMA irq handler finishes | ||
118 | |||
119 | Note: if DMA stops just after pxa_camera_check_link_miss() reads DDADR() | ||
120 | value, we have the guarantee that the DMA irq handler will be called back | ||
121 | when the DMA will finish the buffer, and pxa_camera_check_link_miss() will | ||
122 | be called again, to reschedule Videobuffer3. | ||
123 | |||
124 | -- | ||
125 | Author: Robert Jarzmik <robert.jarzmik@free.fr> | ||
diff --git a/Documentation/video4linux/v4l2-framework.txt b/Documentation/video4linux/v4l2-framework.txt index a31177390e55..854808b67fae 100644 --- a/Documentation/video4linux/v4l2-framework.txt +++ b/Documentation/video4linux/v4l2-framework.txt | |||
@@ -90,7 +90,7 @@ up before calling v4l2_device_register then it will be untouched. If dev is | |||
90 | NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register. | 90 | NULL, then you *must* setup v4l2_dev->name before calling v4l2_device_register. |
91 | 91 | ||
92 | The first 'dev' argument is normally the struct device pointer of a pci_dev, | 92 | The first 'dev' argument is normally the struct device pointer of a pci_dev, |
93 | usb_device or platform_device. It is rare for dev to be NULL, but it happens | 93 | usb_interface or platform_device. It is rare for dev to be NULL, but it happens |
94 | with ISA devices or when one device creates multiple PCI devices, thus making | 94 | with ISA devices or when one device creates multiple PCI devices, thus making |
95 | it impossible to associate v4l2_dev with a particular parent. | 95 | it impossible to associate v4l2_dev with a particular parent. |
96 | 96 | ||
@@ -351,17 +351,6 @@ And this to go from an i2c_client to a v4l2_subdev struct: | |||
351 | 351 | ||
352 | struct v4l2_subdev *sd = i2c_get_clientdata(client); | 352 | struct v4l2_subdev *sd = i2c_get_clientdata(client); |
353 | 353 | ||
354 | Finally you need to make a command function to make driver->command() | ||
355 | call the right subdev_ops functions: | ||
356 | |||
357 | static int subdev_command(struct i2c_client *client, unsigned cmd, void *arg) | ||
358 | { | ||
359 | return v4l2_subdev_command(i2c_get_clientdata(client), cmd, arg); | ||
360 | } | ||
361 | |||
362 | If driver->command is never used then you can leave this out. Eventually the | ||
363 | driver->command usage should be removed from v4l. | ||
364 | |||
365 | Make sure to call v4l2_device_unregister_subdev(sd) when the remove() callback | 354 | Make sure to call v4l2_device_unregister_subdev(sd) when the remove() callback |
366 | is called. This will unregister the sub-device from the bridge driver. It is | 355 | is called. This will unregister the sub-device from the bridge driver. It is |
367 | safe to call this even if the sub-device was never registered. | 356 | safe to call this even if the sub-device was never registered. |
@@ -375,14 +364,12 @@ from the remove() callback ensures that this is always done correctly. | |||
375 | 364 | ||
376 | The bridge driver also has some helper functions it can use: | 365 | The bridge driver also has some helper functions it can use: |
377 | 366 | ||
378 | struct v4l2_subdev *sd = v4l2_i2c_new_subdev(adapter, "module_foo", "chipid", 0x36); | 367 | struct v4l2_subdev *sd = v4l2_i2c_new_subdev(v4l2_dev, adapter, |
368 | "module_foo", "chipid", 0x36); | ||
379 | 369 | ||
380 | This loads the given module (can be NULL if no module needs to be loaded) and | 370 | This loads the given module (can be NULL if no module needs to be loaded) and |
381 | calls i2c_new_device() with the given i2c_adapter and chip/address arguments. | 371 | calls i2c_new_device() with the given i2c_adapter and chip/address arguments. |
382 | If all goes well, then it registers the subdev with the v4l2_device. It gets | 372 | If all goes well, then it registers the subdev with the v4l2_device. |
383 | the v4l2_device by calling i2c_get_adapdata(adapter), so you should make sure | ||
384 | to call i2c_set_adapdata(adapter, v4l2_device) when you setup the i2c_adapter | ||
385 | in your driver. | ||
386 | 373 | ||
387 | You can also use v4l2_i2c_new_probed_subdev() which is very similar to | 374 | You can also use v4l2_i2c_new_probed_subdev() which is very similar to |
388 | v4l2_i2c_new_subdev(), except that it has an array of possible I2C addresses | 375 | v4l2_i2c_new_subdev(), except that it has an array of possible I2C addresses |
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 2131b00b63f6..2f77ced35df7 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX | |||
@@ -1,5 +1,7 @@ | |||
1 | 00-INDEX | 1 | 00-INDEX |
2 | - this file. | 2 | - this file. |
3 | active_mm.txt | ||
4 | - An explanation from Linus about tsk->active_mm vs tsk->mm. | ||
3 | balance | 5 | balance |
4 | - various information on memory balancing. | 6 | - various information on memory balancing. |
5 | hugetlbpage.txt | 7 | hugetlbpage.txt |
diff --git a/Documentation/vm/active_mm.txt b/Documentation/vm/active_mm.txt new file mode 100644 index 000000000000..4ee1f643d897 --- /dev/null +++ b/Documentation/vm/active_mm.txt | |||
@@ -0,0 +1,83 @@ | |||
1 | List: linux-kernel | ||
2 | Subject: Re: active_mm | ||
3 | From: Linus Torvalds <torvalds () transmeta ! com> | ||
4 | Date: 1999-07-30 21:36:24 | ||
5 | |||
6 | Cc'd to linux-kernel, because I don't write explanations all that often, | ||
7 | and when I do I feel better about more people reading them. | ||
8 | |||
9 | On Fri, 30 Jul 1999, David Mosberger wrote: | ||
10 | > | ||
11 | > Is there a brief description someplace on how "mm" vs. "active_mm" in | ||
12 | > the task_struct are supposed to be used? (My apologies if this was | ||
13 | > discussed on the mailing lists---I just returned from vacation and | ||
14 | > wasn't able to follow linux-kernel for a while). | ||
15 | |||
16 | Basically, the new setup is: | ||
17 | |||
18 | - we have "real address spaces" and "anonymous address spaces". The | ||
19 | difference is that an anonymous address space doesn't care about the | ||
20 | user-level page tables at all, so when we do a context switch into an | ||
21 | anonymous address space we just leave the previous address space | ||
22 | active. | ||
23 | |||
24 | The obvious use for a "anonymous address space" is any thread that | ||
25 | doesn't need any user mappings - all kernel threads basically fall into | ||
26 | this category, but even "real" threads can temporarily say that for | ||
27 | some amount of time they are not going to be interested in user space, | ||
28 | and that the scheduler might as well try to avoid wasting time on | ||
29 | switching the VM state around. Currently only the old-style bdflush | ||
30 | sync does that. | ||
31 | |||
32 | - "tsk->mm" points to the "real address space". For an anonymous process, | ||
33 | tsk->mm will be NULL, for the logical reason that an anonymous process | ||
34 | really doesn't _have_ a real address space at all. | ||
35 | |||
36 | - however, we obviously need to keep track of which address space we | ||
37 | "stole" for such an anonymous user. For that, we have "tsk->active_mm", | ||
38 | which shows what the currently active address space is. | ||
39 | |||
40 | The rule is that for a process with a real address space (ie tsk->mm is | ||
41 | non-NULL) the active_mm obviously always has to be the same as the real | ||
42 | one. | ||
43 | |||
44 | For a anonymous process, tsk->mm == NULL, and tsk->active_mm is the | ||
45 | "borrowed" mm while the anonymous process is running. When the | ||
46 | anonymous process gets scheduled away, the borrowed address space is | ||
47 | returned and cleared. | ||
48 | |||
49 | To support all that, the "struct mm_struct" now has two counters: a | ||
50 | "mm_users" counter that is how many "real address space users" there are, | ||
51 | and a "mm_count" counter that is the number of "lazy" users (ie anonymous | ||
52 | users) plus one if there are any real users. | ||
53 | |||
54 | Usually there is at least one real user, but it could be that the real | ||
55 | user exited on another CPU while a lazy user was still active, so you do | ||
56 | actually get cases where you have a address space that is _only_ used by | ||
57 | lazy users. That is often a short-lived state, because once that thread | ||
58 | gets scheduled away in favour of a real thread, the "zombie" mm gets | ||
59 | released because "mm_users" becomes zero. | ||
60 | |||
61 | Also, a new rule is that _nobody_ ever has "init_mm" as a real MM any | ||
62 | more. "init_mm" should be considered just a "lazy context when no other | ||
63 | context is available", and in fact it is mainly used just at bootup when | ||
64 | no real VM has yet been created. So code that used to check | ||
65 | |||
66 | if (current->mm == &init_mm) | ||
67 | |||
68 | should generally just do | ||
69 | |||
70 | if (!current->mm) | ||
71 | |||
72 | instead (which makes more sense anyway - the test is basically one of "do | ||
73 | we have a user context", and is generally done by the page fault handler | ||
74 | and things like that). | ||
75 | |||
76 | Anyway, I put a pre-patch-2.3.13-1 on ftp.kernel.org just a moment ago, | ||
77 | because it slightly changes the interfaces to accomodate the alpha (who | ||
78 | would have thought it, but the alpha actually ends up having one of the | ||
79 | ugliest context switch codes - unlike the other architectures where the MM | ||
80 | and register state is separate, the alpha PALcode joins the two, and you | ||
81 | need to switch both together). | ||
82 | |||
83 | (From http://marc.info/?l=linux-kernel&m=93337278602211&w=2) | ||
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index 0706a7282a8c..2d70d0d95108 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt | |||
@@ -1,588 +1,691 @@ | |||
1 | 1 | ============================== | |
2 | This document describes the Linux memory management "Unevictable LRU" | 2 | UNEVICTABLE LRU INFRASTRUCTURE |
3 | infrastructure and the use of this infrastructure to manage several types | 3 | ============================== |
4 | of "unevictable" pages. The document attempts to provide the overall | 4 | |
5 | rationale behind this mechanism and the rationale for some of the design | 5 | ======== |
6 | decisions that drove the implementation. The latter design rationale is | 6 | CONTENTS |
7 | discussed in the context of an implementation description. Admittedly, one | 7 | ======== |
8 | can obtain the implementation details--the "what does it do?"--by reading the | 8 | |
9 | code. One hopes that the descriptions below add value by provide the answer | 9 | (*) The Unevictable LRU |
10 | to "why does it do that?". | 10 | |
11 | 11 | - The unevictable page list. | |
12 | Unevictable LRU Infrastructure: | 12 | - Memory control group interaction. |
13 | 13 | - Marking address spaces unevictable. | |
14 | The Unevictable LRU adds an additional LRU list to track unevictable pages | 14 | - Detecting Unevictable Pages. |
15 | and to hide these pages from vmscan. This mechanism is based on a patch by | 15 | - vmscan's handling of unevictable pages. |
16 | Larry Woodman of Red Hat to address several scalability problems with page | 16 | |
17 | (*) mlock()'d pages. | ||
18 | |||
19 | - History. | ||
20 | - Basic management. | ||
21 | - mlock()/mlockall() system call handling. | ||
22 | - Filtering special vmas. | ||
23 | - munlock()/munlockall() system call handling. | ||
24 | - Migrating mlocked pages. | ||
25 | - mmap(MAP_LOCKED) system call handling. | ||
26 | - munmap()/exit()/exec() system call handling. | ||
27 | - try_to_unmap(). | ||
28 | - try_to_munlock() reverse map scan. | ||
29 | - Page reclaim in shrink_*_list(). | ||
30 | |||
31 | |||
32 | ============ | ||
33 | INTRODUCTION | ||
34 | ============ | ||
35 | |||
36 | This document describes the Linux memory manager's "Unevictable LRU" | ||
37 | infrastructure and the use of this to manage several types of "unevictable" | ||
38 | pages. | ||
39 | |||
40 | The document attempts to provide the overall rationale behind this mechanism | ||
41 | and the rationale for some of the design decisions that drove the | ||
42 | implementation. The latter design rationale is discussed in the context of an | ||
43 | implementation description. Admittedly, one can obtain the implementation | ||
44 | details - the "what does it do?" - by reading the code. One hopes that the | ||
45 | descriptions below add value by providing the answer to "why does it do that?". | ||
46 | |||
47 | |||
48 | =================== | ||
49 | THE UNEVICTABLE LRU | ||
50 | =================== | ||
51 | |||
52 | The Unevictable LRU facility adds an additional LRU list to track unevictable | ||
53 | pages and to hide these pages from vmscan. This mechanism is based on a patch | ||
54 | by Larry Woodman of Red Hat to address several scalability problems with page | ||
17 | reclaim in Linux. The problems have been observed at customer sites on large | 55 | reclaim in Linux. The problems have been observed at customer sites on large |
18 | memory x86_64 systems. For example, a non-numal x86_64 platform with 128GB | 56 | memory x86_64 systems. |
19 | of main memory will have over 32 million 4k pages in a single zone. When a | 57 | |
20 | large fraction of these pages are not evictable for any reason [see below], | 58 | To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of |
21 | vmscan will spend a lot of time scanning the LRU lists looking for the small | 59 | main memory will have over 32 million 4k pages in a single zone. When a large |
22 | fraction of pages that are evictable. This can result in a situation where | 60 | fraction of these pages are not evictable for any reason [see below], vmscan |
23 | all cpus are spending 100% of their time in vmscan for hours or days on end, | 61 | will spend a lot of time scanning the LRU lists looking for the small fraction |
24 | with the system completely unresponsive. | 62 | of pages that are evictable. This can result in a situation where all CPUs are |
25 | 63 | spending 100% of their time in vmscan for hours or days on end, with the system | |
26 | The Unevictable LRU infrastructure addresses the following classes of | 64 | completely unresponsive. |
27 | unevictable pages: | 65 | |
28 | 66 | The unevictable list addresses the following classes of unevictable pages: | |
29 | + page owned by ramfs | 67 | |
30 | + page mapped into SHM_LOCKed shared memory regions | 68 | (*) Those owned by ramfs. |
31 | + page mapped into VM_LOCKED [mlock()ed] vmas | 69 | |
32 | 70 | (*) Those mapped into SHM_LOCK'd shared memory regions. | |
33 | The infrastructure might be able to handle other conditions that make pages | 71 | |
72 | (*) Those mapped into VM_LOCKED [mlock()ed] VMAs. | ||
73 | |||
74 | The infrastructure may also be able to handle other conditions that make pages | ||
34 | unevictable, either by definition or by circumstance, in the future. | 75 | unevictable, either by definition or by circumstance, in the future. |
35 | 76 | ||
36 | 77 | ||
37 | The Unevictable LRU List | 78 | THE UNEVICTABLE PAGE LIST |
79 | ------------------------- | ||
38 | 80 | ||
39 | The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list | 81 | The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list |
40 | called the "unevictable" list and an associated page flag, PG_unevictable, to | 82 | called the "unevictable" list and an associated page flag, PG_unevictable, to |
41 | indicate that the page is being managed on the unevictable list. The | 83 | indicate that the page is being managed on the unevictable list. |
42 | PG_unevictable flag is analogous to, and mutually exclusive with, the PG_active | 84 | |
43 | flag in that it indicates on which LRU list a page resides when PG_lru is set. | 85 | The PG_unevictable flag is analogous to, and mutually exclusive with, the |
44 | The unevictable LRU list is source configurable based on the UNEVICTABLE_LRU | 86 | PG_active flag in that it indicates on which LRU list a page resides when |
45 | Kconfig option. | 87 | PG_lru is set. The unevictable list is compile-time configurable based on the |
88 | UNEVICTABLE_LRU Kconfig option. | ||
46 | 89 | ||
47 | The Unevictable LRU infrastructure maintains unevictable pages on an additional | 90 | The Unevictable LRU infrastructure maintains unevictable pages on an additional |
48 | LRU list for a few reasons: | 91 | LRU list for a few reasons: |
49 | 92 | ||
50 | 1) We get to "treat unevictable pages just like we treat other pages in the | 93 | (1) We get to "treat unevictable pages just like we treat other pages in the |
51 | system, which means we get to use the same code to manipulate them, the | 94 | system - which means we get to use the same code to manipulate them, the |
52 | same code to isolate them (for migrate, etc.), the same code to keep track | 95 | same code to isolate them (for migrate, etc.), the same code to keep track |
53 | of the statistics, etc..." [Rik van Riel] | 96 | of the statistics, etc..." [Rik van Riel] |
97 | |||
98 | (2) We want to be able to migrate unevictable pages between nodes for memory | ||
99 | defragmentation, workload management and memory hotplug. The linux kernel | ||
100 | can only migrate pages that it can successfully isolate from the LRU | ||
101 | lists. If we were to maintain pages elsewhere than on an LRU-like list, | ||
102 | where they can be found by isolate_lru_page(), we would prevent their | ||
103 | migration, unless we reworked migration code to find the unevictable pages | ||
104 | itself. | ||
54 | 105 | ||
55 | 2) We want to be able to migrate unevictable pages between nodes--for memory | ||
56 | defragmentation, workload management and memory hotplug. The linux kernel | ||
57 | can only migrate pages that it can successfully isolate from the lru lists. | ||
58 | If we were to maintain pages elsewise than on an lru-like list, where they | ||
59 | can be found by isolate_lru_page(), we would prevent their migration, unless | ||
60 | we reworked migration code to find the unevictable pages. | ||
61 | 106 | ||
107 | The unevictable list does not differentiate between file-backed and anonymous, | ||
108 | swap-backed pages. This differentiation is only important while the pages are, | ||
109 | in fact, evictable. | ||
62 | 110 | ||
63 | The unevictable LRU list does not differentiate between file backed and swap | 111 | The unevictable list benefits from the "arrayification" of the per-zone LRU |
64 | backed [anon] pages. This differentiation is only important while the pages | 112 | lists and statistics originally proposed and posted by Christoph Lameter. |
65 | are, in fact, evictable. | ||
66 | 113 | ||
67 | The unevictable LRU list benefits from the "arrayification" of the per-zone | 114 | The unevictable list does not use the LRU pagevec mechanism. Rather, |
68 | LRU lists and statistics originally proposed and posted by Christoph Lameter. | 115 | unevictable pages are placed directly on the page's zone's unevictable list |
116 | under the zone lru_lock. This allows us to prevent the stranding of pages on | ||
117 | the unevictable list when one task has the page isolated from the LRU and other | ||
118 | tasks are changing the "evictability" state of the page. | ||
69 | 119 | ||
70 | The unevictable list does not use the lru pagevec mechanism. Rather, | ||
71 | unevictable pages are placed directly on the page's zone's unevictable | ||
72 | list under the zone lru_lock. The reason for this is to prevent stranding | ||
73 | of pages on the unevictable list when one task has the page isolated from the | ||
74 | lru and other tasks are changing the "evictability" state of the page. | ||
75 | 120 | ||
121 | MEMORY CONTROL GROUP INTERACTION | ||
122 | -------------------------------- | ||
76 | 123 | ||
77 | Unevictable LRU and Memory Controller Interaction | 124 | The unevictable LRU facility interacts with the memory control group [aka |
125 | memory controller; see Documentation/cgroups/memory.txt] by extending the | ||
126 | lru_list enum. | ||
127 | |||
128 | The memory controller data structure automatically gets a per-zone unevictable | ||
129 | list as a result of the "arrayification" of the per-zone LRU lists (one per | ||
130 | lru_list enum element). The memory controller tracks the movement of pages to | ||
131 | and from the unevictable list. | ||
78 | 132 | ||
79 | The memory controller data structure automatically gets a per zone unevictable | ||
80 | lru list as a result of the "arrayification" of the per-zone LRU lists. The | ||
81 | memory controller tracks the movement of pages to and from the unevictable list. | ||
82 | When a memory control group comes under memory pressure, the controller will | 133 | When a memory control group comes under memory pressure, the controller will |
83 | not attempt to reclaim pages on the unevictable list. This has a couple of | 134 | not attempt to reclaim pages on the unevictable list. This has a couple of |
84 | effects. Because the pages are "hidden" from reclaim on the unevictable list, | 135 | effects: |
85 | the reclaim process can be more efficient, dealing only with pages that have | 136 | |
86 | a chance of being reclaimed. On the other hand, if too many of the pages | 137 | (1) Because the pages are "hidden" from reclaim on the unevictable list, the |
87 | charged to the control group are unevictable, the evictable portion of the | 138 | reclaim process can be more efficient, dealing only with pages that have a |
88 | working set of the tasks in the control group may not fit into the available | 139 | chance of being reclaimed. |
89 | memory. This can cause the control group to thrash or to oom-kill tasks. | 140 | |
90 | 141 | (2) On the other hand, if too many of the pages charged to the control group | |
91 | 142 | are unevictable, the evictable portion of the working set of the tasks in | |
92 | Unevictable LRU: Detecting Unevictable Pages | 143 | the control group may not fit into the available memory. This can cause |
93 | 144 | the control group to thrash or to OOM-kill tasks. | |
94 | The function page_evictable(page, vma) in vmscan.c determines whether a | 145 | |
95 | page is evictable or not. For ramfs pages and pages in SHM_LOCKed regions, | 146 | |
96 | page_evictable() tests a new address space flag, AS_UNEVICTABLE, in the page's | 147 | MARKING ADDRESS SPACES UNEVICTABLE |
97 | address space using a wrapper function. Wrapper functions are used to set, | 148 | ---------------------------------- |
98 | clear and test the flag to reduce the requirement for #ifdef's throughout the | 149 | |
99 | source code. AS_UNEVICTABLE is set on ramfs inode/mapping when it is created. | 150 | For facilities such as ramfs none of the pages attached to the address space |
100 | This flag remains for the life of the inode. | 151 | may be evicted. To prevent eviction of any such pages, the AS_UNEVICTABLE |
101 | 152 | address space flag is provided, and this can be manipulated by a filesystem | |
102 | For shared memory regions, AS_UNEVICTABLE is set when an application | 153 | using a number of wrapper functions: |
103 | successfully SHM_LOCKs the region and is removed when the region is | 154 | |
104 | SHM_UNLOCKed. Note that shmctl(SHM_LOCK, ...) does not populate the page | 155 | (*) void mapping_set_unevictable(struct address_space *mapping); |
105 | tables for the region as does, for example, mlock(). So, we make no special | 156 | |
106 | effort to push any pages in the SHM_LOCKed region to the unevictable list. | 157 | Mark the address space as being completely unevictable. |
107 | Vmscan will do this when/if it encounters the pages during reclaim. On | 158 | |
108 | SHM_UNLOCK, shmctl() scans the pages in the region and "rescues" them from the | 159 | (*) void mapping_clear_unevictable(struct address_space *mapping); |
109 | unevictable list if no other condition keeps them unevictable. If a SHM_LOCKed | 160 | |
110 | region is destroyed, the pages are also "rescued" from the unevictable list in | 161 | Mark the address space as being evictable. |
111 | the process of freeing them. | 162 | |
112 | 163 | (*) int mapping_unevictable(struct address_space *mapping); | |
113 | page_evictable() detects mlock()ed pages by testing an additional page flag, | 164 | |
114 | PG_mlocked via the PageMlocked() wrapper. If the page is NOT mlocked, and a | 165 | Query the address space, and return true if it is completely |
115 | non-NULL vma is supplied, page_evictable() will check whether the vma is | 166 | unevictable. |
167 | |||
168 | These are currently used in two places in the kernel: | ||
169 | |||
170 | (1) By ramfs to mark the address spaces of its inodes when they are created, | ||
171 | and this mark remains for the life of the inode. | ||
172 | |||
173 | (2) By SYSV SHM to mark SHM_LOCK'd address spaces until SHM_UNLOCK is called. | ||
174 | |||
175 | Note that SHM_LOCK is not required to page in the locked pages if they're | ||
176 | swapped out; the application must touch the pages manually if it wants to | ||
177 | ensure they're in memory. | ||
178 | |||
179 | |||
180 | DETECTING UNEVICTABLE PAGES | ||
181 | --------------------------- | ||
182 | |||
183 | The function page_evictable() in vmscan.c determines whether a page is | ||
184 | evictable or not using the query function outlined above [see section "Marking | ||
185 | address spaces unevictable"] to check the AS_UNEVICTABLE flag. | ||
186 | |||
187 | For address spaces that are so marked after being populated (as SHM regions | ||
188 | might be), the lock action (eg: SHM_LOCK) can be lazy, and need not populate | ||
189 | the page tables for the region as does, for example, mlock(), nor need it make | ||
190 | any special effort to push any pages in the SHM_LOCK'd area to the unevictable | ||
191 | list. Instead, vmscan will do this if and when it encounters the pages during | ||
192 | a reclamation scan. | ||
193 | |||
194 | On an unlock action (such as SHM_UNLOCK), the unlocker (eg: shmctl()) must scan | ||
195 | the pages in the region and "rescue" them from the unevictable list if no other | ||
196 | condition is keeping them unevictable. If an unevictable region is destroyed, | ||
197 | the pages are also "rescued" from the unevictable list in the process of | ||
198 | freeing them. | ||
199 | |||
200 | page_evictable() also checks for mlocked pages by testing an additional page | ||
201 | flag, PG_mlocked (as wrapped by PageMlocked()). If the page is NOT mlocked, | ||
202 | and a non-NULL VMA is supplied, page_evictable() will check whether the VMA is | ||
116 | VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and | 203 | VM_LOCKED via is_mlocked_vma(). is_mlocked_vma() will SetPageMlocked() and |
117 | update the appropriate statistics if the vma is VM_LOCKED. This method allows | 204 | update the appropriate statistics if the vma is VM_LOCKED. This method allows |
118 | efficient "culling" of pages in the fault path that are being faulted in to | 205 | efficient "culling" of pages in the fault path that are being faulted in to |
119 | VM_LOCKED vmas. | 206 | VM_LOCKED VMAs. |
120 | 207 | ||
121 | 208 | ||
122 | Unevictable Pages and Vmscan [shrink_*_list()] | 209 | VMSCAN'S HANDLING OF UNEVICTABLE PAGES |
210 | -------------------------------------- | ||
123 | 211 | ||
124 | If unevictable pages are culled in the fault path, or moved to the unevictable | 212 | If unevictable pages are culled in the fault path, or moved to the unevictable |
125 | list at mlock() or mmap() time, vmscan will never encounter the pages until | 213 | list at mlock() or mmap() time, vmscan will not encounter the pages until they |
126 | they have become evictable again, for example, via munlock() and have been | 214 | have become evictable again (via munlock() for example) and have been "rescued" |
127 | "rescued" from the unevictable list. However, there may be situations where we | 215 | from the unevictable list. However, there may be situations where we decide, |
128 | decide, for the sake of expediency, to leave an unevictable page on one of the | 216 | for the sake of expediency, to leave an unevictable page on one of the regular |
129 | regular active/inactive LRU lists for vmscan to deal with. Vmscan checks for | 217 | active/inactive LRU lists for vmscan to deal with. vmscan checks for such |
130 | such pages in all of the shrink_{active|inactive|page}_list() functions and | 218 | pages in all of the shrink_{active|inactive|page}_list() functions and will |
131 | will "cull" such pages that it encounters--that is, it diverts those pages to | 219 | "cull" such pages that it encounters: that is, it diverts those pages to the |
132 | the unevictable list for the zone being scanned. | 220 | unevictable list for the zone being scanned. |
133 | 221 | ||
134 | There may be situations where a page is mapped into a VM_LOCKED vma, but the | 222 | There may be situations where a page is mapped into a VM_LOCKED VMA, but the |
135 | page is not marked as PageMlocked. Such pages will make it all the way to | 223 | page is not marked as PG_mlocked. Such pages will make it all the way to |
136 | shrink_page_list() where they will be detected when vmscan walks the reverse | 224 | shrink_page_list() where they will be detected when vmscan walks the reverse |
137 | map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, shrink_page_list() | 225 | map in try_to_unmap(). If try_to_unmap() returns SWAP_MLOCK, |
138 | will cull the page at that point. | 226 | shrink_page_list() will cull the page at that point. |
139 | 227 | ||
140 | To "cull" an unevictable page, vmscan simply puts the page back on the lru | 228 | To "cull" an unevictable page, vmscan simply puts the page back on the LRU list |
141 | list using putback_lru_page()--the inverse operation to isolate_lru_page()-- | 229 | using putback_lru_page() - the inverse operation to isolate_lru_page() - after |
142 | after dropping the page lock. Because the condition which makes the page | 230 | dropping the page lock. Because the condition which makes the page unevictable |
143 | unevictable may change once the page is unlocked, putback_lru_page() will | 231 | may change once the page is unlocked, putback_lru_page() will recheck the |
144 | recheck the unevictable state of a page that it places on the unevictable lru | 232 | unevictable state of a page that it places on the unevictable list. If the |
145 | list. If the page has become evictable, putback_lru_page() removes it from | 233 | page has become evictable, putback_lru_page() removes it from the list and |
146 | the list and retries, including the page_evictable() test. Because such a | 234 | retries, including the page_evictable() test. Because such a race is a rare |
147 | race is a rare event and movement of pages onto the unevictable list should be | 235 | event and movement of pages onto the unevictable list should be rare, these |
148 | rare, these extra evictability checks should not occur in the majority of calls | 236 | extra evictability checks should not occur in the majority of calls to |
149 | to putback_lru_page(). | 237 | putback_lru_page(). |
150 | 238 | ||
151 | 239 | ||
152 | Mlocked Page: Prior Work | 240 | ============= |
241 | MLOCKED PAGES | ||
242 | ============= | ||
153 | 243 | ||
154 | The "Unevictable Mlocked Pages" infrastructure is based on work originally | 244 | The unevictable page list is also useful for mlock(), in addition to ramfs and |
245 | SYSV SHM. Note that mlock() is only available in CONFIG_MMU=y situations; in | ||
246 | NOMMU situations, all mappings are effectively mlocked. | ||
247 | |||
248 | |||
249 | HISTORY | ||
250 | ------- | ||
251 | |||
252 | The "Unevictable mlocked Pages" infrastructure is based on work originally | ||
155 | posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". | 253 | posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU". |
156 | Nick posted his patch as an alternative to a patch posted by Christoph | 254 | Nick posted his patch as an alternative to a patch posted by Christoph Lameter |
157 | Lameter to achieve the same objective--hiding mlocked pages from vmscan. | 255 | to achieve the same objective: hiding mlocked pages from vmscan. |
158 | In Nick's patch, he used one of the struct page lru list link fields as a count | 256 | |
159 | of VM_LOCKED vmas that map the page. This use of the link field for a count | 257 | In Nick's patch, he used one of the struct page LRU list link fields as a count |
160 | prevented the management of the pages on an LRU list. Thus, mlocked pages were | 258 | of VM_LOCKED VMAs that map the page. This use of the link field for a count |
161 | not migratable as isolate_lru_page() could not find them and the lru list link | 259 | prevented the management of the pages on an LRU list, and thus mlocked pages |
162 | field was not available to the migration subsystem. Nick resolved this by | 260 | were not migratable as isolate_lru_page() could not find them, and the LRU list |
163 | putting mlocked pages back on the lru list before attempting to isolate them, | 261 | link field was not available to the migration subsystem. |
164 | thus abandoning the count of VM_LOCKED vmas. When Nick's patch was integrated | 262 | |
165 | with the Unevictable LRU work, the count was replaced by walking the reverse | 263 | Nick resolved this by putting mlocked pages back on the LRU list before |
166 | map to determine whether any VM_LOCKED vmas mapped the page. More on this | 264 | attempting to isolate them, thus abandoning the count of VM_LOCKED VMAs. When |
167 | below. | 265 | Nick's patch was integrated with the Unevictable LRU work, the count was |
168 | 266 | replaced by walking the reverse map to determine whether any VM_LOCKED VMAs | |
169 | 267 | mapped the page. More on this below. | |
170 | Mlocked Pages: Basic Management | 268 | |
171 | 269 | ||
172 | Mlocked pages--pages mapped into a VM_LOCKED vma--represent one class of | 270 | BASIC MANAGEMENT |
173 | unevictable pages. When such a page has been "noticed" by the memory | 271 | ---------------- |
174 | management subsystem, the page is marked with the PG_mlocked [PageMlocked()] | 272 | |
175 | flag. A PageMlocked() page will be placed on the unevictable LRU list when | 273 | mlocked pages - pages mapped into a VM_LOCKED VMA - are a class of unevictable |
176 | it is added to the LRU. Pages can be "noticed" by memory management in | 274 | pages. When such a page has been "noticed" by the memory management subsystem, |
177 | several places: | 275 | the page is marked with the PG_mlocked flag. This can be manipulated using the |
178 | 276 | PageMlocked() functions. | |
179 | 1) in the mlock()/mlockall() system call handlers. | 277 | |
180 | 2) in the mmap() system call handler when mmap()ing a region with the | 278 | A PG_mlocked page will be placed on the unevictable list when it is added to |
181 | MAP_LOCKED flag, or mmap()ing a region in a task that has called | 279 | the LRU. Such pages can be "noticed" by memory management in several places: |
182 | mlockall() with the MCL_FUTURE flag. Both of these conditions result | 280 | |
183 | in the VM_LOCKED flag being set for the vma. | 281 | (1) in the mlock()/mlockall() system call handlers; |
184 | 3) in the fault path, if mlocked pages are "culled" in the fault path, | 282 | |
185 | and when a VM_LOCKED stack segment is expanded. | 283 | (2) in the mmap() system call handler when mmapping a region with the |
186 | 4) as mentioned above, in vmscan:shrink_page_list() when attempting to | 284 | MAP_LOCKED flag; |
187 | reclaim a page in a VM_LOCKED vma via try_to_unmap(). | 285 | |
188 | 286 | (3) mmapping a region in a task that has called mlockall() with the MCL_FUTURE | |
189 | Mlocked pages become unlocked and rescued from the unevictable list when: | 287 | flag; |
190 | 288 | ||
191 | 1) mapped in a range unlocked via the munlock()/munlockall() system calls. | 289 | (4) in the fault path, if mlocked pages are "culled" in the fault path, |
192 | 2) munmapped() out of the last VM_LOCKED vma that maps the page, including | 290 | and when a VM_LOCKED stack segment is expanded; or |
193 | unmapping at task exit. | 291 | |
194 | 3) when the page is truncated from the last VM_LOCKED vma of an mmap()ed file. | 292 | (5) as mentioned above, in vmscan:shrink_page_list() when attempting to |
195 | 4) before a page is COWed in a VM_LOCKED vma. | 293 | reclaim a page in a VM_LOCKED VMA via try_to_unmap() |
196 | 294 | ||
197 | 295 | all of which result in the VM_LOCKED flag being set for the VMA if it doesn't | |
198 | Mlocked Pages: mlock()/mlockall() System Call Handling | 296 | already have it set. |
297 | |||
298 | mlocked pages become unlocked and rescued from the unevictable list when: | ||
299 | |||
300 | (1) mapped in a range unlocked via the munlock()/munlockall() system calls; | ||
301 | |||
302 | (2) munmap()'d out of the last VM_LOCKED VMA that maps the page, including | ||
303 | unmapping at task exit; | ||
304 | |||
305 | (3) when the page is truncated from the last VM_LOCKED VMA of an mmapped file; | ||
306 | or | ||
307 | |||
308 | (4) before a page is COW'd in a VM_LOCKED VMA. | ||
309 | |||
310 | |||
311 | mlock()/mlockall() SYSTEM CALL HANDLING | ||
312 | --------------------------------------- | ||
199 | 313 | ||
200 | Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup() | 314 | Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup() |
201 | for each vma in the range specified by the call. In the case of mlockall(), | 315 | for each VMA in the range specified by the call. In the case of mlockall(), |
202 | this is the entire active address space of the task. Note that mlock_fixup() | 316 | this is the entire active address space of the task. Note that mlock_fixup() |
203 | is used for both mlock()ing and munlock()ing a range of memory. A call to | 317 | is used for both mlocking and munlocking a range of memory. A call to mlock() |
204 | mlock() an already VM_LOCKED vma, or to munlock() a vma that is not VM_LOCKED | 318 | an already VM_LOCKED VMA, or to munlock() a VMA that is not VM_LOCKED is |
205 | is treated as a no-op--mlock_fixup() simply returns. | 319 | treated as a no-op, and mlock_fixup() simply returns. |
206 | 320 | ||
207 | If the vma passes some filtering described in "Mlocked Pages: Filtering Vmas" | 321 | If the VMA passes some filtering as described in "Filtering Special VMAs" |
208 | below, mlock_fixup() will attempt to merge the vma with its neighbors or split | 322 | below, mlock_fixup() will attempt to merge the VMA with its neighbors or split |
209 | off a subset of the vma if the range does not cover the entire vma. Once the | 323 | off a subset of the VMA if the range does not cover the entire VMA. Once the |
210 | vma has been merged or split or neither, mlock_fixup() will call | 324 | VMA has been merged or split or neither, mlock_fixup() will call |
211 | __mlock_vma_pages_range() to fault in the pages via get_user_pages() and | 325 | __mlock_vma_pages_range() to fault in the pages via get_user_pages() and to |
212 | to mark the pages as mlocked via mlock_vma_page(). | 326 | mark the pages as mlocked via mlock_vma_page(). |
213 | 327 | ||
214 | Note that the vma being mlocked might be mapped with PROT_NONE. In this case, | 328 | Note that the VMA being mlocked might be mapped with PROT_NONE. In this case, |
215 | get_user_pages() will be unable to fault in the pages. That's OK. If pages | 329 | get_user_pages() will be unable to fault in the pages. That's okay. If pages |
216 | do end up getting faulted into this VM_LOCKED vma, we'll handle them in the | 330 | do end up getting faulted into this VM_LOCKED VMA, we'll handle them in the |
217 | fault path or in vmscan. | 331 | fault path or in vmscan. |
218 | 332 | ||
219 | Also note that a page returned by get_user_pages() could be truncated or | 333 | Also note that a page returned by get_user_pages() could be truncated or |
220 | migrated out from under us, while we're trying to mlock it. To detect | 334 | migrated out from under us, while we're trying to mlock it. To detect this, |
221 | this, __mlock_vma_pages_range() tests the page_mapping after acquiring | 335 | __mlock_vma_pages_range() checks page_mapping() after acquiring the page lock. |
222 | the page lock. If the page is still associated with its mapping, we'll | 336 | If the page is still associated with its mapping, we'll go ahead and call |
223 | go ahead and call mlock_vma_page(). If the mapping is gone, we just | 337 | mlock_vma_page(). If the mapping is gone, we just unlock the page and move on. |
224 | unlock the page and move on. Worse case, this results in page mapped | 338 | In the worst case, this will result in a page mapped in a VM_LOCKED VMA |
225 | in a VM_LOCKED vma remaining on a normal LRU list without being | 339 | remaining on a normal LRU list without being PageMlocked(). Again, vmscan will |
226 | PageMlocked(). Again, vmscan will detect and cull such pages. | 340 | detect and cull such pages. |
227 | 341 | ||
228 | mlock_vma_page(), called with the page locked [N.B., not "mlocked"], will | 342 | mlock_vma_page() will call TestSetPageMlocked() for each page returned by |
229 | TestSetPageMlocked() for each page returned by get_user_pages(). We use | 343 | get_user_pages(). We use TestSetPageMlocked() because the page might already |
230 | TestSetPageMlocked() because the page might already be mlocked by another | 344 | be mlocked by another task/VMA and we don't want to do extra work. We |
231 | task/vma and we don't want to do extra work. We especially do not want to | 345 | especially do not want to count an mlocked page more than once in the |
232 | count an mlocked page more than once in the statistics. If the page was | 346 | statistics. If the page was already mlocked, mlock_vma_page() need do nothing |
233 | already mlocked, mlock_vma_page() is done. | 347 | more. |
234 | 348 | ||
235 | If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the | 349 | If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the |
236 | page from the LRU, as it is likely on the appropriate active or inactive list | 350 | page from the LRU, as it is likely on the appropriate active or inactive list |
237 | at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will | 351 | at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put |
238 | putback the page--putback_lru_page()--which will notice that the page is now | 352 | back the page - by calling putback_lru_page() - which will notice that the page |
239 | mlocked and divert the page to the zone's unevictable LRU list. If | 353 | is now mlocked and divert the page to the zone's unevictable list. If |
240 | mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle | 354 | mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle |
241 | it later if/when it attempts to reclaim the page. | 355 | it later if and when it attempts to reclaim the page. |
242 | 356 | ||
243 | 357 | ||
244 | Mlocked Pages: Filtering Special Vmas | 358 | FILTERING SPECIAL VMAS |
359 | ---------------------- | ||
245 | 360 | ||
246 | mlock_fixup() filters several classes of "special" vmas: | 361 | mlock_fixup() filters several classes of "special" VMAs: |
247 | 362 | ||
248 | 1) vmas with VM_IO|VM_PFNMAP set are skipped entirely. The pages behind | 363 | 1) VMAs with VM_IO or VM_PFNMAP set are skipped entirely. The pages behind |
249 | these mappings are inherently pinned, so we don't need to mark them as | 364 | these mappings are inherently pinned, so we don't need to mark them as |
250 | mlocked. In any case, most of the pages have no struct page in which to | 365 | mlocked. In any case, most of the pages have no struct page in which to so |
251 | so mark the page. Because of this, get_user_pages() will fail for these | 366 | mark the page. Because of this, get_user_pages() will fail for these VMAs, |
252 | vmas, so there is no sense in attempting to visit them. | 367 | so there is no sense in attempting to visit them. |
253 | 368 | ||
254 | 2) vmas mapping hugetlbfs pages are already effectively pinned into memory. | 369 | 2) VMAs mapping hugetlbfs pages are already effectively pinned into memory. We |
255 | We don't need nor want to mlock() these pages. However, to preserve the | 370 | neither need nor want to mlock() these pages. However, to preserve the |
256 | prior behavior of mlock()--before the unevictable/mlock changes-- | 371 | prior behavior of mlock() - before the unevictable/mlock changes - |
257 | mlock_fixup() will call make_pages_present() in the hugetlbfs vma range | 372 | mlock_fixup() will call make_pages_present() in the hugetlbfs VMA range to |
258 | to allocate the huge pages and populate the ptes. | 373 | allocate the huge pages and populate the ptes. |
259 | 374 | ||
260 | 3) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of | 375 | 3) VMAs with VM_DONTEXPAND or VM_RESERVED are generally userspace mappings of |
261 | kernel pages, such as the vdso page, relay channel pages, etc. These pages | 376 | kernel pages, such as the VDSO page, relay channel pages, etc. These pages |
262 | are inherently unevictable and are not managed on the LRU lists. | 377 | are inherently unevictable and are not managed on the LRU lists. |
263 | mlock_fixup() treats these vmas the same as hugetlbfs vmas. It calls | 378 | mlock_fixup() treats these VMAs the same as hugetlbfs VMAs. It calls |
264 | make_pages_present() to populate the ptes. | 379 | make_pages_present() to populate the ptes. |
265 | 380 | ||
266 | Note that for all of these special vmas, mlock_fixup() does not set the | 381 | Note that for all of these special VMAs, mlock_fixup() does not set the |
267 | VM_LOCKED flag. Therefore, we won't have to deal with them later during | 382 | VM_LOCKED flag. Therefore, we won't have to deal with them later during |
268 | munlock() or munmap()--for example, at task exit. Neither does mlock_fixup() | 383 | munlock(), munmap() or task exit. Neither does mlock_fixup() account these |
269 | account these vmas against the task's "locked_vm". | 384 | VMAs against the task's "locked_vm". |
270 | 385 | ||
271 | Mlocked Pages: Downgrading the Mmap Semaphore. | 386 | |
272 | 387 | munlock()/munlockall() SYSTEM CALL HANDLING | |
273 | mlock_fixup() must be called with the mmap semaphore held for write, because | 388 | ------------------------------------------- |
274 | it may have to merge or split vmas. However, mlocking a large region of | 389 | |
275 | memory can take a long time--especially if vmscan must reclaim pages to | 390 | The munlock() and munlockall() system calls are handled by the same functions - |
276 | satisfy the regions requirements. Faulting in a large region with the mmap | 391 | do_mlock[all]() - as the mlock() and mlockall() system calls with the unlock vs |
277 | semaphore held for write can hold off other faults on the address space, in | 392 | lock operation indicated by an argument. So, these system calls are also |
278 | the case of a multi-threaded task. It can also hold off scans of the task's | 393 | handled by mlock_fixup(). Again, if called for an already munlocked VMA, |
279 | address space via /proc. While testing under heavy load, it was observed that | 394 | mlock_fixup() simply returns. Because of the VMA filtering discussed above, |
280 | the ps(1) command could be held off for many minutes while a large segment was | 395 | VM_LOCKED will not be set in any "special" VMAs. So, these VMAs will be |
281 | mlock()ed down. | ||
282 | |||
283 | To address this issue, and to make the system more responsive during mlock()ing | ||
284 | of large segments, mlock_fixup() downgrades the mmap semaphore to read mode | ||
285 | during the call to __mlock_vma_pages_range(). This works fine. However, the | ||
286 | callers of mlock_fixup() expect the semaphore to be returned in write mode. | ||
287 | So, mlock_fixup() "upgrades" the semphore to write mode. Linux does not | ||
288 | support an atomic upgrade_sem() call, so mlock_fixup() must drop the semaphore | ||
289 | and reacquire it in write mode. In a multi-threaded task, it is possible for | ||
290 | the task memory map to change while the semaphore is dropped. Therefore, | ||
291 | mlock_fixup() looks up the vma at the range start address after reacquiring | ||
292 | the semaphore in write mode and verifies that it still covers the original | ||
293 | range. If not, mlock_fixup() returns an error [-EAGAIN]. All callers of | ||
294 | mlock_fixup() have been changed to deal with this new error condition. | ||
295 | |||
296 | Note: when munlocking a region, all of the pages should already be resident-- | ||
297 | unless we have racing threads mlocking() and munlocking() regions. So, | ||
298 | unlocking should not have to wait for page allocations nor faults of any kind. | ||
299 | Therefore mlock_fixup() does not downgrade the semaphore for munlock(). | ||
300 | |||
301 | |||
302 | Mlocked Pages: munlock()/munlockall() System Call Handling | ||
303 | |||
304 | The munlock() and munlockall() system calls are handled by the same functions-- | ||
305 | do_mlock[all]()--as the mlock() and mlockall() system calls with the unlock | ||
306 | vs lock operation indicated by an argument. So, these system calls are also | ||
307 | handled by mlock_fixup(). Again, if called for an already munlock()ed vma, | ||
308 | mlock_fixup() simply returns. Because of the vma filtering discussed above, | ||
309 | VM_LOCKED will not be set in any "special" vmas. So, these vmas will be | ||
310 | ignored for munlock. | 396 | ignored for munlock. |
311 | 397 | ||
312 | If the vma is VM_LOCKED, mlock_fixup() again attempts to merge or split off | 398 | If the VMA is VM_LOCKED, mlock_fixup() again attempts to merge or split off the |
313 | the specified range. The range is then munlocked via the function | 399 | specified range. The range is then munlocked via the function |
314 | __mlock_vma_pages_range()--the same function used to mlock a vma range-- | 400 | __mlock_vma_pages_range() - the same function used to mlock a VMA range - |
315 | passing a flag to indicate that munlock() is being performed. | 401 | passing a flag to indicate that munlock() is being performed. |
316 | 402 | ||
317 | Because the vma access protections could have been changed to PROT_NONE after | 403 | Because the VMA access protections could have been changed to PROT_NONE after |
318 | faulting in and mlocking pages, get_user_pages() was unreliable for visiting | 404 | faulting in and mlocking pages, get_user_pages() was unreliable for visiting |
319 | these pages for munlocking. Because we don't want to leave pages mlocked(), | 405 | these pages for munlocking. Because we don't want to leave pages mlocked, |
320 | get_user_pages() was enhanced to accept a flag to ignore the permissions when | 406 | get_user_pages() was enhanced to accept a flag to ignore the permissions when |
321 | fetching the pages--all of which should be resident as a result of previous | 407 | fetching the pages - all of which should be resident as a result of previous |
322 | mlock()ing. | 408 | mlocking. |
323 | 409 | ||
324 | For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling | 410 | For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling |
325 | munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked | 411 | munlock_vma_page(). munlock_vma_page() unconditionally clears the PG_mlocked |
326 | flag using TestClearPageMlocked(). As with mlock_vma_page(), munlock_vma_page() | 412 | flag using TestClearPageMlocked(). As with mlock_vma_page(), |
327 | uses the Test*PageMlocked() function to handle the case where the page might | 413 | munlock_vma_page() uses the Test*PageMlocked() function to handle the case where |
328 | have already been unlocked by another task. If the page was mlocked, | 414 | the page might have already been unlocked by another task. If the page was |
329 | munlock_vma_page() updates the zone statistics for the number of mlocked | 415 | mlocked, munlock_vma_page() updates the zone statistics for the number of |
330 | pages. Note, however, that at this point we haven't checked whether the page | 416 | mlocked pages. Note, however, that at this point we haven't checked whether |
331 | is mapped by other VM_LOCKED vmas. | 417 | the page is mapped by other VM_LOCKED VMAs. |
332 | 418 | ||
333 | We can't call try_to_munlock(), the function that walks the reverse map to check | 419 | We can't call try_to_munlock(), the function that walks the reverse map to |
334 | for other VM_LOCKED vmas, without first isolating the page from the LRU. | 420 | check for other VM_LOCKED VMAs, without first isolating the page from the LRU. |
335 | try_to_munlock() is a variant of try_to_unmap() and thus requires that the page | 421 | try_to_munlock() is a variant of try_to_unmap() and thus requires that the page |
336 | not be on an lru list. [More on these below.] However, the call to | 422 | not be on an LRU list [more on these below]. However, the call to |
337 | isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). | 423 | isolate_lru_page() could fail, in which case we couldn't try_to_munlock(). So, |
338 | So, we go ahead and clear PG_mlocked up front, as this might be the only chance | 424 | we go ahead and clear PG_mlocked up front, as this might be the only chance we |
339 | we have. If we can successfully isolate the page, we go ahead and | 425 | have. If we can successfully isolate the page, we go ahead and |
340 | try_to_munlock(), which will restore the PG_mlocked flag and update the zone | 426 | try_to_munlock(), which will restore the PG_mlocked flag and update the zone |
341 | page statistics if it finds another vma holding the page mlocked. If we fail | 427 | page statistics if it finds another VMA holding the page mlocked. If we fail |
342 | to isolate the page, we'll have left a potentially mlocked page on the LRU. | 428 | to isolate the page, we'll have left a potentially mlocked page on the LRU. |
343 | This is fine, because we'll catch it later when/if vmscan tries to reclaim the | 429 | This is fine, because we'll catch it later if and when vmscan tries to reclaim |
344 | page. This should be relatively rare. | 430 | the page. This should be relatively rare. |
345 | 431 | ||
346 | Mlocked Pages: Migrating Them... | 432 | |
347 | 433 | MIGRATING MLOCKED PAGES | |
348 | A page that is being migrated has been isolated from the lru lists and is | 434 | ----------------------- |
349 | held locked across unmapping of the page, updating the page's mapping | 435 | |
350 | [address_space] entry and copying the contents and state, until the | 436 | A page that is being migrated has been isolated from the LRU lists and is held |
351 | page table entry has been replaced with an entry that refers to the new | 437 | locked across unmapping of the page, updating the page's address space entry |
352 | page. Linux supports migration of mlocked pages and other unevictable | 438 | and copying the contents and state, until the page table entry has been |
353 | pages. This involves simply moving the PageMlocked and PageUnevictable states | 439 | replaced with an entry that refers to the new page. Linux supports migration |
354 | from the old page to the new page. | 440 | of mlocked pages and other unevictable pages. This involves simply moving the |
355 | 441 | PG_mlocked and PG_unevictable states from the old page to the new page. | |
356 | Note that page migration can race with mlocking or munlocking of the same | 442 | |
357 | page. This has been discussed from the mlock/munlock perspective in the | 443 | Note that page migration can race with mlocking or munlocking of the same page. |
358 | respective sections above. Both processes [migration, m[un]locking], hold | 444 | This has been discussed from the mlock/munlock perspective in the respective |
359 | the page locked. This provides the first level of synchronization. Page | 445 | sections above. Both processes (migration and m[un]locking) hold the page |
360 | migration zeros out the page_mapping of the old page before unlocking it, | 446 | locked. This provides the first level of synchronization. Page migration |
361 | so m[un]lock can skip these pages by testing the page mapping under page | 447 | zeros out the page_mapping of the old page before unlocking it, so m[un]lock |
362 | lock. | 448 | can skip these pages by testing the page mapping under page lock. |
363 | 449 | ||
364 | When completing page migration, we place the new and old pages back onto the | 450 | To complete page migration, we place the new and old pages back onto the LRU |
365 | lru after dropping the page lock. The "unneeded" page--old page on success, | 451 | after dropping the page lock. The "unneeded" page - old page on success, new |
366 | new page on failure--will be freed when the reference count held by the | 452 | page on failure - will be freed when the reference count held by the migration |
367 | migration process is released. To ensure that we don't strand pages on the | 453 | process is released. To ensure that we don't strand pages on the unevictable |
368 | unevictable list because of a race between munlock and migration, page | 454 | list because of a race between munlock and migration, page migration uses the |
369 | migration uses the putback_lru_page() function to add migrated pages back to | 455 | putback_lru_page() function to add migrated pages back to the LRU. |
370 | the lru. | 456 | |
371 | 457 | ||
372 | 458 | mmap(MAP_LOCKED) SYSTEM CALL HANDLING | |
373 | Mlocked Pages: mmap(MAP_LOCKED) System Call Handling | 459 | ------------------------------------- |
374 | 460 | ||
375 | In addition to the mlock()/mlockall() system calls, an application can request | 461 | In addition to the mlock()/mlockall() system calls, an application can request |
376 | that a region of memory be mlocked using the MAP_LOCKED flag with the mmap() | 462 | that a region of memory be mlocked by supplying the MAP_LOCKED flag to the mmap() |
377 | call. Furthermore, any mmap() call or brk() call that expands the heap by a | 463 | call. Furthermore, any mmap() call or brk() call that expands the heap by a |
378 | task that has previously called mlockall() with the MCL_FUTURE flag will result | 464 | task that has previously called mlockall() with the MCL_FUTURE flag will result |
379 | in the newly mapped memory being mlocked. Before the unevictable/mlock changes, | 465 | in the newly mapped memory being mlocked. Before the unevictable/mlock |
380 | the kernel simply called make_pages_present() to allocate pages and populate | 466 | changes, the kernel simply called make_pages_present() to allocate pages and |
381 | the page table. | 467 | populate the page table. |
382 | 468 | ||
383 | To mlock a range of memory under the unevictable/mlock infrastructure, the | 469 | To mlock a range of memory under the unevictable/mlock infrastructure, the |
384 | mmap() handler and task address space expansion functions call | 470 | mmap() handler and task address space expansion functions call |
385 | mlock_vma_pages_range() specifying the vma and the address range to mlock. | 471 | mlock_vma_pages_range() specifying the vma and the address range to mlock. |
386 | mlock_vma_pages_range() filters vmas like mlock_fixup(), as described above in | 472 | mlock_vma_pages_range() filters VMAs like mlock_fixup(), as described above in |
387 | "Mlocked Pages: Filtering Vmas". It will clear the VM_LOCKED flag, which will | 473 | "Filtering Special VMAs". It will clear the VM_LOCKED flag, which will have |
388 | have already been set by the caller, in filtered vmas. Thus these vmas need | 474 | already been set by the caller, in filtered VMAs. Thus these VMAs need not be |
389 | not be visited for munlock when the region is unmapped. | 475 | visited for munlock when the region is unmapped. |
390 | 476 | ||
391 | For "normal" vmas, mlock_vma_pages_range() calls __mlock_vma_pages_range() to | 477 | For "normal" VMAs, mlock_vma_pages_range() calls __mlock_vma_pages_range() to |
392 | fault/allocate the pages and mlock them. Again, like mlock_fixup(), | 478 | fault/allocate the pages and mlock them. Again, like mlock_fixup(), |
393 | mlock_vma_pages_range() downgrades the mmap semaphore to read mode before | 479 | mlock_vma_pages_range() downgrades the mmap semaphore to read mode before |
394 | attempting to fault/allocate and mlock the pages; and "upgrades" the semaphore | 480 | attempting to fault/allocate and mlock the pages and "upgrades" the semaphore |
395 | back to write mode before returning. | 481 | back to write mode before returning. |
396 | 482 | ||
397 | The callers of mlock_vma_pages_range() will have already added the memory | 483 | The callers of mlock_vma_pages_range() will have already added the memory range |
398 | range to be mlocked to the task's "locked_vm". To account for filtered vmas, | 484 | to be mlocked to the task's "locked_vm". To account for filtered VMAs, |
399 | mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the | 485 | mlock_vma_pages_range() returns the number of pages NOT mlocked. All of the |
400 | callers then subtract a non-negative return value from the task's locked_vm. | 486 | callers then subtract a non-negative return value from the task's locked_vm. A |
401 | A negative return value represents an error--for example, from get_user_pages() | 487 | negative return value represents an error - for example, from get_user_pages() |
402 | attempting to fault in a vma with PROT_NONE access. In this case, we leave | 488 | attempting to fault in a VMA with PROT_NONE access. In this case, we leave the |
403 | the memory range accounted as locked_vm, as the protections could be changed | 489 | memory range accounted as locked_vm, as the protections could be changed later |
404 | later and pages allocated into that region. | 490 | and pages allocated into that region. |
405 | 491 | ||
406 | 492 | ||
407 | Mlocked Pages: munmap()/exit()/exec() System Call Handling | 493 | munmap()/exit()/exec() SYSTEM CALL HANDLING |
494 | ------------------------------------------- | ||
408 | 495 | ||
409 | When unmapping an mlocked region of memory, whether by an explicit call to | 496 | When unmapping an mlocked region of memory, whether by an explicit call to |
410 | munmap() or via an internal unmap from exit() or exec() processing, we must | 497 | munmap() or via an internal unmap from exit() or exec() processing, we must |
411 | munlock the pages if we're removing the last VM_LOCKED vma that maps the pages. | 498 | munlock the pages if we're removing the last VM_LOCKED VMA that maps the pages. |
412 | Before the unevictable/mlock changes, mlocking did not mark the pages in any | 499 | Before the unevictable/mlock changes, mlocking did not mark the pages in any |
413 | way, so unmapping them required no processing. | 500 | way, so unmapping them required no processing. |
414 | 501 | ||
415 | To munlock a range of memory under the unevictable/mlock infrastructure, the | 502 | To munlock a range of memory under the unevictable/mlock infrastructure, the |
416 | munmap() hander and task address space tear down function call | 503 | munmap() handler and task address space tear down function call |
417 | munlock_vma_pages_all(). The name reflects the observation that one always | 504 | munlock_vma_pages_all(). The name reflects the observation that one always |
418 | specifies the entire vma range when munlock()ing during unmap of a region. | 505 | specifies the entire VMA range when munlock()ing during unmap of a region. |
419 | Because of the vma filtering when mlocking() regions, only "normal" vmas that | 506 | Because of the VMA filtering when mlocking() regions, only "normal" VMAs that |
420 | actually contain mlocked pages will be passed to munlock_vma_pages_all(). | 507 | actually contain mlocked pages will be passed to munlock_vma_pages_all(). |
421 | 508 | ||
422 | munlock_vma_pages_all() clears the VM_LOCKED vma flag and, like mlock_fixup() | 509 | munlock_vma_pages_all() clears the VM_LOCKED VMA flag and, like mlock_fixup() |
423 | for the munlock case, calls __munlock_vma_pages_range() to walk the page table | 510 | for the munlock case, calls __munlock_vma_pages_range() to walk the page table |
424 | for the vma's memory range and munlock_vma_page() each resident page mapped by | 511 | for the VMA's memory range and munlock_vma_page() each resident page mapped by |
425 | the vma. This effectively munlocks the page, only if this is the last | 512 | the VMA. This effectively munlocks the page, only if this is the last |
426 | VM_LOCKED vma that maps the page. | 513 | VM_LOCKED VMA that maps the page. |
427 | |||
428 | 514 | ||
429 | Mlocked Page: try_to_unmap() | ||
430 | 515 | ||
431 | [Note: the code changes represented by this section are really quite small | 516 | try_to_unmap() |
432 | compared to the text to describe what happening and why, and to discuss the | 517 | -------------- |
433 | implications.] | ||
434 | 518 | ||
435 | Pages can, of course, be mapped into multiple vmas. Some of these vmas may | 519 | Pages can, of course, be mapped into multiple VMAs. Some of these VMAs may |
436 | have VM_LOCKED flag set. It is possible for a page mapped into one or more | 520 | have VM_LOCKED flag set. It is possible for a page mapped into one or more |
437 | VM_LOCKED vmas not to have the PG_mlocked flag set and therefore reside on one | 521 | VM_LOCKED VMAs not to have the PG_mlocked flag set and therefore reside on one |
438 | of the active or inactive LRU lists. This could happen if, for example, a | 522 | of the active or inactive LRU lists. This could happen if, for example, a task |
439 | task in the process of munlock()ing the page could not isolate the page from | 523 | in the process of munlocking the page could not isolate the page from the LRU. |
440 | the LRU. As a result, vmscan/shrink_page_list() might encounter such a page | 524 | As a result, vmscan/shrink_page_list() might encounter such a page as described |
441 | as described in "Unevictable Pages and Vmscan [shrink_*_list()]". To | 525 | in section "vmscan's handling of unevictable pages". To handle this situation, |
442 | handle this situation, try_to_unmap() has been enhanced to check for VM_LOCKED | 526 | try_to_unmap() checks for VM_LOCKED VMAs while it is walking a page's reverse |
443 | vmas while it is walking a page's reverse map. | 527 | map. |
444 | 528 | ||
445 | try_to_unmap() is always called, by either vmscan for reclaim or for page | 529 | try_to_unmap() is always called, by either vmscan for reclaim or for page |
446 | migration, with the argument page locked and isolated from the LRU. BUG_ON() | 530 | migration, with the argument page locked and isolated from the LRU. Separate |
447 | assertions enforce this requirement. Separate functions handle anonymous and | 531 | functions handle anonymous and mapped file pages, as these types of pages have |
448 | mapped file pages, as these types of pages have different reverse map | 532 | different reverse map mechanisms. |
449 | mechanisms. | 533 | |
450 | 534 | (*) try_to_unmap_anon() | |
451 | try_to_unmap_anon() | 535 | |
452 | 536 | To unmap anonymous pages, each VMA in the list anchored in the anon_vma | |
453 | To unmap anonymous pages, each vma in the list anchored in the anon_vma must be | 537 | must be visited - at least until a VM_LOCKED VMA is encountered. If the |
454 | visited--at least until a VM_LOCKED vma is encountered. If the page is being | 538 | page is being unmapped for migration, VM_LOCKED VMAs do not stop the |
455 | unmapped for migration, VM_LOCKED vmas do not stop the process because mlocked | 539 | process because mlocked pages are migratable. However, for reclaim, if |
456 | pages are migratable. However, for reclaim, if the page is mapped into a | 540 | the page is mapped into a VM_LOCKED VMA, the scan stops. |
457 | VM_LOCKED vma, the scan stops. try_to_unmap() attempts to acquire the mmap | 541 | |
458 | semaphore of the mm_struct to which the vma belongs in read mode. If this is | 542 | try_to_unmap_anon() attempts to acquire in read mode the mmap semaphore of |
459 | successful, try_to_unmap() will mlock the page via mlock_vma_page()--we | 543 | the mm_struct to which the VMA belongs. If this is successful, it will |
460 | wouldn't have gotten to try_to_unmap() if the page were already mlocked--and | 544 | mlock the page via mlock_vma_page() - we wouldn't have gotten to |
461 | will return SWAP_MLOCK, indicating that the page is unevictable. If the | 545 | try_to_unmap_anon() if the page were already mlocked - and will return |
462 | mmap semaphore cannot be acquired, we are not sure whether the page is really | 546 | SWAP_MLOCK, indicating that the page is unevictable. |
463 | unevictable or not. In this case, try_to_unmap() will return SWAP_AGAIN. | 547 | |
464 | 548 | If the mmap semaphore cannot be acquired, we are not sure whether the page | |
465 | try_to_unmap_file() -- linear mappings | 549 | is really unevictable or not. In this case, try_to_unmap_anon() will |
466 | 550 | return SWAP_AGAIN. | |
467 | Unmapping of a mapped file page works the same, except that the scan visits | 551 | |
468 | all vmas that maps the page's index/page offset in the page's mapping's | 552 | (*) try_to_unmap_file() - linear mappings |
469 | reverse map priority search tree. It must also visit each vma in the page's | 553 | |
470 | mapping's non-linear list, if the list is non-empty. As for anonymous pages, | 554 | Unmapping of a mapped file page works the same as for anonymous mappings, |
471 | on encountering a VM_LOCKED vma for a mapped file page, try_to_unmap() will | 555 | except that the scan visits all VMAs that map the page's index/page offset |
472 | attempt to acquire the associated mm_struct's mmap semaphore to mlock the page, | 556 | in the page's mapping's reverse map priority search tree. It also visits |
473 | returning SWAP_MLOCK if this is successful, and SWAP_AGAIN, if not. | 557 | each VMA in the page's mapping's non-linear list, if the list is |
474 | 558 | non-empty. | |
475 | try_to_unmap_file() -- non-linear mappings | 559 | |
476 | 560 | As for anonymous pages, on encountering a VM_LOCKED VMA for a mapped file | |
477 | If a page's mapping contains a non-empty non-linear mapping vma list, then | 561 | page, try_to_unmap_file() will attempt to acquire the associated |
478 | try_to_un{map|lock}() must also visit each vma in that list to determine | 562 | mm_struct's mmap semaphore to mlock the page, returning SWAP_MLOCK if this |
479 | whether the page is mapped in a VM_LOCKED vma. Again, the scan must visit | 563 | is successful, and SWAP_AGAIN, if not. |
480 | all vmas in the non-linear list to ensure that the page is not/should not be | 564 | |
481 | mlocked. If a VM_LOCKED vma is found in the list, the scan could terminate. | 565 | (*) try_to_unmap_file() - non-linear mappings |
482 | However, there is no easy way to determine whether the page is actually mapped | 566 | |
483 | in a given vma--either for unmapping or testing whether the VM_LOCKED vma | 567 | If a page's mapping contains a non-empty non-linear mapping VMA list, then |
484 | actually pins the page. | 568 | try_to_un{map|lock}() must also visit each VMA in that list to determine |
485 | 569 | whether the page is mapped in a VM_LOCKED VMA. Again, the scan must visit | |
486 | So, try_to_unmap_file() handles non-linear mappings by scanning a certain | 570 | all VMAs in the non-linear list to ensure that the page is not/should not |
487 | number of pages--a "cluster"--in each non-linear vma associated with the page's | 571 | be mlocked. |
488 | mapping, for each file mapped page that vmscan tries to unmap. If this happens | 572 | |
489 | to unmap the page we're trying to unmap, try_to_unmap() will notice this on | 573 | If a VM_LOCKED VMA is found in the list, the scan could terminate. |
490 | return--(page_mapcount(page) == 0)--and return SWAP_SUCCESS. Otherwise, it | 574 | However, there is no easy way to determine whether the page is actually |
491 | will return SWAP_AGAIN, causing vmscan to recirculate this page. We take | 575 | mapped in a given VMA - either for unmapping or testing whether the |
492 | advantage of the cluster scan in try_to_unmap_cluster() as follows: | 576 | VM_LOCKED VMA actually pins the page. |
493 | 577 | ||
494 | For each non-linear vma, try_to_unmap_cluster() attempts to acquire the mmap | 578 | try_to_unmap_file() handles non-linear mappings by scanning a certain |
495 | semaphore of the associated mm_struct for read without blocking. If this | 579 | number of pages - a "cluster" - in each non-linear VMA associated with the |
496 | attempt is successful and the vma is VM_LOCKED, try_to_unmap_cluster() will | 580 | page's mapping, for each file mapped page that vmscan tries to unmap. If |
497 | retain the mmap semaphore for the scan; otherwise it drops it here. Then, | 581 | this happens to unmap the page we're trying to unmap, try_to_unmap() will |
498 | for each page in the cluster, if we're holding the mmap semaphore for a locked | 582 | notice this on return (page_mapcount(page) will be 0) and return |
499 | vma, try_to_unmap_cluster() calls mlock_vma_page() to mlock the page. This | 583 | SWAP_SUCCESS. Otherwise, it will return SWAP_AGAIN, causing vmscan to |
500 | call is a no-op if the page is already locked, but will mlock any pages in | 584 | recirculate this page. We take advantage of the cluster scan in |
501 | the non-linear mapping that happen to be unlocked. If one of the pages so | 585 | try_to_unmap_cluster() as follows: |
502 | mlocked is the page passed in to try_to_unmap(), try_to_unmap_cluster() will | 586 | |
503 | return SWAP_MLOCK, rather than the default SWAP_AGAIN. This will allow vmscan | 587 | For each non-linear VMA, try_to_unmap_cluster() attempts to acquire the |
504 | to cull the page, rather than recirculating it on the inactive list. Again, | 588 | mmap semaphore of the associated mm_struct for read without blocking. |
505 | if try_to_unmap_cluster() cannot acquire the vma's mmap sem, it returns | 589 | |
506 | SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED vma, but | 590 | If this attempt is successful and the VMA is VM_LOCKED, |
507 | couldn't be mlocked. | 591 | try_to_unmap_cluster() will retain the mmap semaphore for the scan; |
508 | 592 | otherwise it drops it here. | |
509 | 593 | ||
510 | Mlocked pages: try_to_munlock() Reverse Map Scan | 594 | Then, for each page in the cluster, if we're holding the mmap semaphore |
511 | 595 | for a locked VMA, try_to_unmap_cluster() calls mlock_vma_page() to | |
512 | TODO/FIXME: a better name might be page_mlocked()--analogous to the | 596 | mlock the page. This call is a no-op if the page is already locked, |
513 | page_referenced() reverse map walker. | 597 | but will mlock any pages in the non-linear mapping that happen to be |
514 | 598 | unlocked. | |
515 | When munlock_vma_page()--see "Mlocked Pages: munlock()/munlockall() | 599 | |
516 | System Call Handling" above--tries to munlock a page, it needs to | 600 | If one of the pages so mlocked is the page passed in to try_to_unmap(), |
517 | determine whether or not the page is mapped by any VM_LOCKED vma, without | 601 | try_to_unmap_cluster() will return SWAP_MLOCK, rather than the default |
518 | actually attempting to unmap all ptes from the page. For this purpose, the | 602 | SWAP_AGAIN. This will allow vmscan to cull the page, rather than |
519 | unevictable/mlock infrastructure introduced a variant of try_to_unmap() called | 603 | recirculating it on the inactive list. |
520 | try_to_munlock(). | 604 | |
605 | Again, if try_to_unmap_cluster() cannot acquire the VMA's mmap sem, it | ||
606 | returns SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED | ||
607 | VMA, but couldn't be mlocked. | ||
608 | |||
609 | |||
610 | try_to_munlock() REVERSE MAP SCAN | ||
611 | --------------------------------- | ||
612 | |||
613 | [!] TODO/FIXME: a better name might be page_mlocked() - analogous to the | ||
614 | page_referenced() reverse map walker. | ||
615 | |||
616 | When munlock_vma_page() [see section "munlock()/munlockall() System Call | ||
617 | Handling" above] tries to munlock a page, it needs to determine whether or not | ||
618 | the page is mapped by any VM_LOCKED VMA without actually attempting to unmap | ||
619 | all PTEs from the page. For this purpose, the unevictable/mlock infrastructure | ||
620 | introduced a variant of try_to_unmap() called try_to_munlock(). | ||
521 | 621 | ||
522 | try_to_munlock() calls the same functions as try_to_unmap() for anonymous and | 622 | try_to_munlock() calls the same functions as try_to_unmap() for anonymous and |
523 | mapped file pages with an additional argument specifying unlock versus unmap | 623 | mapped file pages with an additional argument specifying unlock versus unmap |
524 | processing. Again, these functions walk the respective reverse maps looking | 624 | processing. Again, these functions walk the respective reverse maps looking |
525 | for VM_LOCKED vmas. When such a vma is found for anonymous pages and file | 625 | for VM_LOCKED VMAs. When such a VMA is found for anonymous pages and file |
526 | pages mapped in linear VMAs, as in the try_to_unmap() case, the functions | 626 | pages mapped in linear VMAs, as in the try_to_unmap() case, the functions |
527 | attempt to acquire the associated mmap semaphore, mlock the page via | 627 | attempt to acquire the associated mmap semaphore, mlock the page via |
528 | mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the | 628 | mlock_vma_page() and return SWAP_MLOCK. This effectively undoes the |
529 | pre-clearing of the page's PG_mlocked done by munlock_vma_page. | 629 | pre-clearing of the page's PG_mlocked done by munlock_vma_page. |
530 | 630 | ||
531 | If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap | 631 | If try_to_unmap() is unable to acquire a VM_LOCKED VMA's associated mmap |
532 | semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() | 632 | semaphore, it will return SWAP_AGAIN. This will allow shrink_page_list() to |
533 | to recycle the page on the inactive list and hope that it has better luck | 633 | recycle the page on the inactive list and hope that it has better luck with the |
534 | with the page next time. | 634 | page next time. |
535 | 635 | ||
536 | For file pages mapped into non-linear vmas, the try_to_munlock() logic works | 636 | For file pages mapped into non-linear VMAs, the try_to_munlock() logic works |
537 | slightly differently. On encountering a VM_LOCKED non-linear vma that might | 637 | slightly differently. On encountering a VM_LOCKED non-linear VMA that might |
538 | map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking | 638 | map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking the |
539 | the page. munlock_vma_page() will just leave the page unlocked and let | 639 | page. munlock_vma_page() will just leave the page unlocked and let vmscan deal |
540 | vmscan deal with it--the usual fallback position. | 640 | with it - the usual fallback position. |
541 | 641 | ||
542 | Note that try_to_munlock()'s reverse map walk must visit every vma in a pages' | 642 | Note that try_to_munlock()'s reverse map walk must visit every VMA in a page's |
543 | reverse map to determine that a page is NOT mapped into any VM_LOCKED vma. | 643 | reverse map to determine that a page is NOT mapped into any VM_LOCKED VMA. |
544 | However, the scan can terminate when it encounters a VM_LOCKED vma and can | 644 | However, the scan can terminate when it encounters a VM_LOCKED VMA and can |
545 | successfully acquire the vma's mmap semaphore for read and mlock the page. | 645 | successfully acquire the VMA's mmap semaphore for read and mlock the page. |
546 | Although try_to_munlock() can be called many [very many!] times when | 646 | Although try_to_munlock() might be called a great many times when munlocking a |
547 | munlock()ing a large region or tearing down a large address space that has been | 647 | large region or tearing down a large address space that has been mlocked via |
548 | mlocked via mlockall(), overall this is a fairly rare event. | 648 | mlockall(), overall this is a fairly rare event. |
549 | 649 | ||
550 | Mlocked Page: Page Reclaim in shrink_*_list() | 650 | |
551 | 651 | PAGE RECLAIM IN shrink_*_list() | |
552 | shrink_active_list() culls any obviously unevictable pages--i.e., | 652 | ------------------------------- |
553 | !page_evictable(page, NULL)--diverting these to the unevictable lru | 653 | |
554 | list. However, shrink_active_list() only sees unevictable pages that | 654 | shrink_active_list() culls any obviously unevictable pages - i.e. |
555 | made it onto the active/inactive lru lists. Note that these pages do not | 655 | !page_evictable(page, NULL) - diverting these to the unevictable list. |
556 | have PageUnevictable set--otherwise, they would be on the unevictable list and | 656 | However, shrink_active_list() only sees unevictable pages that made it onto the |
557 | shrink_active_list would never see them. | 657 | active/inactive LRU lists. Note that these pages do not have PageUnevictable |
658 | set - otherwise they would be on the unevictable list and shrink_active_list | ||
659 | would never see them. | ||
558 | 660 | ||
559 | Some examples of these unevictable pages on the LRU lists are: | 661 | Some examples of these unevictable pages on the LRU lists are: |
560 | 662 | ||
561 | 1) ramfs pages that have been placed on the lru lists when first allocated. | 663 | (1) ramfs pages that have been placed on the LRU lists when first allocated. |
664 | |||
665 | (2) SHM_LOCK'd shared memory pages. shmctl(SHM_LOCK) does not attempt to | ||
666 | allocate or fault in the pages in the shared memory region. This happens | ||
667 | when an application accesses the page the first time after SHM_LOCK'ing | ||
668 | the segment. | ||
562 | 669 | ||
563 | 2) SHM_LOCKed shared memory pages. shmctl(SHM_LOCK) does not attempt to | 670 | (3) mlocked pages that could not be isolated from the LRU and moved to the |
564 | allocate or fault in the pages in the shared memory region. This happens | 671 | unevictable list in mlock_vma_page(). |
565 | when an application accesses the page the first time after SHM_LOCKing | ||
566 | the segment. | ||
567 | 672 | ||
568 | 3) Mlocked pages that could not be isolated from the lru and moved to the | 673 | (4) Pages mapped into multiple VM_LOCKED VMAs, but try_to_munlock() couldn't |
569 | unevictable list in mlock_vma_page(). | 674 | acquire the VMA's mmap semaphore to test the flags and set PageMlocked. |
675 | munlock_vma_page() was forced to let the page back on to the normal LRU | ||
676 | list for vmscan to handle. | ||
570 | 677 | ||
571 | 3) Pages mapped into multiple VM_LOCKED vmas, but try_to_munlock() couldn't | 678 | shrink_inactive_list() also diverts any unevictable pages that it finds on the |
572 | acquire the vma's mmap semaphore to test the flags and set PageMlocked. | 679 | inactive lists to the appropriate zone's unevictable list. |
573 | munlock_vma_page() was forced to let the page back on to the normal | ||
574 | LRU list for vmscan to handle. | ||
575 | 680 | ||
576 | shrink_inactive_list() also culls any unevictable pages that it finds on | 681 | shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd |
577 | the inactive lists, again diverting them to the appropriate zone's unevictable | 682 | after shrink_active_list() had moved them to the inactive list, or pages mapped |
578 | lru list. shrink_inactive_list() should only see SHM_LOCKed pages that became | 683 | into VM_LOCKED VMAs that munlock_vma_page() couldn't isolate from the LRU to |
579 | SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or | 684 | recheck via try_to_munlock(). shrink_inactive_list() won't notice the latter, |
580 | pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from | 685 | but will pass on to shrink_page_list(). |
581 | the lru to recheck via try_to_munlock(). shrink_inactive_list() won't notice | ||
582 | the latter, but will pass on to shrink_page_list(). | ||
583 | 686 | ||
584 | shrink_page_list() again culls obviously unevictable pages that it could | 687 | shrink_page_list() again culls obviously unevictable pages that it could |
585 | encounter for similar reasons to shrink_inactive_list(). Pages mapped into | 688 | encounter for similar reasons to shrink_inactive_list(). Pages mapped into |
586 | VM_LOCKED vmas but without PG_mlocked set will make it all the way to | 689 | VM_LOCKED VMAs but without PG_mlocked set will make it all the way to |
587 | try_to_unmap(). shrink_page_list() will divert them to the unevictable list | 690 | try_to_unmap(). shrink_page_list() will divert them to the unevictable list |
588 | when try_to_unmap() returns SWAP_MLOCK, as discussed above. | 691 | when try_to_unmap() returns SWAP_MLOCK, as discussed above. |