author     Dmitry Torokhov <dmitry.torokhov@gmail.com>   2010-03-01 02:55:20 -0500
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>   2010-03-01 02:55:20 -0500
commit     35858adbfca13678af99fb31618ef4428d6dedb0 (patch)
tree       3336feaa61324486945816cb52c347733e7c0821 /Documentation/vm
parent     197d4db752e67160d79fed09968c2140376a80a3 (diff)
parent     4b70858ba8d4537daf782defebe5f2ff80ccef2b (diff)

Merge branch 'next' into for-linus

Diffstat (limited to 'Documentation/vm')

-rw-r--r--  Documentation/vm/hugetlbpage.txt | 262
-rw-r--r--  Documentation/vm/hwpoison.txt    |  52
-rw-r--r--  Documentation/vm/ksm.txt         |  22
-rw-r--r--  Documentation/vm/page-types.c    |  83

4 files changed, 289 insertions, 130 deletions
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index 82a7bd1800b2..bc31636973e3 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -11,23 +11,21 @@ This optimization is more critical now as bigger and bigger physical memories
11 | (several GBs) are more readily available. | 11 | (several GBs) are more readily available. |
12 | 12 | ||
13 | Users can use the huge page support in Linux kernel by either using the mmap | 13 | Users can use the huge page support in Linux kernel by either using the mmap |
14 | system call or standard SYSv shared memory system calls (shmget, shmat). | 14 | system call or standard SYSV shared memory system calls (shmget, shmat). |
15 | 15 | ||
16 | First the Linux kernel needs to be built with the CONFIG_HUGETLBFS | 16 | First the Linux kernel needs to be built with the CONFIG_HUGETLBFS |
17 | (present under "File systems") and CONFIG_HUGETLB_PAGE (selected | 17 | (present under "File systems") and CONFIG_HUGETLB_PAGE (selected |
18 | automatically when CONFIG_HUGETLBFS is selected) configuration | 18 | automatically when CONFIG_HUGETLBFS is selected) configuration |
19 | options. | 19 | options. |
20 | 20 | ||
21 | The kernel built with huge page support should show the number of configured | 21 | The /proc/meminfo file provides information about the total number of |
22 | huge pages in the system by running the "cat /proc/meminfo" command. | 22 | persistent hugetlb pages in the kernel's huge page pool. It also displays |
23 | information about the number of free, reserved and surplus huge pages and the | ||
24 | default huge page size. The huge page size is needed for generating the | ||
25 | proper alignment and size of the arguments to system calls that map huge page | ||
26 | regions. | ||
23 | 27 | ||
24 | /proc/meminfo also provides information about the total number of hugetlb | 28 | The output of "cat /proc/meminfo" will include lines like: |
25 | pages configured in the kernel. It also displays information about the | ||
26 | number of free hugetlb pages at any time. It also displays information about | ||
27 | the configured huge page size - this is needed for generating the proper | ||
28 | alignment and size of the arguments to the above system calls. | ||
29 | |||
30 | The output of "cat /proc/meminfo" will have lines like: | ||
31 | 29 | ||
32 | ..... | 30 | ..... |
33 | HugePages_Total: vvv | 31 | HugePages_Total: vvv |
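
The meminfo listing is cut off by the hunk boundary here, but the fields it
documents (HugePages_Free, Hugepagesize and friends) are easy to read
programmatically. A minimal C sketch, using only the field names given in the
text above:

/* Illustrative sketch: read the default huge page size and free count
 * from /proc/meminfo, as described in the text above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/meminfo", "r");
	char line[256];
	unsigned long hp_size_kb = 0, hp_free = 0;

	if (!f) {
		perror("fopen /proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		sscanf(line, "Hugepagesize: %lu kB", &hp_size_kb);
		sscanf(line, "HugePages_Free: %lu", &hp_free);
	}
	fclose(f);
	printf("huge page size: %lu kB, free huge pages: %lu\n",
	       hp_size_kb, hp_free);
	return 0;
}

The Hugepagesize value is the default huge page size that mmap and shared
memory arguments should be sized and aligned to, as the paragraph above notes.
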
@@ -53,59 +51,63 @@ HugePages_Surp is short for "surplus," and is the number of huge pages in
53 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured | 51 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured |
54 | in the kernel. | 52 | in the kernel. |
55 | 53 | ||
56 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb | 54 | /proc/sys/vm/nr_hugepages indicates the current number of "persistent" huge |
57 | pages in the kernel. Super user can dynamically request more (or free some | 55 | pages in the kernel's huge page pool. "Persistent" huge pages will be |
58 | pre-configured) huge pages. | 56 | returned to the huge page pool when freed by a task. A user with root |
59 | The allocation (or deallocation) of hugetlb pages is possible only if there are | 57 | privileges can dynamically allocate more or free some persistent huge pages |
60 | enough physically contiguous free pages in system (freeing of huge pages is | 58 | by increasing or decreasing the value of 'nr_hugepages'. |
61 | possible only if there are enough hugetlb pages free that can be transferred | ||
62 | back to regular memory pool). | ||
63 | 59 | ||
64 | Pages that are used as hugetlb pages are reserved inside the kernel and cannot | 60 | Pages that are used as huge pages are reserved inside the kernel and cannot |
65 | be used for other purposes. | 61 | be used for other purposes. Huge pages cannot be swapped out under |
62 | memory pressure. | ||
66 | 63 | ||
67 | Once the kernel with Hugetlb page support is built and running, a user can | 64 | Once a number of huge pages have been pre-allocated to the kernel huge page |
68 | use either the mmap system call or shared memory system calls to start using | 65 | pool, a user with appropriate privilege can use either the mmap system call |
69 | the huge pages. It is required that the system administrator preallocate | 66 | or shared memory system calls to use the huge pages. See the discussion of |
70 | enough memory for huge page purposes. | 67 | Using Huge Pages, below. |
71 | 68 | ||
72 | The administrator can preallocate huge pages on the kernel boot command line by | 69 | The administrator can allocate persistent huge pages on the kernel boot |
73 | specifying the "hugepages=N" parameter, where 'N' = the number of huge pages | 70 | command line by specifying the "hugepages=N" parameter, where 'N' = the |
74 | requested. This is the most reliable method for preallocating huge pages as | 71 | number of huge pages requested. This is the most reliable method of |
75 | memory has not yet become fragmented. | 72 | allocating huge pages as memory has not yet become fragmented. |
76 | 73 | ||
77 | Some platforms support multiple huge page sizes. To preallocate huge pages | 74 | Some platforms support multiple huge page sizes. To allocate huge pages |
78 | of a specific size, one must preceed the huge pages boot command parameters | 75 | of a specific size, one must preceed the huge pages boot command parameters |
79 | with a huge page size selection parameter "hugepagesz=<size>". <size> must | 76 | with a huge page size selection parameter "hugepagesz=<size>". <size> must |
80 | be specified in bytes with optional scale suffix [kKmMgG]. The default huge | 77 | be specified in bytes with optional scale suffix [kKmMgG]. The default huge |
81 | page size may be selected with the "default_hugepagesz=<size>" boot parameter. | 78 | page size may be selected with the "default_hugepagesz=<size>" boot parameter. |
82 | 79 | ||
83 | /proc/sys/vm/nr_hugepages indicates the current number of configured [default | 80 | When multiple huge page sizes are supported, /proc/sys/vm/nr_hugepages |
84 | size] hugetlb pages in the kernel. Super user can dynamically request more | 81 | indicates the current number of pre-allocated huge pages of the default size. |
85 | (or free some pre-configured) huge pages. | 82 | Thus, one can use the following command to dynamically allocate/deallocate |
86 | 83 | default sized persistent huge pages: | |
87 | Use the following command to dynamically allocate/deallocate default sized | ||
88 | huge pages: | ||
89 | 84 | ||
90 | echo 20 > /proc/sys/vm/nr_hugepages | 85 | echo 20 > /proc/sys/vm/nr_hugepages |
91 | 86 | ||
92 | This command will try to configure 20 default sized huge pages in the system. | 87 | This command will try to adjust the number of default sized huge pages in the |
88 | huge page pool to 20, allocating or freeing huge pages, as required. | ||
89 | |||
93 | On a NUMA platform, the kernel will attempt to distribute the huge page pool | 90 | On a NUMA platform, the kernel will attempt to distribute the huge page pool |
94 | over the all on-line nodes. These huge pages, allocated when nr_hugepages | 91 | over all the set of allowed nodes specified by the NUMA memory policy of the |
95 | is increased, are called "persistent huge pages". | 92 | task that modifies nr_hugepages. The default for the allowed nodes--when the |
93 | task has default memory policy--is all on-line nodes with memory. Allowed | ||
94 | nodes with insufficient available, contiguous memory for a huge page will be | ||
95 | silently skipped when allocating persistent huge pages. See the discussion | ||
96 | below of the interaction of task memory policy, cpusets and per node attributes | ||
97 | with the allocation and freeing of persistent huge pages. | ||
96 | 98 | ||
97 | The success or failure of huge page allocation depends on the amount of | 99 | The success or failure of huge page allocation depends on the amount of |
98 | physically contiguous memory that is preset in system at the time of the | 100 | physically contiguous memory that is present in system at the time of the |
99 | allocation attempt. If the kernel is unable to allocate huge pages from | 101 | allocation attempt. If the kernel is unable to allocate huge pages from |
100 | some nodes in a NUMA system, it will attempt to make up the difference by | 102 | some nodes in a NUMA system, it will attempt to make up the difference by |
101 | allocating extra pages on other nodes with sufficient available contiguous | 103 | allocating extra pages on other nodes with sufficient available contiguous |
102 | memory, if any. | 104 | memory, if any. |
103 | 105 | ||
104 | System administrators may want to put this command in one of the local rc init | 106 | System administrators may want to put this command in one of the local rc |
105 | files. This will enable the kernel to request huge pages early in the boot | 107 | init files. This will enable the kernel to allocate huge pages early in |
106 | process when the possibility of getting physical contiguous pages is still | 108 | the boot process when the possibility of getting physical contiguous pages |
107 | very high. Administrators can verify the number of huge pages actually | 109 | is still very high. Administrators can verify the number of huge pages |
108 | allocated by checking the sysctl or meminfo. To check the per node | 110 | actually allocated by checking the sysctl or meminfo. To check the per node |
109 | distribution of huge pages in a NUMA system, use: | 111 | distribution of huge pages in a NUMA system, use: |
110 | 112 | ||
111 | cat /sys/devices/system/node/node*/meminfo | fgrep Huge | 113 | cat /sys/devices/system/node/node*/meminfo | fgrep Huge |
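
The echo shown earlier can also be driven from a program. A short C sketch
(run as root; the kernel may satisfy only part of the request, as the text
above explains):

/* Sketch: request N persistent huge pages and report what was allocated. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *path = "/proc/sys/vm/nr_hugepages";
	unsigned long want = argc > 1 ? strtoul(argv[1], NULL, 0) : 20;
	unsigned long got = 0;
	FILE *f;

	f = fopen(path, "w");
	if (!f || fprintf(f, "%lu\n", want) < 0) {
		perror(path);
		return 1;
	}
	fclose(f);

	f = fopen(path, "r");
	if (!f || fscanf(f, "%lu", &got) != 1) {
		perror(path);
		return 1;
	}
	fclose(f);

	printf("requested %lu huge pages, pool now holds %lu\n", want, got);
	return 0;
}
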
@@ -113,45 +115,47 @@ distribution of huge pages in a NUMA system, use:
113 | /proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of | 115 | /proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of |
114 | huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are | 116 | huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are |
115 | requested by applications. Writing any non-zero value into this file | 117 | requested by applications. Writing any non-zero value into this file |
116 | indicates that the hugetlb subsystem is allowed to try to obtain "surplus" | 118 | indicates that the hugetlb subsystem is allowed to try to obtain that |
117 | huge pages from the buddy allocator, when the normal pool is exhausted. As | 119 | number of "surplus" huge pages from the kernel's normal page pool, when the |
118 | these surplus huge pages go out of use, they are freed back to the buddy | 120 | persistent huge page pool is exhausted. As these surplus huge pages become |
119 | allocator. | 121 | unused, they are freed back to the kernel's normal page pool. |
120 | 122 | ||
121 | When increasing the huge page pool size via nr_hugepages, any surplus | 123 | When increasing the huge page pool size via nr_hugepages, any existing surplus |
122 | pages will first be promoted to persistent huge pages. Then, additional | 124 | pages will first be promoted to persistent huge pages. Then, additional |
123 | huge pages will be allocated, if necessary and if possible, to fulfill | 125 | huge pages will be allocated, if necessary and if possible, to fulfill |
124 | the new huge page pool size. | 126 | the new persistent huge page pool size. |
125 | 127 | ||
126 | The administrator may shrink the pool of preallocated huge pages for | 128 | The administrator may shrink the pool of persistent huge pages for |
127 | the default huge page size by setting the nr_hugepages sysctl to a | 129 | the default huge page size by setting the nr_hugepages sysctl to a |
128 | smaller value. The kernel will attempt to balance the freeing of huge pages | 130 | smaller value. The kernel will attempt to balance the freeing of huge pages |
129 | across all on-line nodes. Any free huge pages on the selected nodes will | 131 | across all nodes in the memory policy of the task modifying nr_hugepages. |
130 | be freed back to the buddy allocator. | 132 | Any free huge pages on the selected nodes will be freed back to the kernel's |
131 | 133 | normal page pool. | |
132 | Caveat: Shrinking the pool via nr_hugepages such that it becomes less | 134 | |
133 | than the number of huge pages in use will convert the balance to surplus | 135 | Caveat: Shrinking the persistent huge page pool via nr_hugepages such that |
134 | huge pages even if it would exceed the overcommit value. As long as | 136 | it becomes less than the number of huge pages in use will convert the balance |
135 | this condition holds, however, no more surplus huge pages will be | 137 | of the in-use huge pages to surplus huge pages. This will occur even if |
136 | allowed on the system until one of the two sysctls are increased | 138 | the number of surplus pages it would exceed the overcommit value. As long as |
137 | sufficiently, or the surplus huge pages go out of use and are freed. | 139 | this condition holds--that is, until nr_hugepages+nr_overcommit_hugepages is |
140 | increased sufficiently, or the surplus huge pages go out of use and are freed-- | ||
141 | no more surplus huge pages will be allowed to be allocated. | ||
138 | 142 | ||
139 | With support for multiple huge page pools at run-time available, much of | 143 | With support for multiple huge page pools at run-time available, much of |
140 | the huge page userspace interface has been duplicated in sysfs. The above | 144 | the huge page userspace interface in /proc/sys/vm has been duplicated in sysfs. |
141 | information applies to the default huge page size which will be | 145 | The /proc interfaces discussed above have been retained for backwards |
142 | controlled by the /proc interfaces for backwards compatibility. The root | 146 | compatibility. The root huge page control directory in sysfs is: |
143 | huge page control directory in sysfs is: | ||
144 | 147 | ||
145 | /sys/kernel/mm/hugepages | 148 | /sys/kernel/mm/hugepages |
146 | 149 | ||
147 | For each huge page size supported by the running kernel, a subdirectory | 150 | For each huge page size supported by the running kernel, a subdirectory |
148 | will exist, of the form | 151 | will exist, of the form: |
149 | 152 | ||
150 | hugepages-${size}kB | 153 | hugepages-${size}kB |
151 | 154 | ||
152 | Inside each of these directories, the same set of files will exist: | 155 | Inside each of these directories, the same set of files will exist: |
153 | 156 | ||
154 | nr_hugepages | 157 | nr_hugepages |
158 | nr_hugepages_mempolicy | ||
155 | nr_overcommit_hugepages | 159 | nr_overcommit_hugepages |
156 | free_hugepages | 160 | free_hugepages |
157 | resv_hugepages | 161 | resv_hugepages |
@@ -159,6 +163,102 @@ Inside each of these directories, the same set of files will exist:
159 | 163 | ||
160 | which function as described above for the default huge page-sized case. | 164 | which function as described above for the default huge page-sized case. |
161 | 165 | ||
166 | |||
167 | Interaction of Task Memory Policy with Huge Page Allocation/Freeing | ||
168 | |||
169 | Whether huge pages are allocated and freed via the /proc interface or | ||
170 | the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA | ||
171 | nodes from which huge pages are allocated or freed are controlled by the | ||
172 | NUMA memory policy of the task that modifies the nr_hugepages_mempolicy | ||
173 | sysctl or attribute. When the nr_hugepages attribute is used, mempolicy | ||
174 | is ignored. | ||
175 | |||
176 | The recommended method to allocate or free huge pages to/from the kernel | ||
177 | huge page pool, using the nr_hugepages example above, is: | ||
178 | |||
179 | numactl --interleave <node-list> echo 20 \ | ||
180 | >/proc/sys/vm/nr_hugepages_mempolicy | ||
181 | |||
182 | or, more succinctly: | ||
183 | |||
184 | numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy | ||
185 | |||
186 | This will allocate or free abs(20 - nr_hugepages) to or from the nodes | ||
187 | specified in <node-list>, depending on whether number of persistent huge pages | ||
188 | is initially less than or greater than 20, respectively. No huge pages will be | ||
189 | allocated nor freed on any node not included in the specified <node-list>. | ||
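
For reference, the same effect as the numactl invocations above can be had
from inside a program with set_mempolicy(2) (declared in numaif.h from
libnuma; link with -lnuma). A sketch follows; the two-node interleave mask is
only an illustrative assumption:

/* Sketch: interleave huge page allocation over nodes 0 and 1 (assumed
 * node numbers), then write the request to nr_hugepages_mempolicy. */
#include <stdio.h>
#include <numaif.h>		/* set_mempolicy(), MPOL_INTERLEAVE */

int main(void)
{
	unsigned long nodemask = 0x3;	/* nodes 0 and 1; adjust as needed */
	FILE *f;

	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask))) {
		perror("set_mempolicy");
		return 1;
	}

	f = fopen("/proc/sys/vm/nr_hugepages_mempolicy", "w");
	if (!f) {
		perror("nr_hugepages_mempolicy");
		return 1;
	}
	if (fprintf(f, "20\n") < 0)	/* same request as the numactl example */
		perror("write");
	fclose(f);
	return 0;
}
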
190 | |||
191 | When adjusting the persistent hugepage count via nr_hugepages_mempolicy, any | ||
192 | memory policy mode--bind, preferred, local or interleave--may be used. The | ||
193 | resulting effect on persistent huge page allocation is as follows: | ||
194 | |||
195 | 1) Regardless of mempolicy mode [see Documentation/vm/numa_memory_policy.txt], | ||
196 | persistent huge pages will be distributed across the node or nodes | ||
197 | specified in the mempolicy as if "interleave" had been specified. | ||
198 | However, if a node in the policy does not contain sufficient contiguous | ||
199 | memory for a huge page, the allocation will not "fallback" to the nearest | ||
200 | neighbor node with sufficient contiguous memory. To do this would cause | ||
201 | undesirable imbalance in the distribution of the huge page pool, or | ||
202 | possibly, allocation of persistent huge pages on nodes not allowed by | ||
203 | the task's memory policy. | ||
204 | |||
205 | 2) One or more nodes may be specified with the bind or interleave policy. | ||
206 | If more than one node is specified with the preferred policy, only the | ||
207 | lowest numeric id will be used. Local policy will select the node where | ||
208 | the task is running at the time the nodes_allowed mask is constructed. | ||
209 | For local policy to be deterministic, the task must be bound to a cpu or | ||
210 | cpus in a single node. Otherwise, the task could be migrated to some | ||
211 | other node at any time after launch and the resulting node will be | ||
212 | indeterminate. Thus, local policy is not very useful for this purpose. | ||
213 | Any of the other mempolicy modes may be used to specify a single node. | ||
214 | |||
215 | 3) The nodes allowed mask will be derived from any non-default task mempolicy, | ||
216 | whether this policy was set explicitly by the task itself or one of its | ||
217 | ancestors, such as numactl. This means that if the task is invoked from a | ||
218 | shell with non-default policy, that policy will be used. One can specify a | ||
219 | node list of "all" with numactl --interleave or --membind [-m] to achieve | ||
220 | interleaving over all nodes in the system or cpuset. | ||
221 | |||
222 | 4) Any task mempolicy specifed--e.g., using numactl--will be constrained by | ||
223 | the resource limits of any cpuset in which the task runs. Thus, there will | ||
224 | be no way for a task with non-default policy running in a cpuset with a | ||
225 | subset of the system nodes to allocate huge pages outside the cpuset | ||
226 | without first moving to a cpuset that contains all of the desired nodes. | ||
227 | |||
228 | 5) Boot-time huge page allocation attempts to distribute the requested number | ||
229 | of huge pages over all on-lines nodes with memory. | ||
230 | |||
231 | Per Node Hugepages Attributes | ||
232 | |||
233 | A subset of the contents of the root huge page control directory in sysfs, | ||
234 | described above, will be replicated under each the system device of each | ||
235 | NUMA node with memory in: | ||
236 | |||
237 | /sys/devices/system/node/node[0-9]*/hugepages/ | ||
238 | |||
239 | Under this directory, the subdirectory for each supported huge page size | ||
240 | contains the following attribute files: | ||
241 | |||
242 | nr_hugepages | ||
243 | free_hugepages | ||
244 | surplus_hugepages | ||
245 | |||
246 | The free_' and surplus_' attribute files are read-only. They return the number | ||
247 | of free and surplus [overcommitted] huge pages, respectively, on the parent | ||
248 | node. | ||
249 | |||
250 | The nr_hugepages attribute returns the total number of huge pages on the | ||
251 | specified node. When this attribute is written, the number of persistent huge | ||
252 | pages on the parent node will be adjusted to the specified value, if sufficient | ||
253 | resources exist, regardless of the task's mempolicy or cpuset constraints. | ||
254 | |||
255 | Note that the number of overcommit and reserve pages remain global quantities, | ||
256 | as we don't know until fault time, when the faulting task's mempolicy is | ||
257 | applied, from which node the huge page allocation will be attempted. | ||
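
A quick illustration of reading one of these per node attributes from C;
node0 and the 2048kB size directory are assumed example names, not a
guarantee of what a given system exposes:

/* Sketch: query free huge pages of one size on one NUMA node. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages";
	unsigned long free_hp = 0;
	FILE *f = fopen(path, "r");

	if (!f || fscanf(f, "%lu", &free_hp) != 1) {
		perror(path);
		return 1;
	}
	fclose(f);
	printf("node 0: %lu free 2048kB huge pages\n", free_hp);
	return 0;
}
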
258 | |||
259 | |||
260 | Using Huge Pages | ||
261 | |||
162 | If the user applications are going to request huge pages using mmap system | 262 | If the user applications are going to request huge pages using mmap system |
163 | call, then it is required that system administrator mount a file system of | 263 | call, then it is required that system administrator mount a file system of |
164 | type hugetlbfs: | 264 | type hugetlbfs: |
@@ -206,9 +306,11 @@ map_hugetlb.c.
206 | * requesting huge pages. | 306 | * requesting huge pages. |
207 | * | 307 | * |
208 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for | 308 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for |
209 | * huge pages. That means the addresses starting with 0x800000... will need | 309 | * huge pages. That means that if one requires a fixed address, a huge page |
210 | * to be specified. Specifying a fixed address is not required on ppc64, | 310 | * aligned address starting with 0x800000... will be required. If a fixed |
211 | * i386 or x86_64. | 311 | * address is not required, the kernel will select an address in the proper |
312 | * range. | ||
313 | * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. | ||
212 | * | 314 | * |
213 | * Note: The default shared memory limit is quite low on many kernels, | 315 | * Note: The default shared memory limit is quite low on many kernels, |
214 | * you may need to increase it via: | 316 | * you may need to increase it via: |
@@ -237,14 +339,8 @@ map_hugetlb.c.
237 | 339 | ||
238 | #define dprintf(x) printf(x) | 340 | #define dprintf(x) printf(x) |
239 | 341 | ||
240 | /* Only ia64 requires this */ | 342 | #define ADDR (void *)(0x0UL) /* let kernel choose address */ |
241 | #ifdef __ia64__ | ||
242 | #define ADDR (void *)(0x8000000000000000UL) | ||
243 | #define SHMAT_FLAGS (SHM_RND) | ||
244 | #else | ||
245 | #define ADDR (void *)(0x0UL) | ||
246 | #define SHMAT_FLAGS (0) | 343 | #define SHMAT_FLAGS (0) |
247 | #endif | ||
248 | 344 | ||
249 | int main(void) | 345 | int main(void) |
250 | { | 346 | { |
@@ -302,10 +398,12 @@ int main(void)
302 | * example, the app is requesting memory of size 256MB that is backed by | 398 | * example, the app is requesting memory of size 256MB that is backed by |
303 | * huge pages. | 399 | * huge pages. |
304 | * | 400 | * |
305 | * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages. | 401 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for |
306 | * That means the addresses starting with 0x800000... will need to be | 402 | * huge pages. That means that if one requires a fixed address, a huge page |
307 | * specified. Specifying a fixed address is not required on ppc64, i386 | 403 | * aligned address starting with 0x800000... will be required. If a fixed |
308 | * or x86_64. | 404 | * address is not required, the kernel will select an address in the proper |
405 | * range. | ||
406 | * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. | ||
309 | */ | 407 | */ |
310 | #include <stdlib.h> | 408 | #include <stdlib.h> |
311 | #include <stdio.h> | 409 | #include <stdio.h> |
@@ -317,14 +415,8 @@ int main(void)
317 | #define LENGTH (256UL*1024*1024) | 415 | #define LENGTH (256UL*1024*1024) |
318 | #define PROTECTION (PROT_READ | PROT_WRITE) | 416 | #define PROTECTION (PROT_READ | PROT_WRITE) |
319 | 417 | ||
320 | /* Only ia64 requires this */ | 418 | #define ADDR (void *)(0x0UL) /* let kernel choose address */ |
321 | #ifdef __ia64__ | ||
322 | #define ADDR (void *)(0x8000000000000000UL) | ||
323 | #define FLAGS (MAP_SHARED | MAP_FIXED) | ||
324 | #else | ||
325 | #define ADDR (void *)(0x0UL) | ||
326 | #define FLAGS (MAP_SHARED) | 419 | #define FLAGS (MAP_SHARED) |
327 | #endif | ||
328 | 420 | ||
329 | void check_bytes(char *addr) | 421 | void check_bytes(char *addr) |
330 | { | 422 | { |
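
The hunks above show only the fragments of the two example programs that this
patch touches. For orientation, a stripped-down mmap sketch in the spirit of
the updated example, letting the kernel pick the address and assuming a
hugetlbfs mount at /mnt/huge:

/* Sketch: map 256MB from a file on a hugetlbfs mount and touch it.
 * /mnt/huge is an assumed mount point; see "Using Huge Pages" above. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

#define LENGTH (256UL * 1024 * 1024)

int main(void)
{
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0755);
	char *addr;
	unsigned long i;

	if (fd < 0) {
		perror("open /mnt/huge/example");
		exit(1);
	}

	/* NULL address: the kernel chooses a suitably aligned range. */
	addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		unlink("/mnt/huge/example");
		exit(1);
	}

	for (i = 0; i < LENGTH; i++)	/* fault in the huge pages */
		addr[i] = (char)(i & 0xff);

	munmap(addr, LENGTH);
	close(fd);
	unlink("/mnt/huge/example");
	return 0;
}
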
diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt
index 3ffadf8da61f..12f9ba20ccb7 100644
--- a/Documentation/vm/hwpoison.txt
+++ b/Documentation/vm/hwpoison.txt
@@ -92,16 +92,62 @@ PR_MCE_KILL_GET
92 | 92 | ||
93 | Testing: | 93 | Testing: |
94 | 94 | ||
95 | madvise(MADV_POISON, ....) | 95 | madvise(MADV_HWPOISON, ....) |
96 | (as root) | 96 | (as root) |
97 | Poison a page in the process for testing | 97 | Poison a page in the process for testing |
98 | 98 | ||
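
A minimal sketch of that madvise-based injection (requires root and a kernel
with memory failure handling; MADV_HWPOISON is given a guarded fallback
definition in case the libc headers predate it):

/* Sketch: software-inject a memory failure into one of our own pages. */
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100	/* value from the kernel's mman-common.h */
#endif

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *page = mmap(NULL, psize, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (page == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	page[0] = 1;	/* make sure the page is actually instantiated */

	if (madvise(page, psize, MADV_HWPOISON)) {
		perror("madvise(MADV_HWPOISON)");
		return 1;
	}
	printf("page at %p poisoned; a later access should raise SIGBUS\n", page);
	return 0;
}
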
99 | 99 | ||
100 | hwpoison-inject module through debugfs | 100 | hwpoison-inject module through debugfs |
101 | /sys/debug/hwpoison/corrupt-pfn | ||
102 | 101 | ||
103 | Inject hwpoison fault at PFN echoed into this file | 102 | /sys/debug/hwpoison/ |
104 | 103 | ||
104 | corrupt-pfn | ||
105 | |||
106 | Inject hwpoison fault at PFN echoed into this file. This does | ||
107 | some early filtering to avoid corrupted unintended pages in test suites. | ||
108 | |||
109 | unpoison-pfn | ||
110 | |||
111 | Software-unpoison page at PFN echoed into this file. This | ||
112 | way a page can be reused again. | ||
113 | This only works for Linux injected failures, not for real | ||
114 | memory failures. | ||
115 | |||
116 | Note these injection interfaces are not stable and might change between | ||
117 | kernel versions | ||
118 | |||
119 | corrupt-filter-dev-major | ||
120 | corrupt-filter-dev-minor | ||
121 | |||
122 | Only handle memory failures to pages associated with the file system defined | ||
123 | by block device major/minor. -1U is the wildcard value. | ||
124 | This should be only used for testing with artificial injection. | ||
125 | |||
126 | corrupt-filter-memcg | ||
127 | |||
128 | Limit injection to pages owned by memgroup. Specified by inode number | ||
129 | of the memcg. | ||
130 | |||
131 | Example: | ||
132 | mkdir /cgroup/hwpoison | ||
133 | |||
134 | usemem -m 100 -s 1000 & | ||
135 | echo `jobs -p` > /cgroup/hwpoison/tasks | ||
136 | |||
137 | memcg_ino=$(ls -id /cgroup/hwpoison | cut -f1 -d' ') | ||
138 | echo $memcg_ino > /debug/hwpoison/corrupt-filter-memcg | ||
139 | |||
140 | page-types -p `pidof init` --hwpoison # shall do nothing | ||
141 | page-types -p `pidof usemem` --hwpoison # poison its pages | ||
142 | |||
143 | corrupt-filter-flags-mask | ||
144 | corrupt-filter-flags-value | ||
145 | |||
146 | When specified, only poison pages if ((page_flags & mask) == value). | ||
147 | This allows stress testing of many kinds of pages. The page_flags | ||
148 | are the same as in /proc/kpageflags. The flag bits are defined in | ||
149 | include/linux/kernel-page-flags.h and documented in | ||
150 | Documentation/vm/pagemap.txt | ||
105 | 151 | ||
106 | Architecture specific MCE injector | 152 | Architecture specific MCE injector |
107 | 153 | ||
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
index 262d8e6793a3..b392e496f816 100644
--- a/Documentation/vm/ksm.txt
+++ b/Documentation/vm/ksm.txt
@@ -16,9 +16,9 @@ by sharing the data common between them. But it can be useful to any
16 | application which generates many instances of the same data. | 16 | application which generates many instances of the same data. |
17 | 17 | ||
18 | KSM only merges anonymous (private) pages, never pagecache (file) pages. | 18 | KSM only merges anonymous (private) pages, never pagecache (file) pages. |
19 | KSM's merged pages are at present locked into kernel memory for as long | 19 | KSM's merged pages were originally locked into kernel memory, but can now |
20 | as they are shared: so cannot be swapped out like the user pages they | 20 | be swapped out just like other user pages (but sharing is broken when they |
21 | replace (but swapping KSM pages should follow soon in a later release). | 21 | are swapped back in: ksmd must rediscover their identity and merge again). |
22 | 22 | ||
23 | KSM only operates on those areas of address space which an application | 23 | KSM only operates on those areas of address space which an application |
24 | has advised to be likely candidates for merging, by using the madvise(2) | 24 | has advised to be likely candidates for merging, by using the madvise(2) |
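
To make the advice call concrete, a sketch of opting a region in and out of
merging; MADV_MERGEABLE and MADV_UNMERGEABLE get guarded fallback values from
the kernel headers in case the libc in use does not define them yet:

/* Sketch: advise an anonymous region as a KSM merge candidate. */
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_MERGEABLE
#define MADV_MERGEABLE   12	/* values from the kernel's mman-common.h */
#define MADV_UNMERGEABLE 13
#endif

#define LEN (16UL * 1024 * 1024)

int main(void)
{
	char *buf = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (madvise(buf, LEN, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");	/* e.g. CONFIG_KSM=n */

	/* ... fill buf with data likely to repeat across instances ... */

	madvise(buf, LEN, MADV_UNMERGEABLE);	/* opt out again */
	munmap(buf, LEN);
	return 0;
}
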
@@ -44,20 +44,12 @@ includes unmapped gaps (though working on the intervening mapped areas),
44 | and might fail with EAGAIN if not enough memory for internal structures. | 44 | and might fail with EAGAIN if not enough memory for internal structures. |
45 | 45 | ||
46 | Applications should be considerate in their use of MADV_MERGEABLE, | 46 | Applications should be considerate in their use of MADV_MERGEABLE, |
47 | restricting its use to areas likely to benefit. KSM's scans may use | 47 | restricting its use to areas likely to benefit. KSM's scans may use a lot |
48 | a lot of processing power, and its kernel-resident pages are a limited | 48 | of processing power: some installations will disable KSM for that reason. |
49 | resource. Some installations will disable KSM for these reasons. | ||
50 | 49 | ||
51 | The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, | 50 | The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, |
52 | readable by all but writable only by root: | 51 | readable by all but writable only by root: |
53 | 52 | ||
54 | max_kernel_pages - set to maximum number of kernel pages that KSM may use | ||
55 | e.g. "echo 100000 > /sys/kernel/mm/ksm/max_kernel_pages" | ||
56 | Value 0 imposes no limit on the kernel pages KSM may use; | ||
57 | but note that any process using MADV_MERGEABLE can cause | ||
58 | KSM to allocate these pages, unswappable until it exits. | ||
59 | Default: quarter of memory (chosen to not pin too much) | ||
60 | |||
61 | pages_to_scan - how many present pages to scan before ksmd goes to sleep | 53 | pages_to_scan - how many present pages to scan before ksmd goes to sleep |
62 | e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan" | 54 | e.g. "echo 100 > /sys/kernel/mm/ksm/pages_to_scan" |
63 | Default: 100 (chosen for demonstration purposes) | 55 | Default: 100 (chosen for demonstration purposes) |
@@ -75,7 +67,7 @@ run - set 0 to stop ksmd from running but keep merged pages,
75 | 67 | ||
76 | The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: | 68 | The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: |
77 | 69 | ||
78 | pages_shared - how many shared unswappable kernel pages KSM is using | 70 | pages_shared - how many shared pages are being used |
79 | pages_sharing - how many more sites are sharing them i.e. how much saved | 71 | pages_sharing - how many more sites are sharing them i.e. how much saved |
80 | pages_unshared - how many pages unique but repeatedly checked for merging | 72 | pages_unshared - how many pages unique but repeatedly checked for merging |
81 | pages_volatile - how many pages changing too fast to be placed in a tree | 73 | pages_volatile - how many pages changing too fast to be placed in a tree |
@@ -87,4 +79,4 @@ pages_volatile embraces several different kinds of activity, but a high
87 | proportion there would also indicate poor use of madvise MADV_MERGEABLE. | 79 | proportion there would also indicate poor use of madvise MADV_MERGEABLE. |
88 | 80 | ||
89 | Izik Eidus, | 81 | Izik Eidus, |
90 | Hugh Dickins, 24 Sept 2009 | 82 | Hugh Dickins, 17 Nov 2009 |
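
Since pages_sharing counts the extra sites sharing a merged page, a rough
estimate of the memory KSM is saving can be computed from the counters listed
above. A small C sketch using the documented sysfs paths:

/* Sketch: estimate memory saved by KSM from its sysfs counters. */
#include <stdio.h>
#include <unistd.h>

static unsigned long read_counter(const char *path)
{
	FILE *f = fopen(path, "r");
	unsigned long val = 0;

	if (f) {
		if (fscanf(f, "%lu", &val) != 1)
			val = 0;
		fclose(f);
	}
	return val;
}

int main(void)
{
	unsigned long shared  = read_counter("/sys/kernel/mm/ksm/pages_shared");
	unsigned long sharing = read_counter("/sys/kernel/mm/ksm/pages_sharing");
	long psize = sysconf(_SC_PAGESIZE);

	printf("pages_shared:  %lu\n", shared);
	printf("pages_sharing: %lu\n", sharing);
	printf("approx. saved: %lu kB\n", sharing * (unsigned long)psize / 1024);
	return 0;
}
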
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
index ea44ea502da1..66e9358e2144 100644
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -1,11 +1,22 @@
1 | /* | 1 | /* |
2 | * page-types: Tool for querying page flags | 2 | * page-types: Tool for querying page flags |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify it | ||
5 | * under the terms of the GNU General Public License as published by the Free | ||
6 | * Software Foundation; version 2. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
11 | * more details. | ||
12 | * | ||
13 | * You should find a copy of v2 of the GNU General Public License somewhere on | ||
14 | * your Linux system; if not, write to the Free Software Foundation, Inc., 59 | ||
15 | * Temple Place, Suite 330, Boston, MA 02111-1307 USA. | ||
16 | * | ||
4 | * Copyright (C) 2009 Intel corporation | 17 | * Copyright (C) 2009 Intel corporation |
5 | * | 18 | * |
6 | * Authors: Wu Fengguang <fengguang.wu@intel.com> | 19 | * Authors: Wu Fengguang <fengguang.wu@intel.com> |
7 | * | ||
8 | * Released under the General Public License (GPL). | ||
9 | */ | 20 | */ |
10 | 21 | ||
11 | #define _LARGEFILE64_SOURCE | 22 | #define _LARGEFILE64_SOURCE |
@@ -100,7 +111,7 @@
100 | #define BIT(name) (1ULL << KPF_##name) | 111 | #define BIT(name) (1ULL << KPF_##name) |
101 | #define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) | 112 | #define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) |
102 | 113 | ||
103 | static char *page_flag_names[] = { | 114 | static const char *page_flag_names[] = { |
104 | [KPF_LOCKED] = "L:locked", | 115 | [KPF_LOCKED] = "L:locked", |
105 | [KPF_ERROR] = "E:error", | 116 | [KPF_ERROR] = "E:error", |
106 | [KPF_REFERENCED] = "R:referenced", | 117 | [KPF_REFERENCED] = "R:referenced", |
@@ -173,7 +184,7 @@ static int kpageflags_fd;
173 | static int opt_hwpoison; | 184 | static int opt_hwpoison; |
174 | static int opt_unpoison; | 185 | static int opt_unpoison; |
175 | 186 | ||
176 | static char *hwpoison_debug_fs = "/debug/hwpoison"; | 187 | static const char hwpoison_debug_fs[] = "/debug/hwpoison"; |
177 | static int hwpoison_inject_fd; | 188 | static int hwpoison_inject_fd; |
178 | static int hwpoison_forget_fd; | 189 | static int hwpoison_forget_fd; |
179 | 190 | ||
@@ -560,7 +571,7 @@ static void walk_pfn(unsigned long voffset,
560 | { | 571 | { |
561 | uint64_t buf[KPAGEFLAGS_BATCH]; | 572 | uint64_t buf[KPAGEFLAGS_BATCH]; |
562 | unsigned long batch; | 573 | unsigned long batch; |
563 | unsigned long pages; | 574 | long pages; |
564 | unsigned long i; | 575 | unsigned long i; |
565 | 576 | ||
566 | while (count) { | 577 | while (count) { |
@@ -673,30 +684,35 @@ static void usage(void)
673 | 684 | ||
674 | printf( | 685 | printf( |
675 | "page-types [options]\n" | 686 | "page-types [options]\n" |
676 | " -r|--raw Raw mode, for kernel developers\n" | 687 | " -r|--raw Raw mode, for kernel developers\n" |
677 | " -a|--addr addr-spec Walk a range of pages\n" | 688 | " -d|--describe flags Describe flags\n" |
678 | " -b|--bits bits-spec Walk pages with specified bits\n" | 689 | " -a|--addr addr-spec Walk a range of pages\n" |
679 | " -p|--pid pid Walk process address space\n" | 690 | " -b|--bits bits-spec Walk pages with specified bits\n" |
691 | " -p|--pid pid Walk process address space\n" | ||
680 | #if 0 /* planned features */ | 692 | #if 0 /* planned features */ |
681 | " -f|--file filename Walk file address space\n" | 693 | " -f|--file filename Walk file address space\n" |
682 | #endif | 694 | #endif |
683 | " -l|--list Show page details in ranges\n" | 695 | " -l|--list Show page details in ranges\n" |
684 | " -L|--list-each Show page details one by one\n" | 696 | " -L|--list-each Show page details one by one\n" |
685 | " -N|--no-summary Don't show summay info\n" | 697 | " -N|--no-summary Don't show summay info\n" |
686 | " -X|--hwpoison hwpoison pages\n" | 698 | " -X|--hwpoison hwpoison pages\n" |
687 | " -x|--unpoison unpoison pages\n" | 699 | " -x|--unpoison unpoison pages\n" |
688 | " -h|--help Show this usage message\n" | 700 | " -h|--help Show this usage message\n" |
701 | "flags:\n" | ||
702 | " 0x10 bitfield format, e.g.\n" | ||
703 | " anon bit-name, e.g.\n" | ||
704 | " 0x10,anon comma-separated list, e.g.\n" | ||
689 | "addr-spec:\n" | 705 | "addr-spec:\n" |
690 | " N one page at offset N (unit: pages)\n" | 706 | " N one page at offset N (unit: pages)\n" |
691 | " N+M pages range from N to N+M-1\n" | 707 | " N+M pages range from N to N+M-1\n" |
692 | " N,M pages range from N to M-1\n" | 708 | " N,M pages range from N to M-1\n" |
693 | " N, pages range from N to end\n" | 709 | " N, pages range from N to end\n" |
694 | " ,M pages range from 0 to M-1\n" | 710 | " ,M pages range from 0 to M-1\n" |
695 | "bits-spec:\n" | 711 | "bits-spec:\n" |
696 | " bit1,bit2 (flags & (bit1|bit2)) != 0\n" | 712 | " bit1,bit2 (flags & (bit1|bit2)) != 0\n" |
697 | " bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" | 713 | " bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" |
698 | " bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" | 714 | " bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" |
699 | " =bit1,bit2 flags == (bit1|bit2)\n" | 715 | " =bit1,bit2 flags == (bit1|bit2)\n" |
700 | "bit-names:\n" | 716 | "bit-names:\n" |
701 | ); | 717 | ); |
702 | 718 | ||
@@ -884,13 +900,23 @@ static void parse_bits_mask(const char *optarg)
884 | add_bits_filter(mask, bits); | 900 | add_bits_filter(mask, bits); |
885 | } | 901 | } |
886 | 902 | ||
903 | static void describe_flags(const char *optarg) | ||
904 | { | ||
905 | uint64_t flags = parse_flag_names(optarg, 0); | ||
906 | |||
907 | printf("0x%016llx\t%s\t%s\n", | ||
908 | (unsigned long long)flags, | ||
909 | page_flag_name(flags), | ||
910 | page_flag_longname(flags)); | ||
911 | } | ||
887 | 912 | ||
888 | static struct option opts[] = { | 913 | static const struct option opts[] = { |
889 | { "raw" , 0, NULL, 'r' }, | 914 | { "raw" , 0, NULL, 'r' }, |
890 | { "pid" , 1, NULL, 'p' }, | 915 | { "pid" , 1, NULL, 'p' }, |
891 | { "file" , 1, NULL, 'f' }, | 916 | { "file" , 1, NULL, 'f' }, |
892 | { "addr" , 1, NULL, 'a' }, | 917 | { "addr" , 1, NULL, 'a' }, |
893 | { "bits" , 1, NULL, 'b' }, | 918 | { "bits" , 1, NULL, 'b' }, |
919 | { "describe" , 1, NULL, 'd' }, | ||
894 | { "list" , 0, NULL, 'l' }, | 920 | { "list" , 0, NULL, 'l' }, |
895 | { "list-each" , 0, NULL, 'L' }, | 921 | { "list-each" , 0, NULL, 'L' }, |
896 | { "no-summary", 0, NULL, 'N' }, | 922 | { "no-summary", 0, NULL, 'N' }, |
@@ -907,7 +933,7 @@ int main(int argc, char *argv[])
907 | page_size = getpagesize(); | 933 | page_size = getpagesize(); |
908 | 934 | ||
909 | while ((c = getopt_long(argc, argv, | 935 | while ((c = getopt_long(argc, argv, |
910 | "rp:f:a:b:lLNXxh", opts, NULL)) != -1) { | 936 | "rp:f:a:b:d:lLNXxh", opts, NULL)) != -1) { |
911 | switch (c) { | 937 | switch (c) { |
912 | case 'r': | 938 | case 'r': |
913 | opt_raw = 1; | 939 | opt_raw = 1; |
@@ -924,6 +950,9 @@ int main(int argc, char *argv[])
924 | case 'b': | 950 | case 'b': |
925 | parse_bits_mask(optarg); | 951 | parse_bits_mask(optarg); |
926 | break; | 952 | break; |
953 | case 'd': | ||
954 | describe_flags(optarg); | ||
955 | exit(0); | ||
927 | case 'l': | 956 | case 'l': |
928 | opt_list = 1; | 957 | opt_list = 1; |
929 | break; | 958 | break; |