diff options
Diffstat (limited to 'Documentation/vm/hugetlbpage.txt')
-rw-r--r-- | Documentation/vm/hugetlbpage.txt | 262 |
1 files changed, 177 insertions, 85 deletions
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 82a7bd1800b2..bc31636973e3 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt | |||
@@ -11,23 +11,21 @@ This optimization is more critical now as bigger and bigger physical memories | |||
11 | (several GBs) are more readily available. | 11 | (several GBs) are more readily available. |
12 | 12 | ||
13 | Users can use the huge page support in Linux kernel by either using the mmap | 13 | Users can use the huge page support in Linux kernel by either using the mmap |
14 | system call or standard SYSv shared memory system calls (shmget, shmat). | 14 | system call or standard SYSV shared memory system calls (shmget, shmat). |
15 | 15 | ||
16 | First the Linux kernel needs to be built with the CONFIG_HUGETLBFS | 16 | First the Linux kernel needs to be built with the CONFIG_HUGETLBFS |
17 | (present under "File systems") and CONFIG_HUGETLB_PAGE (selected | 17 | (present under "File systems") and CONFIG_HUGETLB_PAGE (selected |
18 | automatically when CONFIG_HUGETLBFS is selected) configuration | 18 | automatically when CONFIG_HUGETLBFS is selected) configuration |
19 | options. | 19 | options. |
20 | 20 | ||
21 | The kernel built with huge page support should show the number of configured | 21 | The /proc/meminfo file provides information about the total number of |
22 | huge pages in the system by running the "cat /proc/meminfo" command. | 22 | persistent hugetlb pages in the kernel's huge page pool. It also displays |
23 | information about the number of free, reserved and surplus huge pages and the | ||
24 | default huge page size. The huge page size is needed for generating the | ||
25 | proper alignment and size of the arguments to system calls that map huge page | ||
26 | regions. | ||
23 | 27 | ||
24 | /proc/meminfo also provides information about the total number of hugetlb | 28 | The output of "cat /proc/meminfo" will include lines like: |
25 | pages configured in the kernel. It also displays information about the | ||
26 | number of free hugetlb pages at any time. It also displays information about | ||
27 | the configured huge page size - this is needed for generating the proper | ||
28 | alignment and size of the arguments to the above system calls. | ||
29 | |||
30 | The output of "cat /proc/meminfo" will have lines like: | ||
31 | 29 | ||
32 | ..... | 30 | ..... |
33 | HugePages_Total: vvv | 31 | HugePages_Total: vvv |
@@ -53,59 +51,63 @@ HugePages_Surp is short for "surplus," and is the number of huge pages in | |||
53 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured | 51 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured |
54 | in the kernel. | 52 | in the kernel. |
55 | 53 | ||
56 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb | 54 | /proc/sys/vm/nr_hugepages indicates the current number of "persistent" huge |
57 | pages in the kernel. Super user can dynamically request more (or free some | 55 | pages in the kernel's huge page pool. "Persistent" huge pages will be |
58 | pre-configured) huge pages. | 56 | returned to the huge page pool when freed by a task. A user with root |
59 | The allocation (or deallocation) of hugetlb pages is possible only if there are | 57 | privileges can dynamically allocate more or free some persistent huge pages |
60 | enough physically contiguous free pages in system (freeing of huge pages is | 58 | by increasing or decreasing the value of 'nr_hugepages'. |
61 | possible only if there are enough hugetlb pages free that can be transferred | ||
62 | back to regular memory pool). | ||
63 | 59 | ||
64 | Pages that are used as hugetlb pages are reserved inside the kernel and cannot | 60 | Pages that are used as huge pages are reserved inside the kernel and cannot |
65 | be used for other purposes. | 61 | be used for other purposes. Huge pages cannot be swapped out under |
62 | memory pressure. | ||
66 | 63 | ||
67 | Once the kernel with Hugetlb page support is built and running, a user can | 64 | Once a number of huge pages have been pre-allocated to the kernel huge page |
68 | use either the mmap system call or shared memory system calls to start using | 65 | pool, a user with appropriate privilege can use either the mmap system call |
69 | the huge pages. It is required that the system administrator preallocate | 66 | or shared memory system calls to use the huge pages. See the discussion of |
70 | enough memory for huge page purposes. | 67 | Using Huge Pages, below. |
71 | 68 | ||
72 | The administrator can preallocate huge pages on the kernel boot command line by | 69 | The administrator can allocate persistent huge pages on the kernel boot |
73 | specifying the "hugepages=N" parameter, where 'N' = the number of huge pages | 70 | command line by specifying the "hugepages=N" parameter, where 'N' = the |
74 | requested. This is the most reliable method for preallocating huge pages as | 71 | number of huge pages requested. This is the most reliable method of |
75 | memory has not yet become fragmented. | 72 | allocating huge pages as memory has not yet become fragmented. |
76 | 73 | ||
77 | Some platforms support multiple huge page sizes. To preallocate huge pages | 74 | Some platforms support multiple huge page sizes. To allocate huge pages |
78 | of a specific size, one must precede the huge pages boot command parameters | 75 | of a specific size, one must precede the huge pages boot command parameters |
79 | with a huge page size selection parameter "hugepagesz=<size>". <size> must | 76 | with a huge page size selection parameter "hugepagesz=<size>". <size> must |
80 | be specified in bytes with optional scale suffix [kKmMgG]. The default huge | 77 | be specified in bytes with optional scale suffix [kKmMgG]. The default huge |
81 | page size may be selected with the "default_hugepagesz=<size>" boot parameter. | 78 | page size may be selected with the "default_hugepagesz=<size>" boot parameter. |
82 | 79 | ||
83 | /proc/sys/vm/nr_hugepages indicates the current number of configured [default | 80 | When multiple huge page sizes are supported, /proc/sys/vm/nr_hugepages |
84 | size] hugetlb pages in the kernel. Super user can dynamically request more | 81 | indicates the current number of pre-allocated huge pages of the default size. |
85 | (or free some pre-configured) huge pages. | 82 | Thus, one can use the following command to dynamically allocate/deallocate |
86 | 83 | default sized persistent huge pages: | |
87 | Use the following command to dynamically allocate/deallocate default sized | ||
88 | huge pages: | ||
89 | 84 | ||
90 | echo 20 > /proc/sys/vm/nr_hugepages | 85 | echo 20 > /proc/sys/vm/nr_hugepages |
91 | 86 | ||
92 | This command will try to configure 20 default sized huge pages in the system. | 87 | This command will try to adjust the number of default sized huge pages in the |
88 | huge page pool to 20, allocating or freeing huge pages, as required. | ||
89 | |||
93 | On a NUMA platform, the kernel will attempt to distribute the huge page pool | 90 | On a NUMA platform, the kernel will attempt to distribute the huge page pool |
94 | over the all on-line nodes. These huge pages, allocated when nr_hugepages | 91 | over the set of allowed nodes specified by the NUMA memory policy of the |
95 | is increased, are called "persistent huge pages". | 92 | task that modifies nr_hugepages. The default for the allowed nodes--when the |
93 | task has default memory policy--is all on-line nodes with memory. Allowed | ||
94 | nodes with insufficient available, contiguous memory for a huge page will be | ||
95 | silently skipped when allocating persistent huge pages. See the discussion | ||
96 | below of the interaction of task memory policy, cpusets and per node attributes | ||
97 | with the allocation and freeing of persistent huge pages. | ||
96 | 98 | ||
97 | The success or failure of huge page allocation depends on the amount of | 99 | The success or failure of huge page allocation depends on the amount of |
98 | physically contiguous memory that is preset in system at the time of the | 100 | physically contiguous memory that is present in system at the time of the |
99 | allocation attempt. If the kernel is unable to allocate huge pages from | 101 | allocation attempt. If the kernel is unable to allocate huge pages from |
100 | some nodes in a NUMA system, it will attempt to make up the difference by | 102 | some nodes in a NUMA system, it will attempt to make up the difference by |
101 | allocating extra pages on other nodes with sufficient available contiguous | 103 | allocating extra pages on other nodes with sufficient available contiguous |
102 | memory, if any. | 104 | memory, if any. |
103 | 105 | ||
104 | System administrators may want to put this command in one of the local rc init | 106 | System administrators may want to put this command in one of the local rc |
105 | files. This will enable the kernel to request huge pages early in the boot | 107 | init files. This will enable the kernel to allocate huge pages early in |
106 | process when the possibility of getting physical contiguous pages is still | 108 | the boot process when the possibility of getting physical contiguous pages |
107 | very high. Administrators can verify the number of huge pages actually | 109 | is still very high. Administrators can verify the number of huge pages |
108 | allocated by checking the sysctl or meminfo. To check the per node | 110 | actually allocated by checking the sysctl or meminfo. To check the per node |
109 | distribution of huge pages in a NUMA system, use: | 111 | distribution of huge pages in a NUMA system, use: |
110 | 112 | ||
111 | cat /sys/devices/system/node/node*/meminfo | fgrep Huge | 113 | cat /sys/devices/system/node/node*/meminfo | fgrep Huge |
@@ -113,45 +115,47 @@ distribution of huge pages in a NUMA system, use: | |||
113 | /proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of | 115 | /proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of |
114 | huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are | 116 | huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are |
115 | requested by applications. Writing any non-zero value into this file | 117 | requested by applications. Writing any non-zero value into this file |
116 | indicates that the hugetlb subsystem is allowed to try to obtain "surplus" | 118 | indicates that the hugetlb subsystem is allowed to try to obtain that |
117 | huge pages from the buddy allocator, when the normal pool is exhausted. As | 119 | number of "surplus" huge pages from the kernel's normal page pool, when the |
118 | these surplus huge pages go out of use, they are freed back to the buddy | 120 | persistent huge page pool is exhausted. As these surplus huge pages become |
119 | allocator. | 121 | unused, they are freed back to the kernel's normal page pool. |
120 | 122 | ||
121 | When increasing the huge page pool size via nr_hugepages, any surplus | 123 | When increasing the huge page pool size via nr_hugepages, any existing surplus |
122 | pages will first be promoted to persistent huge pages. Then, additional | 124 | pages will first be promoted to persistent huge pages. Then, additional |
123 | huge pages will be allocated, if necessary and if possible, to fulfill | 125 | huge pages will be allocated, if necessary and if possible, to fulfill |
124 | the new huge page pool size. | 126 | the new persistent huge page pool size. |
125 | 127 | ||
126 | The administrator may shrink the pool of preallocated huge pages for | 128 | The administrator may shrink the pool of persistent huge pages for |
127 | the default huge page size by setting the nr_hugepages sysctl to a | 129 | the default huge page size by setting the nr_hugepages sysctl to a |
128 | smaller value. The kernel will attempt to balance the freeing of huge pages | 130 | smaller value. The kernel will attempt to balance the freeing of huge pages |
129 | across all on-line nodes. Any free huge pages on the selected nodes will | 131 | across all nodes in the memory policy of the task modifying nr_hugepages. |
130 | be freed back to the buddy allocator. | 132 | Any free huge pages on the selected nodes will be freed back to the kernel's |
131 | 133 | normal page pool. | |
132 | Caveat: Shrinking the pool via nr_hugepages such that it becomes less | 134 | |
133 | than the number of huge pages in use will convert the balance to surplus | 135 | Caveat: Shrinking the persistent huge page pool via nr_hugepages such that |
134 | huge pages even if it would exceed the overcommit value. As long as | 136 | it becomes less than the number of huge pages in use will convert the balance |
135 | this condition holds, however, no more surplus huge pages will be | 137 | of the in-use huge pages to surplus huge pages. This will occur even if |
136 | allowed on the system until one of the two sysctls are increased | 138 | the number of surplus pages would exceed the overcommit value. As long as |
137 | sufficiently, or the surplus huge pages go out of use and are freed. | 139 | this condition holds--that is, until nr_hugepages+nr_overcommit_hugepages is |
140 | increased sufficiently, or the surplus huge pages go out of use and are freed-- | ||
141 | no more surplus huge pages will be allowed to be allocated. | ||
138 | 142 | ||
139 | With support for multiple huge page pools at run-time available, much of | 143 | With support for multiple huge page pools at run-time available, much of |
140 | the huge page userspace interface has been duplicated in sysfs. The above | 144 | the huge page userspace interface in /proc/sys/vm has been duplicated in sysfs. |
141 | information applies to the default huge page size which will be | 145 | The /proc interfaces discussed above have been retained for backwards |
142 | controlled by the /proc interfaces for backwards compatibility. The root | 146 | compatibility. The root huge page control directory in sysfs is: |
143 | huge page control directory in sysfs is: | ||
144 | 147 | ||
145 | /sys/kernel/mm/hugepages | 148 | /sys/kernel/mm/hugepages |
146 | 149 | ||
147 | For each huge page size supported by the running kernel, a subdirectory | 150 | For each huge page size supported by the running kernel, a subdirectory |
148 | will exist, of the form | 151 | will exist, of the form: |
149 | 152 | ||
150 | hugepages-${size}kB | 153 | hugepages-${size}kB |
151 | 154 | ||
152 | Inside each of these directories, the same set of files will exist: | 155 | Inside each of these directories, the same set of files will exist: |
153 | 156 | ||
154 | nr_hugepages | 157 | nr_hugepages |
158 | nr_hugepages_mempolicy | ||
155 | nr_overcommit_hugepages | 159 | nr_overcommit_hugepages |
156 | free_hugepages | 160 | free_hugepages |
157 | resv_hugepages | 161 | resv_hugepages |
@@ -159,6 +163,102 @@ Inside each of these directories, the same set of files will exist: | |||
159 | 163 | ||
160 | which function as described above for the default huge page-sized case. | 164 | which function as described above for the default huge page-sized case. |
161 | 165 | ||
166 | |||
167 | Interaction of Task Memory Policy with Huge Page Allocation/Freeing | ||
168 | |||
169 | Whether huge pages are allocated and freed via the /proc interface or | ||
170 | the sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA | ||
171 | nodes from which huge pages are allocated or freed are controlled by the | ||
172 | NUMA memory policy of the task that modifies the nr_hugepages_mempolicy | ||
173 | sysctl or attribute. When the nr_hugepages attribute is used, mempolicy | ||
174 | is ignored. | ||
175 | |||
176 | The recommended method to allocate or free huge pages to/from the kernel | ||
177 | huge page pool, using the nr_hugepages example above, is: | ||
178 | |||
179 | numactl --interleave <node-list> echo 20 \ | ||
180 | >/proc/sys/vm/nr_hugepages_mempolicy | ||
181 | |||
182 | or, more succinctly: | ||
183 | |||
184 | numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy | ||
185 | |||
186 | This will allocate or free abs(20 - nr_hugepages) huge pages to or from the nodes | ||
187 | specified in <node-list>, depending on whether the number of persistent huge pages | ||
188 | is initially less than or greater than 20, respectively. No huge pages will be | ||
189 | allocated nor freed on any node not included in the specified <node-list>. | ||
190 | |||
191 | When adjusting the persistent hugepage count via nr_hugepages_mempolicy, any | ||
192 | memory policy mode--bind, preferred, local or interleave--may be used. The | ||
193 | resulting effect on persistent huge page allocation is as follows: | ||
194 | |||
195 | 1) Regardless of mempolicy mode [see Documentation/vm/numa_memory_policy.txt], | ||
196 | persistent huge pages will be distributed across the node or nodes | ||
197 | specified in the mempolicy as if "interleave" had been specified. | ||
198 | However, if a node in the policy does not contain sufficient contiguous | ||
199 | memory for a huge page, the allocation will not "fallback" to the nearest | ||
200 | neighbor node with sufficient contiguous memory. To do this would cause | ||
201 | undesirable imbalance in the distribution of the huge page pool, or | ||
202 | possibly, allocation of persistent huge pages on nodes not allowed by | ||
203 | the task's memory policy. | ||
204 | |||
205 | 2) One or more nodes may be specified with the bind or interleave policy. | ||
206 | If more than one node is specified with the preferred policy, only the | ||
207 | lowest numeric id will be used. Local policy will select the node where | ||
208 | the task is running at the time the nodes_allowed mask is constructed. | ||
209 | For local policy to be deterministic, the task must be bound to a cpu or | ||
210 | cpus in a single node. Otherwise, the task could be migrated to some | ||
211 | other node at any time after launch and the resulting node will be | ||
212 | indeterminate. Thus, local policy is not very useful for this purpose. | ||
213 | Any of the other mempolicy modes may be used to specify a single node. | ||
214 | |||
215 | 3) The nodes allowed mask will be derived from any non-default task mempolicy, | ||
216 | whether this policy was set explicitly by the task itself or one of its | ||
217 | ancestors, such as numactl. This means that if the task is invoked from a | ||
218 | shell with non-default policy, that policy will be used. One can specify a | ||
219 | node list of "all" with numactl --interleave or --membind [-m] to achieve | ||
220 | interleaving over all nodes in the system or cpuset. | ||
221 | |||
222 | 4) Any task mempolicy specified--e.g., using numactl--will be constrained by | ||
223 | the resource limits of any cpuset in which the task runs. Thus, there will | ||
224 | be no way for a task with non-default policy running in a cpuset with a | ||
225 | subset of the system nodes to allocate huge pages outside the cpuset | ||
226 | without first moving to a cpuset that contains all of the desired nodes. | ||
227 | |||
228 | 5) Boot-time huge page allocation attempts to distribute the requested number | ||
229 | of huge pages over all on-line nodes with memory. | ||
230 | |||
231 | Per Node Hugepages Attributes | ||
232 | |||
233 | A subset of the contents of the root huge page control directory in sysfs, | ||
234 | described above, will be replicated under the system device of each | ||
235 | NUMA node with memory in: | ||
236 | |||
237 | /sys/devices/system/node/node[0-9]*/hugepages/ | ||
238 | |||
239 | Under this directory, the subdirectory for each supported huge page size | ||
240 | contains the following attribute files: | ||
241 | |||
242 | nr_hugepages | ||
243 | free_hugepages | ||
244 | surplus_hugepages | ||
245 | |||
246 | The free_hugepages and surplus_hugepages attribute files are read-only. They return the number | ||
247 | of free and surplus [overcommitted] huge pages, respectively, on the parent | ||
248 | node. | ||
249 | |||
250 | The nr_hugepages attribute returns the total number of huge pages on the | ||
251 | specified node. When this attribute is written, the number of persistent huge | ||
252 | pages on the parent node will be adjusted to the specified value, if sufficient | ||
253 | resources exist, regardless of the task's mempolicy or cpuset constraints. | ||
254 | |||
255 | Note that the number of overcommit and reserve pages remain global quantities, | ||
256 | as we don't know until fault time, when the faulting task's mempolicy is | ||
257 | applied, from which node the huge page allocation will be attempted. | ||
258 | |||
259 | |||
260 | Using Huge Pages | ||
261 | |||
162 | If the user applications are going to request huge pages using mmap system | 262 | If the user applications are going to request huge pages using mmap system |
163 | call, then it is required that system administrator mount a file system of | 263 | call, then it is required that system administrator mount a file system of |
164 | type hugetlbfs: | 264 | type hugetlbfs: |
@@ -206,9 +306,11 @@ map_hugetlb.c. | |||
206 | * requesting huge pages. | 306 | * requesting huge pages. |
207 | * | 307 | * |
208 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for | 308 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for |
209 | * huge pages. That means the addresses starting with 0x800000... will need | 309 | * huge pages. That means that if one requires a fixed address, a huge page |
210 | * to be specified. Specifying a fixed address is not required on ppc64, | 310 | * aligned address starting with 0x800000... will be required. If a fixed |
211 | * i386 or x86_64. | 311 | * address is not required, the kernel will select an address in the proper |
312 | * range. | ||
313 | * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. | ||
212 | * | 314 | * |
213 | * Note: The default shared memory limit is quite low on many kernels, | 315 | * Note: The default shared memory limit is quite low on many kernels, |
214 | * you may need to increase it via: | 316 | * you may need to increase it via: |
@@ -237,14 +339,8 @@ map_hugetlb.c. | |||
237 | 339 | ||
238 | #define dprintf(x) printf(x) | 340 | #define dprintf(x) printf(x) |
239 | 341 | ||
240 | /* Only ia64 requires this */ | 342 | #define ADDR (void *)(0x0UL) /* let kernel choose address */ |
241 | #ifdef __ia64__ | ||
242 | #define ADDR (void *)(0x8000000000000000UL) | ||
243 | #define SHMAT_FLAGS (SHM_RND) | ||
244 | #else | ||
245 | #define ADDR (void *)(0x0UL) | ||
246 | #define SHMAT_FLAGS (0) | 343 | #define SHMAT_FLAGS (0) |
247 | #endif | ||
248 | 344 | ||
249 | int main(void) | 345 | int main(void) |
250 | { | 346 | { |
@@ -302,10 +398,12 @@ int main(void) | |||
302 | * example, the app is requesting memory of size 256MB that is backed by | 398 | * example, the app is requesting memory of size 256MB that is backed by |
303 | * huge pages. | 399 | * huge pages. |
304 | * | 400 | * |
305 | * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages. | 401 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for |
306 | * That means the addresses starting with 0x800000... will need to be | 402 | * huge pages. That means that if one requires a fixed address, a huge page |
307 | * specified. Specifying a fixed address is not required on ppc64, i386 | 403 | * aligned address starting with 0x800000... will be required. If a fixed |
308 | * or x86_64. | 404 | * address is not required, the kernel will select an address in the proper |
405 | * range. | ||
406 | * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. | ||
309 | */ | 407 | */ |
310 | #include <stdlib.h> | 408 | #include <stdlib.h> |
311 | #include <stdio.h> | 409 | #include <stdio.h> |
@@ -317,14 +415,8 @@ int main(void) | |||
317 | #define LENGTH (256UL*1024*1024) | 415 | #define LENGTH (256UL*1024*1024) |
318 | #define PROTECTION (PROT_READ | PROT_WRITE) | 416 | #define PROTECTION (PROT_READ | PROT_WRITE) |
319 | 417 | ||
320 | /* Only ia64 requires this */ | 418 | #define ADDR (void *)(0x0UL) /* let kernel choose address */ |
321 | #ifdef __ia64__ | ||
322 | #define ADDR (void *)(0x8000000000000000UL) | ||
323 | #define FLAGS (MAP_SHARED | MAP_FIXED) | ||
324 | #else | ||
325 | #define ADDR (void *)(0x0UL) | ||
326 | #define FLAGS (MAP_SHARED) | 419 | #define FLAGS (MAP_SHARED) |
327 | #endif | ||
328 | 420 | ||
329 | void check_bytes(char *addr) | 421 | void check_bytes(char *addr) |
330 | { | 422 | { |