Diffstat (limited to 'Documentation/vm/hugetlbpage.txt')
-rw-r--r-- | Documentation/vm/hugetlbpage.txt | 405 |
1 files changed, 166 insertions, 239 deletions
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 82a7bd1800b2..457634c1e03e 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt | |||
@@ -11,23 +11,21 @@ This optimization is more critical now as bigger and bigger physical memories | |||
11 | (several GBs) are more readily available. | 11 | (several GBs) are more readily available. |
12 | 12 | ||
13 | Users can use the huge page support in Linux kernel by either using the mmap | 13 | Users can use the huge page support in Linux kernel by either using the mmap |
14 | system call or standard SYSv shared memory system calls (shmget, shmat). | 14 | system call or standard SYSV shared memory system calls (shmget, shmat). |
15 | 15 | ||
16 | First the Linux kernel needs to be built with the CONFIG_HUGETLBFS | 16 | First the Linux kernel needs to be built with the CONFIG_HUGETLBFS |
17 | (present under "File systems") and CONFIG_HUGETLB_PAGE (selected | 17 | (present under "File systems") and CONFIG_HUGETLB_PAGE (selected |
18 | automatically when CONFIG_HUGETLBFS is selected) configuration | 18 | automatically when CONFIG_HUGETLBFS is selected) configuration |
19 | options. | 19 | options. |
20 | 20 | ||
21 | The kernel built with huge page support should show the number of configured | 21 | The /proc/meminfo file provides information about the total number of |
22 | huge pages in the system by running the "cat /proc/meminfo" command. | 22 | persistent hugetlb pages in the kernel's huge page pool. It also displays |
23 | information about the number of free, reserved and surplus huge pages and the | ||
24 | default huge page size. The huge page size is needed for generating the | ||
25 | proper alignment and size of the arguments to system calls that map huge page | ||
26 | regions. | ||
23 | 27 | ||
24 | /proc/meminfo also provides information about the total number of hugetlb | 28 | The output of "cat /proc/meminfo" will include lines like: |
25 | pages configured in the kernel. It also displays information about the | ||
26 | number of free hugetlb pages at any time. It also displays information about | ||
27 | the configured huge page size - this is needed for generating the proper | ||
28 | alignment and size of the arguments to the above system calls. | ||
29 | |||
30 | The output of "cat /proc/meminfo" will have lines like: | ||
31 | 29 | ||
32 | ..... | 30 | ..... |
33 | HugePages_Total: vvv | 31 | HugePages_Total: vvv |
@@ -53,59 +51,63 @@ HugePages_Surp is short for "surplus," and is the number of huge pages in | |||
53 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured | 51 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured |
54 | in the kernel. | 52 | in the kernel. |
55 | 53 | ||
56 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb | 54 | /proc/sys/vm/nr_hugepages indicates the current number of "persistent" huge |
57 | pages in the kernel. Super user can dynamically request more (or free some | 55 | pages in the kernel's huge page pool. "Persistent" huge pages will be |
58 | pre-configured) huge pages. | 56 | returned to the huge page pool when freed by a task. A user with root |
59 | The allocation (or deallocation) of hugetlb pages is possible only if there are | 57 | privileges can dynamically allocate more or free some persistent huge pages |
60 | enough physically contiguous free pages in system (freeing of huge pages is | 58 | by increasing or decreasing the value of 'nr_hugepages'. |
61 | possible only if there are enough hugetlb pages free that can be transferred | ||
62 | back to regular memory pool). | ||
63 | 59 | ||
64 | Pages that are used as hugetlb pages are reserved inside the kernel and cannot | 60 | Pages that are used as huge pages are reserved inside the kernel and cannot |
65 | be used for other purposes. | 61 | be used for other purposes. Huge pages cannot be swapped out under |
62 | memory pressure. | ||
66 | 63 | ||
67 | Once the kernel with Hugetlb page support is built and running, a user can | 64 | Once a number of huge pages have been pre-allocated to the kernel huge page |
68 | use either the mmap system call or shared memory system calls to start using | 65 | pool, a user with appropriate privilege can use either the mmap system call |
69 | the huge pages. It is required that the system administrator preallocate | 66 | or shared memory system calls to use the huge pages. See the discussion of |
70 | enough memory for huge page purposes. | 67 | Using Huge Pages, below. |
71 | 68 | ||
72 | The administrator can preallocate huge pages on the kernel boot command line by | 69 | The administrator can allocate persistent huge pages on the kernel boot |
73 | specifying the "hugepages=N" parameter, where 'N' = the number of huge pages | 70 | command line by specifying the "hugepages=N" parameter, where 'N' = the |
74 | requested. This is the most reliable method for preallocating huge pages as | 71 | number of huge pages requested. This is the most reliable method of |
75 | memory has not yet become fragmented. | 72 | allocating huge pages as memory has not yet become fragmented. |
76 | 73 | ||
77 | Some platforms support multiple huge page sizes. To preallocate huge pages | 74 | Some platforms support multiple huge page sizes. To allocate huge pages |
78 | of a specific size, one must precede the huge pages boot command parameters | 75 | of a specific size, one must precede the huge pages boot command parameters |
79 | with a huge page size selection parameter "hugepagesz=<size>". <size> must | 76 | with a huge page size selection parameter "hugepagesz=<size>". <size> must |
80 | be specified in bytes with optional scale suffix [kKmMgG]. The default huge | 77 | be specified in bytes with optional scale suffix [kKmMgG]. The default huge |
81 | page size may be selected with the "default_hugepagesz=<size>" boot parameter. | 78 | page size may be selected with the "default_hugepagesz=<size>" boot parameter. |
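
For example, on a platform that supports both 2MB and 1GB huge page sizes
(such as x86_64 with the "pdpe1gb" cpu feature), a boot command line like the
following (the sizes and counts are purely illustrative) would make 1GB the
default huge page size and pre-allocate pools of both sizes:

	default_hugepagesz=1G hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512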
82 | 79 | ||
83 | /proc/sys/vm/nr_hugepages indicates the current number of configured [default | 80 | When multiple huge page sizes are supported, /proc/sys/vm/nr_hugepages |
84 | size] hugetlb pages in the kernel. Super user can dynamically request more | 81 | indicates the current number of pre-allocated huge pages of the default size. |
85 | (or free some pre-configured) huge pages. | 82 | Thus, one can use the following command to dynamically allocate/deallocate |
86 | 83 | default sized persistent huge pages: | |
87 | Use the following command to dynamically allocate/deallocate default sized | ||
88 | huge pages: | ||
89 | 84 | ||
90 | echo 20 > /proc/sys/vm/nr_hugepages | 85 | echo 20 > /proc/sys/vm/nr_hugepages |
91 | 86 | ||
92 | This command will try to configure 20 default sized huge pages in the system. | 87 | This command will try to adjust the number of default sized huge pages in the |
88 | huge page pool to 20, allocating or freeing huge pages, as required. | ||
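
The result of the adjustment can then be confirmed by re-reading the sysctl
or /proc/meminfo, for example:

	cat /proc/sys/vm/nr_hugepages
	grep HugePages /proc/meminfo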
89 | |||
93 | On a NUMA platform, the kernel will attempt to distribute the huge page pool | 90 | On a NUMA platform, the kernel will attempt to distribute the huge page pool |
94 | over the all on-line nodes. These huge pages, allocated when nr_hugepages | 91 | over all the set of allowed nodes specified by the NUMA memory policy of the |
95 | is increased, are called "persistent huge pages". | 92 | task that modifies nr_hugepages. The default for the allowed nodes--when the |
93 | task has default memory policy--is all on-line nodes with memory. Allowed | ||
94 | nodes with insufficient available, contiguous memory for a huge page will be | ||
95 | silently skipped when allocating persistent huge pages. See the discussion | ||
96 | below of the interaction of task memory policy, cpusets and per node attributes | ||
97 | with the allocation and freeing of persistent huge pages. | ||
96 | 98 | ||
97 | The success or failure of huge page allocation depends on the amount of | 99 | The success or failure of huge page allocation depends on the amount of |
98 | physically contiguous memory that is preset in system at the time of the | 100 | physically contiguous memory that is present in the system at the time of the |
99 | allocation attempt. If the kernel is unable to allocate huge pages from | 101 | allocation attempt. If the kernel is unable to allocate huge pages from |
100 | some nodes in a NUMA system, it will attempt to make up the difference by | 102 | some nodes in a NUMA system, it will attempt to make up the difference by |
101 | allocating extra pages on other nodes with sufficient available contiguous | 103 | allocating extra pages on other nodes with sufficient available contiguous |
102 | memory, if any. | 104 | memory, if any. |
103 | 105 | ||
104 | System administrators may want to put this command in one of the local rc init | 106 | System administrators may want to put this command in one of the local rc |
105 | files. This will enable the kernel to request huge pages early in the boot | 107 | init files. This will enable the kernel to allocate huge pages early in |
106 | process when the possibility of getting physical contiguous pages is still | 108 | the boot process when the possibility of getting physically contiguous pages |
107 | very high. Administrators can verify the number of huge pages actually | 109 | is still very high. Administrators can verify the number of huge pages |
108 | allocated by checking the sysctl or meminfo. To check the per node | 110 | actually allocated by checking the sysctl or meminfo. To check the per node |
109 | distribution of huge pages in a NUMA system, use: | 111 | distribution of huge pages in a NUMA system, use: |
110 | 112 | ||
111 | cat /sys/devices/system/node/node*/meminfo | fgrep Huge | 113 | cat /sys/devices/system/node/node*/meminfo | fgrep Huge |
@@ -113,45 +115,47 @@ distribution of huge pages in a NUMA system, use: | |||
113 | /proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of | 115 | /proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of |
114 | huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are | 116 | huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are |
115 | requested by applications. Writing any non-zero value into this file | 117 | requested by applications. Writing any non-zero value into this file |
116 | indicates that the hugetlb subsystem is allowed to try to obtain "surplus" | 118 | indicates that the hugetlb subsystem is allowed to try to obtain that |
117 | huge pages from the buddy allocator, when the normal pool is exhausted. As | 119 | number of "surplus" huge pages from the kernel's normal page pool, when the |
118 | these surplus huge pages go out of use, they are freed back to the buddy | 120 | persistent huge page pool is exhausted. As these surplus huge pages become |
119 | allocator. | 121 | unused, they are freed back to the kernel's normal page pool. |
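
For example, to allow the default sized huge page pool to grow on demand by up
to 32 surplus huge pages (the value 32 is only illustrative):

	echo 32 > /proc/sys/vm/nr_overcommit_hugepages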
120 | 122 | ||
121 | When increasing the huge page pool size via nr_hugepages, any surplus | 123 | When increasing the huge page pool size via nr_hugepages, any existing surplus |
122 | pages will first be promoted to persistent huge pages. Then, additional | 124 | pages will first be promoted to persistent huge pages. Then, additional |
123 | huge pages will be allocated, if necessary and if possible, to fulfill | 125 | huge pages will be allocated, if necessary and if possible, to fulfill |
124 | the new huge page pool size. | 126 | the new persistent huge page pool size. |
125 | 127 | ||
126 | The administrator may shrink the pool of preallocated huge pages for | 128 | The administrator may shrink the pool of persistent huge pages for |
127 | the default huge page size by setting the nr_hugepages sysctl to a | 129 | the default huge page size by setting the nr_hugepages sysctl to a |
128 | smaller value. The kernel will attempt to balance the freeing of huge pages | 130 | smaller value. The kernel will attempt to balance the freeing of huge pages |
129 | across all on-line nodes. Any free huge pages on the selected nodes will | 131 | across all nodes in the memory policy of the task modifying nr_hugepages. |
130 | be freed back to the buddy allocator. | 132 | Any free huge pages on the selected nodes will be freed back to the kernel's |
131 | 133 | normal page pool. | |
132 | Caveat: Shrinking the pool via nr_hugepages such that it becomes less | 134 | |
133 | than the number of huge pages in use will convert the balance to surplus | 135 | Caveat: Shrinking the persistent huge page pool via nr_hugepages such that |
134 | huge pages even if it would exceed the overcommit value. As long as | 136 | it becomes less than the number of huge pages in use will convert the balance |
135 | this condition holds, however, no more surplus huge pages will be | 137 | of the in-use huge pages to surplus huge pages. This will occur even if |
136 | allowed on the system until one of the two sysctls are increased | 138 | the number of surplus pages it would exceed the overcommit value. As long as |
137 | sufficiently, or the surplus huge pages go out of use and are freed. | 139 | this condition holds--that is, until nr_hugepages+nr_overcommit_hugepages is |
140 | increased sufficiently, or the surplus huge pages go out of use and are freed-- | ||
141 | no more surplus huge pages will be allowed to be allocated. | ||
138 | 142 | ||
139 | With support for multiple huge page pools at run-time available, much of | 143 | With support for multiple huge page pools at run-time available, much of |
140 | the huge page userspace interface has been duplicated in sysfs. The above | 144 | the huge page userspace interface in /proc/sys/vm has been duplicated in sysfs. |
141 | information applies to the default huge page size which will be | 145 | The /proc interfaces discussed above have been retained for backwards |
142 | controlled by the /proc interfaces for backwards compatibility. The root | 146 | compatibility. The root huge page control directory in sysfs is: |
143 | huge page control directory in sysfs is: | ||
144 | 147 | ||
145 | /sys/kernel/mm/hugepages | 148 | /sys/kernel/mm/hugepages |
146 | 149 | ||
147 | For each huge page size supported by the running kernel, a subdirectory | 150 | For each huge page size supported by the running kernel, a subdirectory |
148 | will exist, of the form | 151 | will exist, of the form: |
149 | 152 | ||
150 | hugepages-${size}kB | 153 | hugepages-${size}kB |
151 | 154 | ||
152 | Inside each of these directories, the same set of files will exist: | 155 | Inside each of these directories, the same set of files will exist: |
153 | 156 | ||
154 | nr_hugepages | 157 | nr_hugepages |
158 | nr_hugepages_mempolicy | ||
155 | nr_overcommit_hugepages | 159 | nr_overcommit_hugepages |
156 | free_hugepages | 160 | free_hugepages |
157 | resv_hugepages | 161 | resv_hugepages |
@@ -159,6 +163,102 @@ Inside each of these directories, the same set of files will exist: | |||
159 | 163 | ||
160 | which function as described above for the default huge page-sized case. | 164 | which function as described above for the default huge page-sized case. |
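
For example, assuming the running kernel supports a 2MB huge page size, the
pool for that size can be examined and resized directly through its sysfs
directory, independently of the default size controlled via /proc:

	cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
	echo 20 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages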
161 | 165 | ||
166 | |||
167 | Interaction of Task Memory Policy with Huge Page Allocation/Freeing | ||
168 | |||
169 | Whether huge pages are allocated and freed via the /proc interface or | ||
170 | the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA | ||
171 | nodes from which huge pages are allocated or freed are controlled by the | ||
172 | NUMA memory policy of the task that modifies the nr_hugepages_mempolicy | ||
173 | sysctl or attribute. When the nr_hugepages attribute is used, mempolicy | ||
174 | is ignored. | ||
175 | |||
176 | The recommended method to allocate or free huge pages to/from the kernel | ||
177 | huge page pool, using the nr_hugepages example above, is: | ||
178 | |||
179 | numactl --interleave <node-list> echo 20 \ | ||
180 | >/proc/sys/vm/nr_hugepages_mempolicy | ||
181 | |||
182 | or, more succinctly: | ||
183 | |||
184 | numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy | ||
185 | |||
186 | This will allocate or free abs(20 - nr_hugepages) huge pages to or from the nodes | ||
187 | specified in <node-list>, depending on whether the number of persistent huge pages | ||
188 | is initially less than or greater than 20, respectively. No huge pages will be | ||
189 | allocated nor freed on any node not included in the specified <node-list>. | ||
190 | |||
191 | When adjusting the persistent hugepage count via nr_hugepages_mempolicy, any | ||
192 | memory policy mode--bind, preferred, local or interleave--may be used. The | ||
193 | resulting effect on persistent huge page allocation is as follows: | ||
194 | |||
195 | 1) Regardless of mempolicy mode [see Documentation/vm/numa_memory_policy.txt], | ||
196 | persistent huge pages will be distributed across the node or nodes | ||
197 | specified in the mempolicy as if "interleave" had been specified. | ||
198 | However, if a node in the policy does not contain sufficient contiguous | ||
199 | memory for a huge page, the allocation will not "fall back" to the nearest | ||
200 | neighbor node with sufficient contiguous memory. To do this would cause | ||
201 | undesirable imbalance in the distribution of the huge page pool, or | ||
202 | possibly, allocation of persistent huge pages on nodes not allowed by | ||
203 | the task's memory policy. | ||
204 | |||
205 | 2) One or more nodes may be specified with the bind or interleave policy. | ||
206 | If more than one node is specified with the preferred policy, only the | ||
207 | lowest numeric id will be used. Local policy will select the node where | ||
208 | the task is running at the time the nodes_allowed mask is constructed. | ||
209 | For local policy to be deterministic, the task must be bound to a cpu or | ||
210 | cpus in a single node. Otherwise, the task could be migrated to some | ||
211 | other node at any time after launch and the resulting node will be | ||
212 | indeterminate. Thus, local policy is not very useful for this purpose. | ||
213 | Any of the other mempolicy modes may be used to specify a single node. | ||
214 | |||
215 | 3) The nodes allowed mask will be derived from any non-default task mempolicy, | ||
216 | whether this policy was set explicitly by the task itself or one of its | ||
217 | ancestors, such as numactl. This means that if the task is invoked from a | ||
218 | shell with non-default policy, that policy will be used. One can specify a | ||
219 | node list of "all" with numactl --interleave or --membind [-m] to achieve | ||
220 | interleaving over all nodes in the system or cpuset, as shown below. | ||
221 | |||
222 | 4) Any task mempolicy specified--e.g., using numactl--will be constrained by | ||
223 | the resource limits of any cpuset in which the task runs. Thus, there will | ||
224 | be no way for a task with non-default policy running in a cpuset with a | ||
225 | subset of the system nodes to allocate huge pages outside the cpuset | ||
226 | without first moving to a cpuset that contains all of the desired nodes. | ||
227 | |||
228 | 5) Boot-time huge page allocation attempts to distribute the requested number | ||
229 | of huge pages over all on-line nodes with memory. | ||
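
As an example of 3) above, interleaving the default sized persistent huge page
pool over every node allowed by the invoking task's cpuset can be requested
with (the count of 20 is illustrative):

	numactl --interleave=all echo 20 >/proc/sys/vm/nr_hugepages_mempolicy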
230 | |||
231 | Per Node Hugepages Attributes | ||
232 | |||
233 | A subset of the contents of the root huge page control directory in sysfs, | ||
234 | described above, will be replicated under the system device of each | ||
235 | NUMA node with memory in: | ||
236 | |||
237 | /sys/devices/system/node/node[0-9]*/hugepages/ | ||
238 | |||
239 | Under this directory, the subdirectory for each supported huge page size | ||
240 | contains the following attribute files: | ||
241 | |||
242 | nr_hugepages | ||
243 | free_hugepages | ||
244 | surplus_hugepages | ||
245 | |||
246 | The "free_" and "surplus_" attribute files are read-only. They return the number | ||
247 | of free and surplus [overcommitted] huge pages, respectively, on the parent | ||
248 | node. | ||
249 | |||
250 | The nr_hugepages attribute returns the total number of huge pages on the | ||
251 | specified node. When this attribute is written, the number of persistent huge | ||
252 | pages on the parent node will be adjusted to the specified value, if sufficient | ||
253 | resources exist, regardless of the task's mempolicy or cpuset constraints. | ||
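
For example, assuming a 2MB huge page size and a node numbered 1 (both chosen
only for illustration), the persistent pool on that node alone could be set to
8 huge pages and the per node distribution then inspected with:

	echo 8 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages
	cat /sys/devices/system/node/node*/hugepages/hugepages-2048kB/nr_hugepages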
254 | |||
255 | Note that the counts of overcommit and reserve pages remain global quantities, | ||
256 | as we don't know until fault time, when the faulting task's mempolicy is | ||
257 | applied, from which node the huge page allocation will be attempted. | ||
258 | |||
259 | |||
260 | Using Huge Pages | ||
261 | |||
162 | If the user applications are going to request huge pages using mmap system | 262 | If the user applications are going to request huge pages using mmap system |
163 | call, then it is required that system administrator mount a file system of | 263 | call, then it is required that system administrator mount a file system of |
164 | type hugetlbfs: | 264 | type hugetlbfs: |
@@ -199,184 +299,11 @@ map_hugetlb.c. | |||
199 | ******************************************************************* | 299 | ******************************************************************* |
200 | 300 | ||
201 | /* | 301 | /* |
202 | * Example of using huge page memory in a user application using Sys V shared | 302 | * hugepage-shm: see Documentation/vm/hugepage-shm.c |
203 | * memory system calls. In this example the app is requesting 256MB of | ||
204 | * memory that is backed by huge pages. The application uses the flag | ||
205 | * SHM_HUGETLB in the shmget system call to inform the kernel that it is | ||
206 | * requesting huge pages. | ||
207 | * | ||
208 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for | ||
209 | * huge pages. That means the addresses starting with 0x800000... will need | ||
210 | * to be specified. Specifying a fixed address is not required on ppc64, | ||
211 | * i386 or x86_64. | ||
212 | * | ||
213 | * Note: The default shared memory limit is quite low on many kernels, | ||
214 | * you may need to increase it via: | ||
215 | * | ||
216 | * echo 268435456 > /proc/sys/kernel/shmmax | ||
217 | * | ||
218 | * This will increase the maximum size per shared memory segment to 256MB. | ||
219 | * The other limit that you will hit eventually is shmall which is the | ||
220 | * total amount of shared memory in pages. To set it to 16GB on a system | ||
221 | * with a 4kB pagesize do: | ||
222 | * | ||
223 | * echo 4194304 > /proc/sys/kernel/shmall | ||
224 | */ | 303 | */ |
225 | #include <stdlib.h> | ||
226 | #include <stdio.h> | ||
227 | #include <sys/types.h> | ||
228 | #include <sys/ipc.h> | ||
229 | #include <sys/shm.h> | ||
230 | #include <sys/mman.h> | ||
231 | |||
232 | #ifndef SHM_HUGETLB | ||
233 | #define SHM_HUGETLB 04000 | ||
234 | #endif | ||
235 | |||
236 | #define LENGTH (256UL*1024*1024) | ||
237 | |||
238 | #define dprintf(x) printf(x) | ||
239 | |||
240 | /* Only ia64 requires this */ | ||
241 | #ifdef __ia64__ | ||
242 | #define ADDR (void *)(0x8000000000000000UL) | ||
243 | #define SHMAT_FLAGS (SHM_RND) | ||
244 | #else | ||
245 | #define ADDR (void *)(0x0UL) | ||
246 | #define SHMAT_FLAGS (0) | ||
247 | #endif | ||
248 | |||
249 | int main(void) | ||
250 | { | ||
251 | int shmid; | ||
252 | unsigned long i; | ||
253 | char *shmaddr; | ||
254 | |||
255 | if ((shmid = shmget(2, LENGTH, | ||
256 | SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { | ||
257 | perror("shmget"); | ||
258 | exit(1); | ||
259 | } | ||
260 | printf("shmid: 0x%x\n", shmid); | ||
261 | |||
262 | shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); | ||
263 | if (shmaddr == (char *)-1) { | ||
264 | perror("Shared memory attach failure"); | ||
265 | shmctl(shmid, IPC_RMID, NULL); | ||
266 | exit(2); | ||
267 | } | ||
268 | printf("shmaddr: %p\n", shmaddr); | ||
269 | |||
270 | dprintf("Starting the writes:\n"); | ||
271 | for (i = 0; i < LENGTH; i++) { | ||
272 | shmaddr[i] = (char)(i); | ||
273 | if (!(i % (1024 * 1024))) | ||
274 | dprintf("."); | ||
275 | } | ||
276 | dprintf("\n"); | ||
277 | |||
278 | dprintf("Starting the Check..."); | ||
279 | for (i = 0; i < LENGTH; i++) | ||
280 | if (shmaddr[i] != (char)i) | ||
281 | printf("\nIndex %lu mismatched\n", i); | ||
282 | dprintf("Done.\n"); | ||
283 | |||
284 | if (shmdt((const void *)shmaddr) != 0) { | ||
285 | perror("Detach failure"); | ||
286 | shmctl(shmid, IPC_RMID, NULL); | ||
287 | exit(3); | ||
288 | } | ||
289 | |||
290 | shmctl(shmid, IPC_RMID, NULL); | ||
291 | |||
292 | return 0; | ||
293 | } | ||
294 | 304 | ||
295 | ******************************************************************* | 305 | ******************************************************************* |
296 | 306 | ||
297 | /* | 307 | /* |
298 | * Example of using huge page memory in a user application using the mmap | 308 | * hugepage-mmap: see Documentation/vm/hugepage-mmap.c |
299 | * system call. Before running this application, make sure that the | ||
300 | * administrator has mounted the hugetlbfs filesystem (on some directory | ||
301 | * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this | ||
302 | * example, the app is requesting memory of size 256MB that is backed by | ||
303 | * huge pages. | ||
304 | * | ||
305 | * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages. | ||
306 | * That means the addresses starting with 0x800000... will need to be | ||
307 | * specified. Specifying a fixed address is not required on ppc64, i386 | ||
308 | * or x86_64. | ||
309 | */ | 309 | */ |
310 | #include <stdlib.h> | ||
311 | #include <stdio.h> | ||
312 | #include <unistd.h> | ||
313 | #include <sys/mman.h> | ||
314 | #include <fcntl.h> | ||
315 | |||
316 | #define FILE_NAME "/mnt/hugepagefile" | ||
317 | #define LENGTH (256UL*1024*1024) | ||
318 | #define PROTECTION (PROT_READ | PROT_WRITE) | ||
319 | |||
320 | /* Only ia64 requires this */ | ||
321 | #ifdef __ia64__ | ||
322 | #define ADDR (void *)(0x8000000000000000UL) | ||
323 | #define FLAGS (MAP_SHARED | MAP_FIXED) | ||
324 | #else | ||
325 | #define ADDR (void *)(0x0UL) | ||
326 | #define FLAGS (MAP_SHARED) | ||
327 | #endif | ||
328 | |||
329 | void check_bytes(char *addr) | ||
330 | { | ||
331 | printf("First hex is %x\n", *((unsigned int *)addr)); | ||
332 | } | ||
333 | |||
334 | void write_bytes(char *addr) | ||
335 | { | ||
336 | unsigned long i; | ||
337 | |||
338 | for (i = 0; i < LENGTH; i++) | ||
339 | *(addr + i) = (char)i; | ||
340 | } | ||
341 | |||
342 | void read_bytes(char *addr) | ||
343 | { | ||
344 | unsigned long i; | ||
345 | |||
346 | check_bytes(addr); | ||
347 | for (i = 0; i < LENGTH; i++) | ||
348 | if (*(addr + i) != (char)i) { | ||
349 | printf("Mismatch at %lu\n", i); | ||
350 | break; | ||
351 | } | ||
352 | } | ||
353 | |||
354 | int main(void) | ||
355 | { | ||
356 | void *addr; | ||
357 | int fd; | ||
358 | |||
359 | fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); | ||
360 | if (fd < 0) { | ||
361 | perror("Open failed"); | ||
362 | exit(1); | ||
363 | } | ||
364 | |||
365 | addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); | ||
366 | if (addr == MAP_FAILED) { | ||
367 | perror("mmap"); | ||
368 | unlink(FILE_NAME); | ||
369 | exit(1); | ||
370 | } | ||
371 | |||
372 | printf("Returned address is %p\n", addr); | ||
373 | check_bytes(addr); | ||
374 | write_bytes(addr); | ||
375 | read_bytes(addr); | ||
376 | |||
377 | munmap(addr, LENGTH); | ||
378 | close(fd); | ||
379 | unlink(FILE_NAME); | ||
380 | |||
381 | return 0; | ||
382 | } | ||