Diffstat (limited to 'Documentation/vm')

-rw-r--r--  Documentation/vm/00-INDEX                |  16
-rw-r--r--  Documentation/vm/Makefile                |   2
-rw-r--r--  Documentation/vm/hugepage-mmap.c         |  91
-rw-r--r--  Documentation/vm/hugepage-shm.c          |  98
-rw-r--r--  Documentation/vm/hugetlbpage.txt         | 169
-rw-r--r--  Documentation/vm/map_hugetlb.c           |   8
-rw-r--r--  Documentation/vm/numa                    | 186
-rw-r--r--  Documentation/vm/numa_memory_policy.txt  |   4
-rw-r--r--  Documentation/vm/slub.txt                |   1

9 files changed, 360 insertions, 215 deletions
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index e57d6a9dd32b..dca82d7c83d8 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -4,23 +4,35 @@ active_mm.txt
 	- An explanation from Linus about tsk->active_mm vs tsk->mm.
 balance
 	- various information on memory balancing.
+hugepage-mmap.c
+	- Example app using huge page memory with the mmap system call.
+hugepage-shm.c
+	- Example app using huge page memory with Sys V shared memory system calls.
 hugetlbpage.txt
 	- a brief summary of hugetlbpage support in the Linux kernel.
+hwpoison.txt
+	- explains what hwpoison is
 ksm.txt
 	- how to use the Kernel Samepage Merging feature.
 locking
 	- info on how locking and synchronization is done in the Linux vm code.
+map_hugetlb.c
+	- an example program that uses the MAP_HUGETLB mmap flag.
 numa
 	- information about NUMA specific code in the Linux vm.
 numa_memory_policy.txt
 	- documentation of concepts and APIs of the 2.6 memory policy support.
 overcommit-accounting
 	- description of the Linux kernels overcommit handling modes.
+page-types.c
+	- Tool for querying page flags
 page_migration
 	- description of page migration in NUMA systems.
+pagemap.txt
+	- pagemap, from the userspace perspective
 slabinfo.c
 	- source code for a tool to get reports about slabs.
 slub.txt
 	- a short users guide for SLUB.
-map_hugetlb.c
-	- an example program that uses the MAP_HUGETLB mmap flag.
+unevictable-lru.txt
+	- Unevictable LRU infrastructure
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile
index 5bd269b3731a..9dcff328b964 100644
--- a/Documentation/vm/Makefile
+++ b/Documentation/vm/Makefile
@@ -2,7 +2,7 @@
 obj- := dummy.o
 
 # List of programs to build
-hostprogs-y := slabinfo page-types
+hostprogs-y := slabinfo page-types hugepage-mmap hugepage-shm map_hugetlb
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
diff --git a/Documentation/vm/hugepage-mmap.c b/Documentation/vm/hugepage-mmap.c
new file mode 100644
index 000000000000..db0dd9a33d54
--- /dev/null
+++ b/Documentation/vm/hugepage-mmap.c
@@ -0,0 +1,91 @@
+/*
+ * hugepage-mmap:
+ *
+ * Example of using huge page memory in a user application using the mmap
+ * system call.  Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define FILE_NAME "/mnt/hugepagefile"
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_SHARED | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_SHARED)
+#endif
+
+static void check_bytes(char *addr)
+{
+	printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < LENGTH; i++)
+		*(addr + i) = (char)i;
+}
+
+static void read_bytes(char *addr)
+{
+	unsigned long i;
+
+	check_bytes(addr);
+	for (i = 0; i < LENGTH; i++)
+		if (*(addr + i) != (char)i) {
+			printf("Mismatch at %lu\n", i);
+			break;
+		}
+}
+
+int main(void)
+{
+	void *addr;
+	int fd;
+
+	fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
+	if (fd < 0) {
+		perror("Open failed");
+		exit(1);
+	}
+
+	addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		unlink(FILE_NAME);
+		exit(1);
+	}
+
+	printf("Returned address is %p\n", addr);
+	check_bytes(addr);
+	write_bytes(addr);
+	read_bytes(addr);
+
+	munmap(addr, LENGTH);
+	close(fd);
+	unlink(FILE_NAME);
+
+	return 0;
+}
diff --git a/Documentation/vm/hugepage-shm.c b/Documentation/vm/hugepage-shm.c
new file mode 100644
index 000000000000..07956d8592c9
--- /dev/null
+++ b/Documentation/vm/hugepage-shm.c
@@ -0,0 +1,98 @@
+/*
+ * hugepage-shm:
+ *
+ * Example of using huge page memory in a user application using Sys V shared
+ * memory system calls.  In this example the app is requesting 256MB of
+ * memory that is backed by huge pages.  The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages.  That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required.  If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ *
+ * Note: The default shared memory limit is quite low on many kernels,
+ * you may need to increase it via:
+ *
+ * echo 268435456 > /proc/sys/kernel/shmmax
+ *
+ * This will increase the maximum size per shared memory segment to 256MB.
+ * The other limit that you will hit eventually is shmall which is the
+ * total amount of shared memory in pages. To set it to 16GB on a system
+ * with a 4kB pagesize do:
+ *
+ * echo 4194304 > /proc/sys/kernel/shmall
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+#define LENGTH (256UL*1024*1024)
+
+#define dprintf(x)  printf(x)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define SHMAT_FLAGS (SHM_RND)
+#else
+#define ADDR (void *)(0x0UL)
+#define SHMAT_FLAGS (0)
+#endif
+
+int main(void)
+{
+	int shmid;
+	unsigned long i;
+	char *shmaddr;
+
+	if ((shmid = shmget(2, LENGTH,
+			    SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
+		perror("shmget");
+		exit(1);
+	}
+	printf("shmid: 0x%x\n", shmid);
+
+	shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
+	if (shmaddr == (char *)-1) {
+		perror("Shared memory attach failure");
+		shmctl(shmid, IPC_RMID, NULL);
+		exit(2);
+	}
+	printf("shmaddr: %p\n", shmaddr);
+
+	dprintf("Starting the writes:\n");
+	for (i = 0; i < LENGTH; i++) {
+		shmaddr[i] = (char)(i);
+		if (!(i % (1024 * 1024)))
+			dprintf(".");
+	}
+	dprintf("\n");
+
+	dprintf("Starting the Check...");
+	for (i = 0; i < LENGTH; i++)
+		if (shmaddr[i] != (char)i)
+			printf("\nIndex %lu mismatched\n", i);
+	dprintf("Done.\n");
+
+	if (shmdt((const void *)shmaddr) != 0) {
+		perror("Detach failure");
+		shmctl(shmid, IPC_RMID, NULL);
+		exit(3);
+	}
+
+	shmctl(shmid, IPC_RMID, NULL);
+
+	return 0;
+}
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index bc31636973e3..457634c1e03e 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -299,176 +299,11 @@ map_hugetlb.c.
 *******************************************************************
 
 /*
- * Example of using huge page memory in a user application using Sys V shared
- * memory system calls.  In this example the app is requesting 256MB of
- * memory that is backed by huge pages.  The application uses the flag
- * SHM_HUGETLB in the shmget system call to inform the kernel that it is
- * requesting huge pages.
- *
- * For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * huge pages.  That means that if one requires a fixed address, a huge page
- * aligned address starting with 0x800000... will be required.  If a fixed
- * address is not required, the kernel will select an address in the proper
- * range.
- * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
- *
- * Note: The default shared memory limit is quite low on many kernels,
- * you may need to increase it via:
- *
- * echo 268435456 > /proc/sys/kernel/shmmax
- *
- * This will increase the maximum size per shared memory segment to 256MB.
- * The other limit that you will hit eventually is shmall which is the
- * total amount of shared memory in pages. To set it to 16GB on a system
- * with a 4kB pagesize do:
- *
- * echo 4194304 > /proc/sys/kernel/shmall
+ * hugepage-shm: see Documentation/vm/hugepage-shm.c
  */
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/ipc.h>
-#include <sys/shm.h>
-#include <sys/mman.h>
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
-#define LENGTH (256UL*1024*1024)
-
-#define dprintf(x)  printf(x)
-
-#define ADDR (void *)(0x0UL)	/* let kernel choose address */
-#define SHMAT_FLAGS (0)
-
-int main(void)
-{
-	int shmid;
-	unsigned long i;
-	char *shmaddr;
-
-	if ((shmid = shmget(2, LENGTH,
-			    SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
-		perror("shmget");
-		exit(1);
-	}
-	printf("shmid: 0x%x\n", shmid);
-
-	shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS);
-	if (shmaddr == (char *)-1) {
-		perror("Shared memory attach failure");
-		shmctl(shmid, IPC_RMID, NULL);
-		exit(2);
-	}
-	printf("shmaddr: %p\n", shmaddr);
-
-	dprintf("Starting the writes:\n");
-	for (i = 0; i < LENGTH; i++) {
-		shmaddr[i] = (char)(i);
-		if (!(i % (1024 * 1024)))
-			dprintf(".");
-	}
-	dprintf("\n");
-
-	dprintf("Starting the Check...");
-	for (i = 0; i < LENGTH; i++)
-		if (shmaddr[i] != (char)i)
-			printf("\nIndex %lu mismatched\n", i);
-	dprintf("Done.\n");
-
-	if (shmdt((const void *)shmaddr) != 0) {
-		perror("Detach failure");
-		shmctl(shmid, IPC_RMID, NULL);
-		exit(3);
-	}
-
-	shmctl(shmid, IPC_RMID, NULL);
-
-	return 0;
-}
 
 *******************************************************************
 
 /*
- * Example of using huge page memory in a user application using the mmap
- * system call.  Before running this application, make sure that the
- * administrator has mounted the hugetlbfs filesystem (on some directory
- * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
- * example, the app is requesting memory of size 256MB that is backed by
- * huge pages.
- *
- * For the ia64 architecture, the Linux kernel reserves Region number 4 for
- * huge pages.  That means that if one requires a fixed address, a huge page
- * aligned address starting with 0x800000... will be required.  If a fixed
- * address is not required, the kernel will select an address in the proper
- * range.
- * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ * hugepage-mmap: see Documentation/vm/hugepage-mmap.c
  */
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-
-#define FILE_NAME "/mnt/hugepagefile"
-#define LENGTH (256UL*1024*1024)
-#define PROTECTION (PROT_READ | PROT_WRITE)
-
-#define ADDR (void *)(0x0UL)	/* let kernel choose address */
-#define FLAGS (MAP_SHARED)
-
-void check_bytes(char *addr)
-{
-	printf("First hex is %x\n", *((unsigned int *)addr));
-}
-
-void write_bytes(char *addr)
-{
-	unsigned long i;
-
-	for (i = 0; i < LENGTH; i++)
-		*(addr + i) = (char)i;
-}
-
-void read_bytes(char *addr)
-{
-	unsigned long i;
-
-	check_bytes(addr);
-	for (i = 0; i < LENGTH; i++)
-		if (*(addr + i) != (char)i) {
-			printf("Mismatch at %lu\n", i);
-			break;
-		}
-}
-
-int main(void)
-{
-	void *addr;
-	int fd;
-
-	fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
-	if (fd < 0) {
-		perror("Open failed");
-		exit(1);
-	}
-
-	addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
-	if (addr == MAP_FAILED) {
-		perror("mmap");
-		unlink(FILE_NAME);
-		exit(1);
-	}
-
-	printf("Returned address is %p\n", addr);
-	check_bytes(addr);
-	write_bytes(addr);
-	read_bytes(addr);
-
-	munmap(addr, LENGTH);
-	close(fd);
-	unlink(FILE_NAME);
-
-	return 0;
-}
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c
index e2bdae37f499..eda1a6d3578a 100644
--- a/Documentation/vm/map_hugetlb.c
+++ b/Documentation/vm/map_hugetlb.c
@@ -19,7 +19,7 @@
 #define PROTECTION (PROT_READ | PROT_WRITE)
 
 #ifndef MAP_HUGETLB
-#define MAP_HUGETLB 0x40
+#define MAP_HUGETLB 0x40000 /* arch specific */
 #endif
 
 /* Only ia64 requires this */
@@ -31,12 +31,12 @@
 #define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
 #endif
 
-void check_bytes(char *addr)
+static void check_bytes(char *addr)
 {
 	printf("First hex is %x\n", *((unsigned int *)addr));
 }
 
-void write_bytes(char *addr)
+static void write_bytes(char *addr)
 {
 	unsigned long i;
 
@@ -44,7 +44,7 @@ void write_bytes(char *addr)
 	*(addr + i) = (char)i;
 }
 
-void read_bytes(char *addr)
+static void read_bytes(char *addr)
 {
 	unsigned long i;
 
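A note on usage, not part of the patch: the MAP_HUGETLB value adjusted above is the fallback definition map_hugetlb.c uses when the libc headers do not provide the flag. A minimal, hypothetical sketch of the same anonymous huge-page mapping pattern, assuming huge pages have already been reserved (for example through /proc/sys/vm/nr_hugepages) and that the running kernel accepts MAP_HUGETLB, might look like this:

/* Hypothetical sketch, not from the patch: an anonymous mapping backed by
 * huge pages via MAP_HUGETLB.  Assumes huge pages were reserved first,
 * e.g. echo 20 > /proc/sys/vm/nr_hugepages.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000	/* arch specific, as noted in map_hugetlb.c */
#endif

#define LENGTH (256UL*1024*1024)

int main(void)
{
	void *addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

	if (addr == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	printf("Mapped %lu bytes of huge-page-backed memory at %p\n",
	       LENGTH, addr);
	munmap(addr, LENGTH);
	return 0;
}

Passing fd = -1 with MAP_ANONYMOUS is what distinguishes this from the hugetlbfs-file approach used by hugepage-mmap.c above.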
diff --git a/Documentation/vm/numa b/Documentation/vm/numa
index e93ad9425e2a..a200a386429d 100644
--- a/Documentation/vm/numa
+++ b/Documentation/vm/numa
@@ -1,41 +1,149 @@
 Started Nov 1999 by Kanoj Sarcar <kanoj@sgi.com>
 
-The intent of this file is to have an uptodate, running commentary
-from different people about NUMA specific code in the Linux vm.
-
-What is NUMA? It is an architecture where the memory access times
-for different regions of memory from a given processor varies
-according to the "distance" of the memory region from the processor.
-Each region of memory to which access times are the same from any
-cpu, is called a node. On such architectures, it is beneficial if
-the kernel tries to minimize inter node communications. Schemes
-for this range from kernel text and read-only data replication
-across nodes, and trying to house all the data structures that
-key components of the kernel need on memory on that node.
-
-Currently, all the numa support is to provide efficient handling
-of widely discontiguous physical memory, so architectures which
-are not NUMA but can have huge holes in the physical address space
-can use the same code. All this code is bracketed by CONFIG_DISCONTIGMEM.
-
-The initial port includes NUMAizing the bootmem allocator code by
-encapsulating all the pieces of information into a bootmem_data_t
-structure. Node specific calls have been added to the allocator.
-In theory, any platform which uses the bootmem allocator should
-be able to put the bootmem and mem_map data structures anywhere
-it deems best.
-
-Each node's page allocation data structures have also been encapsulated
-into a pg_data_t. The bootmem_data_t is just one part of this. To
-make the code look uniform between NUMA and regular UMA platforms,
-UMA platforms have a statically allocated pg_data_t too (contig_page_data).
-For the sake of uniformity, the function num_online_nodes() is also defined
-for all platforms. As we run benchmarks, we might decide to NUMAize
-more variables like low_on_memory, nr_free_pages etc into the pg_data_t.
-
-The NUMA aware page allocation code currently tries to allocate pages
-from different nodes in a round robin manner. This will be changed to
-do concentratic circle search, starting from current node, once the
-NUMA port achieves more maturity. The call alloc_pages_node has been
-added, so that drivers can make the call and not worry about whether
-it is running on a NUMA or UMA platform.
+What is NUMA?
+
+This question can be answered from a couple of perspectives: the
+hardware view and the Linux software view.
+
+From the hardware perspective, a NUMA system is a computer platform that
+comprises multiple components or assemblies each of which may contain 0
+or more CPUs, local memory, and/or IO buses. For brevity and to
+disambiguate the hardware view of these physical components/assemblies
+from the software abstraction thereof, we'll call the components/assemblies
+'cells' in this document.
+
+Each of the 'cells' may be viewed as an SMP [symmetric multi-processor] subset
+of the system--although some components necessary for a stand-alone SMP system
+may not be populated on any given cell. The cells of the NUMA system are
+connected together with some sort of system interconnect--e.g., a crossbar or
+point-to-point link are common types of NUMA system interconnects. Both of
+these types of interconnects can be aggregated to create NUMA platforms with
+cells at multiple distances from other cells.
+
+For Linux, the NUMA platforms of interest are primarily what is known as Cache
+Coherent NUMA or ccNUMA systems. With ccNUMA systems, all memory is visible
+to and accessible from any CPU attached to any cell and cache coherency
+is handled in hardware by the processor caches and/or the system interconnect.
+
+Memory access time and effective memory bandwidth varies depending on how far
+away the cell containing the CPU or IO bus making the memory access is from the
+cell containing the target memory. For example, access to memory by CPUs
+attached to the same cell will experience faster access times and higher
+bandwidths than accesses to memory on other, remote cells. NUMA platforms
+can have cells at multiple remote distances from any given cell.
+
+Platform vendors don't build NUMA systems just to make software developers'
+lives interesting. Rather, this architecture is a means to provide scalable
+memory bandwidth. However, to achieve scalable memory bandwidth, system and
+application software must arrange for a large majority of the memory references
+[cache misses] to be to "local" memory--memory on the same cell, if any--or
+to the closest cell with memory.
+
+This leads to the Linux software view of a NUMA system:
+
+Linux divides the system's hardware resources into multiple software
+abstractions called "nodes". Linux maps the nodes onto the physical cells
+of the hardware platform, abstracting away some of the details for some
+architectures. As with physical cells, software nodes may contain 0 or more
+CPUs, memory and/or IO buses. And, again, memory accesses to memory on
+"closer" nodes--nodes that map to closer cells--will generally experience
+faster access times and higher effective bandwidth than accesses to more
+remote cells.
+
+For some architectures, such as x86, Linux will "hide" any node representing a
+physical cell that has no memory attached, and reassign any CPUs attached to
+that cell to a node representing a cell that does have memory. Thus, on
+these architectures, one cannot assume that all CPUs that Linux associates with
+a given node will see the same local memory access times and bandwidth.
+
+In addition, for some architectures, again x86 is an example, Linux supports
+the emulation of additional nodes. For NUMA emulation, linux will carve up
+the existing nodes--or the system memory for non-NUMA platforms--into multiple
+nodes. Each emulated node will manage a fraction of the underlying cells'
+physical memory. NUMA emulation is useful for testing NUMA kernel and
+application features on non-NUMA platforms, and as a sort of memory resource
+management mechanism when used together with cpusets.
+[see Documentation/cgroups/cpusets.txt]
+
+For each node with memory, Linux constructs an independent memory management
+subsystem, complete with its own free page lists, in-use page lists, usage
+statistics and locks to mediate access. In addition, Linux constructs for
+each memory zone [one or more of DMA, DMA32, NORMAL, HIGH_MEMORY, MOVABLE],
+an ordered "zonelist". A zonelist specifies the zones/nodes to visit when a
+selected zone/node cannot satisfy the allocation request. This situation,
+when a zone has no available memory to satisfy a request, is called
+"overflow" or "fallback".
+
+Because some nodes contain multiple zones containing different types of
+memory, Linux must decide whether to order the zonelists such that allocations
+fall back to the same zone type on a different node, or to a different zone
+type on the same node. This is an important consideration because some zones,
+such as DMA or DMA32, represent relatively scarce resources. Linux chooses
+a default zonelist order based on the sizes of the various zone types relative
+to the total memory of the node and the total memory of the system. The
+default zonelist order may be overridden using the numa_zonelist_order kernel
+boot parameter or sysctl. [see Documentation/kernel-parameters.txt and
+Documentation/sysctl/vm.txt]
+
+By default, Linux will attempt to satisfy memory allocation requests from the
+node to which the CPU that executes the request is assigned. Specifically,
+Linux will attempt to allocate from the first node in the appropriate zonelist
+for the node where the request originates. This is called "local allocation."
+If the "local" node cannot satisfy the request, the kernel will examine other
+nodes' zones in the selected zonelist looking for the first zone in the list
+that can satisfy the request.
+
+Local allocation will tend to keep subsequent access to the allocated memory
+"local" to the underlying physical resources and off the system interconnect--
+as long as the task on whose behalf the kernel allocated some memory does not
+later migrate away from that memory. The Linux scheduler is aware of the
+NUMA topology of the platform--embodied in the "scheduling domains" data
+structures [see Documentation/scheduler/sched-domains.txt]--and the scheduler
+attempts to minimize task migration to distant scheduling domains. However,
+the scheduler does not take a task's NUMA footprint into account directly.
+Thus, under sufficient imbalance, tasks can migrate between nodes, remote
+from their initial node and kernel data structures.
+
+System administrators and application designers can restrict a task's migration
+to improve NUMA locality using various CPU affinity command line interfaces,
+such as taskset(1) and numactl(1), and program interfaces such as
+sched_setaffinity(2). Further, one can modify the kernel's default local
+allocation behavior using Linux NUMA memory policy.
+[see Documentation/vm/numa_memory_policy.txt]
+
+System administrators can restrict the CPUs and nodes' memories that a non-
+privileged user can specify in the scheduling or NUMA commands and functions
+using control groups and CPUsets. [see Documentation/cgroups/cpusets.txt]
+
+On architectures that do not hide memoryless nodes, Linux will include only
+zones [nodes] with memory in the zonelists. This means that for a memoryless
+node the "local memory node"--the node of the first zone in CPU's node's
+zonelist--will not be the node itself. Rather, it will be the node that the
+kernel selected as the nearest node with memory when it built the zonelists.
+So, default, local allocations will succeed with the kernel supplying the
+closest available memory. This is a consequence of the same mechanism that
+allows such allocations to fallback to other nearby nodes when a node that
+does contain memory overflows.
+
+Some kernel allocations do not want or cannot tolerate this allocation fallback
+behavior. Rather they want to be sure they get memory from the specified node
+or get notified that the node has no free memory. This is usually the case when
+a subsystem allocates per CPU memory resources, for example.
+
+A typical model for making such an allocation is to obtain the node id of the
+node to which the "current CPU" is attached using one of the kernel's
+numa_node_id() or CPU_to_node() functions and then request memory from only
+the node id returned. When such an allocation fails, the requesting subsystem
+may revert to its own fallback path. The slab kernel memory allocator is an
+example of this. Or, the subsystem may choose to disable or not to enable
+itself on allocation failure. The kernel profiling subsystem is an example of
+this.
+
+If the architecture supports--does not hide--memoryless nodes, then CPUs
+attached to memoryless nodes would always incur the fallback path overhead
+or some subsystems would fail to initialize if they attempted to allocate
+memory exclusively from a node without memory. To support such
+architectures transparently, kernel subsystems can use the numa_mem_id()
+or cpu_to_mem() function to locate the "local memory node" for the calling or
+specified CPU. Again, this is the same node from which default, local page
+allocations will be attempted.
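The allocation pattern described in the new numa text above (numa_node_id() for the node of the current CPU, numa_mem_id() for the nearest node that actually has memory) can be illustrated with a short kernel-code sketch. This fragment is not part of the patch; the use of kmalloc_node() with __GFP_THISNODE is only one assumed way a subsystem might implement the "allocate strictly from this node, then fall back explicitly" behaviour the document describes:

/*
 * Illustrative sketch only: node-local allocation with an explicit
 * fallback to the nearest memory node, per the text above.
 */
#include <linux/slab.h>
#include <linux/topology.h>

static void *alloc_node_local(size_t size)
{
	int nid = numa_node_id();	/* node of the CPU we are running on */
	void *buf;

	/* First try strictly on this node; fail rather than fall back. */
	buf = kmalloc_node(size, GFP_KERNEL | __GFP_THISNODE, nid);
	if (buf)
		return buf;

	/*
	 * Explicit fallback to the "local memory node": the nearest node
	 * with memory, which equals nid unless this CPU sits on a
	 * memoryless node.
	 */
	return kmalloc_node(size, GFP_KERNEL, numa_mem_id());
}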
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index be45dbb9d7f2..6690fc34ef6d 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -45,7 +45,7 @@ most general to most specific:
 	to establish the task policy for a child task exec()'d from an
 	executable image that has no awareness of memory policy.  See the
 	MEMORY POLICY APIS section, below, for an overview of the system call
-	that a task may use to set/change it's task/process policy.
+	that a task may use to set/change its task/process policy.
 
 	In a multi-threaded task, task policies apply only to the thread
 	[Linux kernel task] that installs the policy and any threads
@@ -301,7 +301,7 @@ decrement this reference count, respectively.  mpol_put() will only free
 the structure back to the mempolicy kmem cache when the reference count
 goes to zero.
 
-When a new memory policy is allocated, it's reference count is initialized
+When a new memory policy is allocated, its reference count is initialized
 to '1', representing the reference held by the task that is installing the
 new policy.  When a pointer to a memory policy structure is stored in another
 structure, another reference is added, as the task's reference will be dropped
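The reference-counting convention that the corrected sentences describe can be sketched briefly. This is a hypothetical illustration, not part of the patch; it only shows the pairing of a count increment when a mempolicy pointer is stored with a matching mpol_put() when the pointer is dropped, as the document explains:

/*
 * Hypothetical illustration of the mempolicy reference-counting rule:
 * take a reference when storing a pointer, drop it when done.
 */
#include <linux/mempolicy.h>

static struct mempolicy *cached_pol;

static void cache_policy(struct mempolicy *pol)
{
	mpol_get(pol);		/* extra reference for the stored pointer */
	cached_pol = pol;
}

static void drop_cached_policy(void)
{
	mpol_put(cached_pol);	/* frees the policy once the count reaches zero */
	cached_pol = NULL;
}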
diff --git a/Documentation/vm/slub.txt b/Documentation/vm/slub.txt
index b37300edf27c..07375e73981a 100644
--- a/Documentation/vm/slub.txt
+++ b/Documentation/vm/slub.txt
@@ -41,6 +41,7 @@ Possible debug options are
 	P		Poisoning (object and padding)
 	U		User tracking (free and alloc)
 	T		Trace (please only use on single slabs)
+	A		Toggle failslab filter mark for the cache
 	O		Switch debugging off for caches that would have
 			caused higher minimum slab orders
 	-		Switch all debugging off (useful if the kernel is