diff options
Diffstat (limited to 'Documentation')
29 files changed, 1310 insertions, 172 deletions
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl index 8e145857fc9d..df0d089d0fb9 100644 --- a/Documentation/DocBook/mtdnand.tmpl +++ b/Documentation/DocBook/mtdnand.tmpl | |||
@@ -568,7 +568,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip) | |||
568 | <para> | 568 | <para> |
569 | The blocks in which the tables are stored are protected against | 569 | The blocks in which the tables are stored are protected against |
570 | accidental access by marking them bad in the memory bad block | 570 | accidental access by marking them bad in the memory bad block |
571 | table. The bad block table managment functions are allowed | 571 | table. The bad block table management functions are allowed |
572 | to circumvent this protection. | 572 | to circumvent this protection. |
573 | </para> | 573 | </para> |
574 | <para> | 574 | <para> |
diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl index 10a150ae2a7e..d87f4569e768 100644 --- a/Documentation/DocBook/scsi.tmpl +++ b/Documentation/DocBook/scsi.tmpl | |||
@@ -317,7 +317,7 @@ | |||
317 | <para> | 317 | <para> |
318 | The SAS transport class contains common code to deal with SAS HBAs, | 318 | The SAS transport class contains common code to deal with SAS HBAs, |
319 | an approximated representation of SAS topologies in the driver model, | 319 | an approximated representation of SAS topologies in the driver model, |
320 | and various sysfs attributes to expose these topologies and managment | 320 | and various sysfs attributes to expose these topologies and management |
321 | interfaces to userspace. | 321 | interfaces to userspace. |
322 | </para> | 322 | </para> |
323 | <para> | 323 | <para> |
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches index 5c555a8b39e5..b7f9d3b4bbf6 100644 --- a/Documentation/SubmittingPatches +++ b/Documentation/SubmittingPatches | |||
@@ -183,7 +183,7 @@ the MAN-PAGES maintainer (as listed in the MAINTAINERS file) | |||
183 | a man-pages patch, or at least a notification of the change, | 183 | a man-pages patch, or at least a notification of the change, |
184 | so that some information makes its way into the manual pages. | 184 | so that some information makes its way into the manual pages. |
185 | 185 | ||
186 | Even if the maintainer did not respond in step #4, make sure to ALWAYS | 186 | Even if the maintainer did not respond in step #5, make sure to ALWAYS |
187 | copy the maintainer when you change their code. | 187 | copy the maintainer when you change their code. |
188 | 188 | ||
189 | For small patches you may want to CC the Trivial Patch Monkey | 189 | For small patches you may want to CC the Trivial Patch Monkey |
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt index 05d81cbcb2e1..5920fe26e6ff 100644 --- a/Documentation/filesystems/nfs41-server.txt +++ b/Documentation/filesystems/nfs41-server.txt | |||
@@ -11,6 +11,11 @@ the /proc/fs/nfsd/versions control file. Note that to write this | |||
11 | control file, the nfsd service must be taken down. Use your user-mode | 11 | control file, the nfsd service must be taken down. Use your user-mode |
12 | nfs-utils to set this up; see rpc.nfsd(8) | 12 | nfs-utils to set this up; see rpc.nfsd(8) |
13 | 13 | ||
14 | (Warning: older servers will interpret "+4.1" and "-4.1" as "+4" and | ||
15 | "-4", respectively. Therefore, code meant to work on both new and old | ||
16 | kernels must turn 4.1 on or off *before* turning support for version 4 | ||
17 | on or off; rpc.nfsd does this correctly.) | ||
18 | |||
14 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based | 19 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based |
15 | on the latest NFSv4.1 Internet Draft: | 20 | on the latest NFSv4.1 Internet Draft: |
16 | http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 | 21 | http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 |
@@ -25,6 +30,49 @@ are still under development out of tree. | |||
25 | See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design | 30 | See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design |
26 | for more information. | 31 | for more information. |
27 | 32 | ||
33 | The current implementation is intended for developers only: while it | ||
34 | does support ordinary file operations on clients we have tested against | ||
35 | (including the linux client), it is incomplete in ways which may limit | ||
36 | features unexpectedly, cause known bugs in rare cases, or cause | ||
37 | interoperability problems with future clients. Known issues: | ||
38 | |||
39 | - gss support is questionable: currently mounts with kerberos | ||
40 | from a linux client are possible, but we aren't really | ||
41 | conformant with the spec (for example, we don't use kerberos | ||
42 | on the backchannel correctly). | ||
43 | - no trunking support: no clients currently take advantage of | ||
44 | trunking, but this is a mandatory feature, and its use is | ||
45 | recommended to clients in a number of places. (E.g. to ensure | ||
46 | timely renewal in case an existing connection's retry timeouts | ||
47 | have gotten too long; see section 8.3 of the draft.) | ||
48 | Therefore, lack of this feature may cause future clients to | ||
49 | fail. | ||
50 | - Incomplete backchannel support: incomplete backchannel gss | ||
51 | support and no support for BACKCHANNEL_CTL mean that | ||
52 | callbacks (hence delegations and layouts) may not be | ||
53 | available and clients confused by the incomplete | ||
54 | implementation may fail. | ||
55 | - Server reboot recovery is unsupported; if the server reboots, | ||
56 | clients may fail. | ||
57 | - We do not support SSV, which provides security for shared | ||
58 | client-server state (thus preventing unauthorized tampering | ||
59 | with locks and opens, for example). It is mandatory for | ||
60 | servers to support this, though no clients use it yet. | ||
61 | - Mandatory operations which we do not support, such as | ||
62 | DESTROY_CLIENTID, FREE_STATEID, SECINFO_NO_NAME, and | ||
63 | TEST_STATEID, are not currently used by clients, but will be | ||
64 | (and the spec recommends their uses in common cases), and | ||
65 | clients should not be expected to know how to recover from the | ||
66 | case where they are not supported. This will eventually cause | ||
67 | interoperability failures. | ||
68 | |||
69 | In addition, some limitations are inherited from the current NFSv4 | ||
70 | implementation: | ||
71 | |||
72 | - Incomplete delegation enforcement: if a file is renamed or | ||
73 | unlinked, a client holding a delegation may continue to | ||
74 | indefinitely allow opens of the file under the old name. | ||
75 | |||
28 | The table below, taken from the NFSv4.1 document, lists | 76 | The table below, taken from the NFSv4.1 document, lists |
29 | the operations that are mandatory to implement (REQ), optional | 77 | the operations that are mandatory to implement (REQ), optional |
30 | (OPT), and NFSv4.0 operations that are required not to implement (MNI) | 78 | (OPT), and NFSv4.0 operations that are required not to implement (MNI) |
@@ -142,6 +190,12 @@ NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 | | |||
142 | 190 | ||
143 | Implementation notes: | 191 | Implementation notes: |
144 | 192 | ||
193 | DELEGPURGE: | ||
194 | * mandatory only for servers that support CLAIM_DELEGATE_PREV and/or | ||
195 | CLAIM_DELEG_PREV_FH (which allows clients to keep delegations that | ||
196 | persist across client reboots). Thus we need not implement this for | ||
197 | now. | ||
198 | |||
145 | EXCHANGE_ID: | 199 | EXCHANGE_ID: |
146 | * only SP4_NONE state protection supported | 200 | * only SP4_NONE state protection supported |
147 | * implementation ids are ignored | 201 | * implementation ids are ignored |
diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfsroot.txt index 68baddf3c3e0..3ba0b945aaf8 100644 --- a/Documentation/filesystems/nfsroot.txt +++ b/Documentation/filesystems/nfsroot.txt | |||
@@ -105,7 +105,7 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf> | |||
105 | the client address and this parameter is NOT empty only | 105 | the client address and this parameter is NOT empty only |
106 | replies from the specified server are accepted. | 106 | replies from the specified server are accepted. |
107 | 107 | ||
108 | Only required for for NFS root. That is autoconfiguration | 108 | Only required for NFS root. That is autoconfiguration |
109 | will not be triggered if it is missing and NFS root is not | 109 | will not be triggered if it is missing and NFS root is not |
110 | in operation. | 110 | in operation. |
111 | 111 | ||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ffead13f9443..75988ba26a51 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -375,6 +375,19 @@ of memory currently marked as referenced or accessed. | |||
375 | This file is only present if the CONFIG_MMU kernel configuration option is | 375 | This file is only present if the CONFIG_MMU kernel configuration option is |
376 | enabled. | 376 | enabled. |
377 | 377 | ||
378 | The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG | ||
379 | bits on both physical and virtual pages associated with a process. | ||
380 | To clear the bits for all the pages associated with the process | ||
381 | > echo 1 > /proc/PID/clear_refs | ||
382 | |||
383 | To clear the bits for the anonymous pages associated with the process | ||
384 | > echo 2 > /proc/PID/clear_refs | ||
385 | |||
386 | To clear the bits for the file mapped pages associated with the process | ||
387 | > echo 3 > /proc/PID/clear_refs | ||
388 | Any other value written to /proc/PID/clear_refs will have no effect. | ||
389 | |||
390 | |||
378 | 1.2 Kernel data | 391 | 1.2 Kernel data |
379 | --------------- | 392 | --------------- |
380 | 393 | ||
@@ -1032,9 +1045,9 @@ Various pieces of information about kernel activity are available in the | |||
1032 | since the system first booted. For a quick look, simply cat the file: | 1045 | since the system first booted. For a quick look, simply cat the file: |
1033 | 1046 | ||
1034 | > cat /proc/stat | 1047 | > cat /proc/stat |
1035 | cpu 2255 34 2290 22625563 6290 127 456 0 | 1048 | cpu 2255 34 2290 22625563 6290 127 456 0 0 |
1036 | cpu0 1132 34 1441 11311718 3675 127 438 0 | 1049 | cpu0 1132 34 1441 11311718 3675 127 438 0 0 |
1037 | cpu1 1123 0 849 11313845 2614 0 18 0 | 1050 | cpu1 1123 0 849 11313845 2614 0 18 0 0 |
1038 | intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...] | 1051 | intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...] |
1039 | ctxt 1990473 | 1052 | ctxt 1990473 |
1040 | btime 1062191376 | 1053 | btime 1062191376 |
@@ -1056,6 +1069,7 @@ second). The meanings of the columns are as follows, from left to right: | |||
1056 | - irq: servicing interrupts | 1069 | - irq: servicing interrupts |
1057 | - softirq: servicing softirqs | 1070 | - softirq: servicing softirqs |
1058 | - steal: involuntary wait | 1071 | - steal: involuntary wait |
1072 | - guest: running a guest | ||
1059 | 1073 | ||
1060 | The "intr" line gives counts of interrupts serviced since boot time, for each | 1074 | The "intr" line gives counts of interrupts serviced since boot time, for each |
1061 | of the possible system interrupts. The first column is the total of all | 1075 | of the possible system interrupts. The first column is the total of all |
@@ -1191,7 +1205,7 @@ The following heuristics are then applied: | |||
1191 | * if the task was reniced, its score doubles | 1205 | * if the task was reniced, its score doubles |
1192 | * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE | 1206 | * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE |
1193 | or CAP_SYS_RAWIO) have their score divided by 4 | 1207 | or CAP_SYS_RAWIO) have their score divided by 4 |
1194 | * if oom condition happened in one cpuset and checked task does not belong | 1208 | * if oom condition happened in one cpuset and checked process does not belong |
1195 | to it, its score is divided by 8 | 1209 | to it, its score is divided by 8 |
1196 | * the resulting score is multiplied by two to the power of oom_adj, i.e. | 1210 | * the resulting score is multiplied by two to the power of oom_adj, i.e. |
1197 | points <<= oom_adj when it is positive and | 1211 | points <<= oom_adj when it is positive and |
diff --git a/Documentation/gcov.txt b/Documentation/gcov.txt index 40ec63352760..e7ca6478cd93 100644 --- a/Documentation/gcov.txt +++ b/Documentation/gcov.txt | |||
@@ -47,7 +47,7 @@ Possible uses: | |||
47 | 47 | ||
48 | Configure the kernel with: | 48 | Configure the kernel with: |
49 | 49 | ||
50 | CONFIG_DEBUGFS=y | 50 | CONFIG_DEBUG_FS=y |
51 | CONFIG_GCOV_KERNEL=y | 51 | CONFIG_GCOV_KERNEL=y |
52 | 52 | ||
53 | and to get coverage data for the entire kernel: | 53 | and to get coverage data for the entire kernel: |
diff --git a/Documentation/hwmon/hpfall.c b/Documentation/hwmon/hpfall.c index bbea1ccfd46a..681ec22b9d0e 100644 --- a/Documentation/hwmon/hpfall.c +++ b/Documentation/hwmon/hpfall.c | |||
@@ -16,6 +16,34 @@ | |||
16 | #include <stdint.h> | 16 | #include <stdint.h> |
17 | #include <errno.h> | 17 | #include <errno.h> |
18 | #include <signal.h> | 18 | #include <signal.h> |
19 | #include <sys/mman.h> | ||
20 | #include <sched.h> | ||
21 | |||
22 | char unload_heads_path[64]; | ||
23 | |||
24 | int set_unload_heads_path(char *device) | ||
25 | { | ||
26 | char devname[64]; | ||
27 | |||
28 | if (strlen(device) <= 5 || strncmp(device, "/dev/", 5) != 0) | ||
29 | return -EINVAL; | ||
30 | strncpy(devname, device + 5, sizeof(devname)); | ||
31 | |||
32 | snprintf(unload_heads_path, sizeof(unload_heads_path), | ||
33 | "/sys/block/%s/device/unload_heads", devname); | ||
34 | return 0; | ||
35 | } | ||
36 | int valid_disk(void) | ||
37 | { | ||
38 | int fd = open(unload_heads_path, O_RDONLY); | ||
39 | if (fd < 0) { | ||
40 | perror(unload_heads_path); | ||
41 | return 0; | ||
42 | } | ||
43 | |||
44 | close(fd); | ||
45 | return 1; | ||
46 | } | ||
19 | 47 | ||
20 | void write_int(char *path, int i) | 48 | void write_int(char *path, int i) |
21 | { | 49 | { |
@@ -40,7 +68,7 @@ void set_led(int on) | |||
40 | 68 | ||
41 | void protect(int seconds) | 69 | void protect(int seconds) |
42 | { | 70 | { |
43 | write_int("/sys/block/sda/device/unload_heads", seconds*1000); | 71 | write_int(unload_heads_path, seconds*1000); |
44 | } | 72 | } |
45 | 73 | ||
46 | int on_ac(void) | 74 | int on_ac(void) |
@@ -57,45 +85,62 @@ void ignore_me(void) | |||
57 | { | 85 | { |
58 | protect(0); | 86 | protect(0); |
59 | set_led(0); | 87 | set_led(0); |
60 | |||
61 | } | 88 | } |
62 | 89 | ||
63 | int main(int argc, char* argv[]) | 90 | int main(int argc, char **argv) |
64 | { | 91 | { |
65 | int fd, ret; | 92 | int fd, ret; |
93 | struct sched_param param; | ||
94 | |||
95 | if (argc == 1) | ||
96 | ret = set_unload_heads_path("/dev/sda"); | ||
97 | else if (argc == 2) | ||
98 | ret = set_unload_heads_path(argv[1]); | ||
99 | else | ||
100 | ret = -EINVAL; | ||
101 | |||
102 | if (ret || !valid_disk()) { | ||
103 | fprintf(stderr, "usage: %s <device> (default: /dev/sda)\n", | ||
104 | argv[0]); | ||
105 | exit(1); | ||
106 | } | ||
107 | |||
108 | fd = open("/dev/freefall", O_RDONLY); | ||
109 | if (fd < 0) { | ||
110 | perror("/dev/freefall"); | ||
111 | return EXIT_FAILURE; | ||
112 | } | ||
66 | 113 | ||
67 | fd = open("/dev/freefall", O_RDONLY); | 114 | daemon(0, 0); |
68 | if (fd < 0) { | 115 | param.sched_priority = sched_get_priority_max(SCHED_FIFO); |
69 | perror("open"); | 116 | sched_setscheduler(0, SCHED_FIFO, &param); |
70 | return EXIT_FAILURE; | 117 | mlockall(MCL_CURRENT|MCL_FUTURE); |
71 | } | ||
72 | 118 | ||
73 | signal(SIGALRM, ignore_me); | 119 | signal(SIGALRM, ignore_me); |
74 | 120 | ||
75 | for (;;) { | 121 | for (;;) { |
76 | unsigned char count; | 122 | unsigned char count; |
77 | 123 | ||
78 | ret = read(fd, &count, sizeof(count)); | 124 | ret = read(fd, &count, sizeof(count)); |
79 | alarm(0); | 125 | alarm(0); |
80 | if ((ret == -1) && (errno == EINTR)) { | 126 | if ((ret == -1) && (errno == EINTR)) { |
81 | /* Alarm expired, time to unpark the heads */ | 127 | /* Alarm expired, time to unpark the heads */ |
82 | continue; | 128 | continue; |
83 | } | 129 | } |
84 | 130 | ||
85 | if (ret != sizeof(count)) { | 131 | if (ret != sizeof(count)) { |
86 | perror("read"); | 132 | perror("read"); |
87 | break; | 133 | break; |
88 | } | 134 | } |
89 | 135 | ||
90 | protect(21); | 136 | protect(21); |
91 | set_led(1); | 137 | set_led(1); |
92 | if (1 || on_ac() || lid_open()) { | 138 | if (1 || on_ac() || lid_open()) |
93 | alarm(2); | 139 | alarm(2); |
94 | } else { | 140 | else |
95 | alarm(20); | 141 | alarm(20); |
96 | } | 142 | } |
97 | } | 143 | |
98 | 144 | close(fd); | |
99 | close(fd); | 145 | return EXIT_SUCCESS; |
100 | return EXIT_SUCCESS; | ||
101 | } | 146 | } |
diff --git a/Documentation/hwmon/pc87427 b/Documentation/hwmon/pc87427 index d1ebbe510f35..db5cc1227a83 100644 --- a/Documentation/hwmon/pc87427 +++ b/Documentation/hwmon/pc87427 | |||
@@ -34,5 +34,5 @@ Fan rotation speeds are reported as 14-bit values from a gated clock | |||
34 | signal. Speeds down to 83 RPM can be measured. | 34 | signal. Speeds down to 83 RPM can be measured. |
35 | 35 | ||
36 | An alarm is triggered if the rotation speed drops below a programmable | 36 | An alarm is triggered if the rotation speed drops below a programmable |
37 | limit. Another alarm is triggered if the speed is too low to to be measured | 37 | limit. Another alarm is triggered if the speed is too low to be measured |
38 | (including stalled or missing fan). | 38 | (including stalled or missing fan). |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 0f17d16dc101..c363840cdcea 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -933,7 +933,7 @@ and is between 256 and 4096 characters. It is defined in the file | |||
933 | 1 -- enable informational integrity auditing messages. | 933 | 1 -- enable informational integrity auditing messages. |
934 | 934 | ||
935 | ima_hash= [IMA] | 935 | ima_hash= [IMA] |
936 | Formt: { "sha1" | "md5" } | 936 | Format: { "sha1" | "md5" } |
937 | default: "sha1" | 937 | default: "sha1" |
938 | 938 | ||
939 | ima_tcb [IMA] | 939 | ima_tcb [IMA] |
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt index 363044609dad..c28f82895d6b 100644 --- a/Documentation/kmemcheck.txt +++ b/Documentation/kmemcheck.txt | |||
@@ -43,26 +43,7 @@ feature. | |||
43 | 1. Downloading | 43 | 1. Downloading |
44 | ============== | 44 | ============== |
45 | 45 | ||
46 | kmemcheck can only be downloaded using git. If you want to write patches | 46 | As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel. |
47 | against the current code, you should use the kmemcheck development branch of | ||
48 | the tip tree. It is also possible to use the linux-next tree, which also | ||
49 | includes the latest version of kmemcheck. | ||
50 | |||
51 | Assuming that you've already cloned the linux-2.6.git repository, all you | ||
52 | have to do is add the -tip tree as a remote, like this: | ||
53 | |||
54 | $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git | ||
55 | |||
56 | To actually download the tree, fetch the remote: | ||
57 | |||
58 | $ git fetch tip | ||
59 | |||
60 | And to check out a new local branch with the kmemcheck code: | ||
61 | |||
62 | $ git checkout -b kmemcheck tip/kmemcheck | ||
63 | |||
64 | General instructions for the -tip tree can be found here: | ||
65 | http://people.redhat.com/mingo/tip.git/readme.txt | ||
66 | 47 | ||
67 | 48 | ||
68 | 2. Configuring and compiling | 49 | 2. Configuring and compiling |
diff --git a/Documentation/memory.txt b/Documentation/memory.txt index 2b3dedd39538..802efe58647c 100644 --- a/Documentation/memory.txt +++ b/Documentation/memory.txt | |||
@@ -1,18 +1,7 @@ | |||
1 | There are several classic problems related to memory on Linux | 1 | There are several classic problems related to memory on Linux |
2 | systems. | 2 | systems. |
3 | 3 | ||
4 | 1) There are some buggy motherboards which cannot properly | 4 | 1) There are some motherboards that will not cache above |
5 | deal with the memory above 16MB. Consider exchanging | ||
6 | your motherboard. | ||
7 | |||
8 | 2) You cannot do DMA on the ISA bus to addresses above | ||
9 | 16M. Most device drivers under Linux allow the use | ||
10 | of bounce buffers which work around this problem. Drivers | ||
11 | that don't use bounce buffers will be unstable with | ||
12 | more than 16M installed. Drivers that use bounce buffers | ||
13 | will be OK, but may have slightly higher overhead. | ||
14 | |||
15 | 3) There are some motherboards that will not cache above | ||
16 | a certain quantity of memory. If you have one of these | 5 | a certain quantity of memory. If you have one of these |
17 | motherboards, your system will be SLOWER, not faster | 6 | motherboards, your system will be SLOWER, not faster |
18 | as you add more memory. Consider exchanging your | 7 | as you add more memory. Consider exchanging your |
@@ -24,7 +13,7 @@ It can also tell Linux to use less memory than is actually installed. | |||
24 | If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid | 13 | If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid |
25 | physical address space collisions. | 14 | physical address space collisions. |
26 | 15 | ||
27 | See the documentation of your boot loader (LILO, loadlin, etc.) about | 16 | See the documentation of your boot loader (LILO, grub, loadlin, etc.) about |
28 | how to pass options to the kernel. | 17 | how to pass options to the kernel. |
29 | 18 | ||
30 | There are other memory problems which Linux cannot deal with. Random | 19 | There are other memory problems which Linux cannot deal with. Random |
@@ -42,19 +31,3 @@ Try: | |||
42 | with the vendor. Consider testing it with memtest86 yourself. | 31 | with the vendor. Consider testing it with memtest86 yourself. |
43 | 32 | ||
44 | * Exchanging your CPU, cache, or motherboard for one that works. | 33 | * Exchanging your CPU, cache, or motherboard for one that works. |
45 | |||
46 | * Disabling the cache from the BIOS. | ||
47 | |||
48 | * Try passing the "mem=4M" option to the kernel to limit | ||
49 | Linux to using a very small amount of memory. Use "memmap="-option | ||
50 | together with "mem=" on systems with PCI to avoid physical address | ||
51 | space collisions. | ||
52 | |||
53 | |||
54 | Other tricks: | ||
55 | |||
56 | * Try passing the "no-387" option to the kernel to ignore | ||
57 | a buggy FPU. | ||
58 | |||
59 | * Try passing the "no-hlt" option to disable the potentially | ||
60 | buggy HLT instruction in your CPU. | ||
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt index eaa1a25946c1..ee31369e9e5b 100644 --- a/Documentation/networking/regulatory.txt +++ b/Documentation/networking/regulatory.txt | |||
@@ -96,7 +96,7 @@ Example code - drivers hinting an alpha2: | |||
96 | 96 | ||
97 | This example comes from the zd1211rw device driver. You can start | 97 | This example comes from the zd1211rw device driver. You can start |
98 | by having a mapping of your device's EEPROM country/regulatory | 98 | by having a mapping of your device's EEPROM country/regulatory |
99 | domain value to to a specific alpha2 as follows: | 99 | domain value to a specific alpha2 as follows: |
100 | 100 | ||
101 | static struct zd_reg_alpha2_map reg_alpha2_map[] = { | 101 | static struct zd_reg_alpha2_map reg_alpha2_map[] = { |
102 | { ZD_REGDOMAIN_FCC, "US" }, | 102 | { ZD_REGDOMAIN_FCC, "US" }, |
diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt index 80133ace1eb2..9fcc9a608dc0 100644 --- a/Documentation/numastat.txt +++ b/Documentation/numastat.txt | |||
@@ -7,10 +7,10 @@ All units are pages. Hugepages have separate counters. | |||
7 | 7 | ||
8 | numa_hit A process wanted to allocate memory from this node, | 8 | numa_hit A process wanted to allocate memory from this node, |
9 | and succeeded. | 9 | and succeeded. |
10 | numa_miss A process wanted to allocate memory from this node, | 10 | numa_miss A process wanted to allocate memory from another node, |
11 | but ended up with memory from another. | 11 | but ended up with memory from this node. |
12 | numa_foreign A process wanted to allocate on another node, | 12 | numa_foreign A process wanted to allocate on this node, |
13 | but ended up with memory from this one. | 13 | but ended up with memory from another one. |
14 | local_node A process ran on this node and got memory from it. | 14 | local_node A process ran on this node and got memory from it. |
15 | other_node A process ran on this node and got memory from another node. | 15 | other_node A process ran on this node and got memory from another node. |
16 | interleave_hit Interleaving wanted to allocate from this node | 16 | interleave_hit Interleaving wanted to allocate from this node |
diff --git a/Documentation/powerpc/dts-bindings/marvell.txt b/Documentation/powerpc/dts-bindings/marvell.txt index 3708a2fd4747..f1533d91953a 100644 --- a/Documentation/powerpc/dts-bindings/marvell.txt +++ b/Documentation/powerpc/dts-bindings/marvell.txt | |||
@@ -32,7 +32,7 @@ prefixed with the string "marvell,", for Marvell Technology Group Ltd. | |||
32 | devices. This field represents the number of cells needed to | 32 | devices. This field represents the number of cells needed to |
33 | represent the address of the memory-mapped registers of devices | 33 | represent the address of the memory-mapped registers of devices |
34 | within the system controller chip. | 34 | within the system controller chip. |
35 | - #size-cells : Size representation for for the memory-mapped | 35 | - #size-cells : Size representation for the memory-mapped |
36 | registers within the system controller chip. | 36 | registers within the system controller chip. |
37 | - #interrupt-cells : Defines the width of cells used to represent | 37 | - #interrupt-cells : Defines the width of cells used to represent |
38 | interrupts. | 38 | interrupts. |
diff --git a/Documentation/scsi/ChangeLog.megaraid b/Documentation/scsi/ChangeLog.megaraid index eaa4801f2ce6..38e9e7cadc90 100644 --- a/Documentation/scsi/ChangeLog.megaraid +++ b/Documentation/scsi/ChangeLog.megaraid | |||
@@ -514,7 +514,7 @@ iv. Remove yield() while mailbox handshake in synchronous commands | |||
514 | 514 | ||
515 | v. Remove redundant __megaraid_busywait_mbox routine | 515 | v. Remove redundant __megaraid_busywait_mbox routine |
516 | 516 | ||
517 | vi. Fix bug in the managment module, which causes a system lockup when the | 517 | vi. Fix bug in the management module, which causes a system lockup when the |
518 | IO module is loaded and then unloaded, followed by executing any | 518 | IO module is loaded and then unloaded, followed by executing any |
519 | management utility. The current version of management module does not | 519 | management utility. The current version of management module does not |
520 | handle the adapter unregister properly. | 520 | handle the adapter unregister properly. |
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt index d7f181701dc2..aec6549ab097 100644 --- a/Documentation/scsi/scsi_fc_transport.txt +++ b/Documentation/scsi/scsi_fc_transport.txt | |||
@@ -378,7 +378,7 @@ Vport Disable/Enable: | |||
378 | int vport_disable(struct fc_vport *vport, bool disable) | 378 | int vport_disable(struct fc_vport *vport, bool disable) |
379 | 379 | ||
380 | where: | 380 | where: |
381 | vport: Is vport to to be enabled or disabled | 381 | vport: Is vport to be enabled or disabled |
382 | disable: If "true", the vport is to be disabled. | 382 | disable: If "true", the vport is to be disabled. |
383 | If "false", the vport is to be enabled. | 383 | If "false", the vport is to be enabled. |
384 | 384 | ||
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt index 97eebd63bedc..f1708b79f963 100644 --- a/Documentation/sound/alsa/HD-Audio-Models.txt +++ b/Documentation/sound/alsa/HD-Audio-Models.txt | |||
@@ -387,7 +387,7 @@ STAC92HD73* | |||
387 | STAC92HD83* | 387 | STAC92HD83* |
388 | =========== | 388 | =========== |
389 | ref Reference board | 389 | ref Reference board |
390 | mic-ref Reference board with power managment for ports | 390 | mic-ref Reference board with power management for ports |
391 | dell-s14 Dell laptop | 391 | dell-s14 Dell laptop |
392 | auto BIOS setup (default) | 392 | auto BIOS setup (default) |
393 | 393 | ||
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 2dbff53369d0..3e5b63ebb821 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt | |||
@@ -319,25 +319,29 @@ This option can be used to select the type of process address | |||
319 | space randomization that is used in the system, for architectures | 319 | space randomization that is used in the system, for architectures |
320 | that support this feature. | 320 | that support this feature. |
321 | 321 | ||
322 | 0 - Turn the process address space randomization off by default. | 322 | 0 - Turn the process address space randomization off. This is the |
323 | default for architectures that do not support this feature anyways, | ||
324 | and kernels that are booted with the "norandmaps" parameter. | ||
323 | 325 | ||
324 | 1 - Make the addresses of mmap base, stack and VDSO page randomized. | 326 | 1 - Make the addresses of mmap base, stack and VDSO page randomized. |
325 | This, among other things, implies that shared libraries will be | 327 | This, among other things, implies that shared libraries will be |
326 | loaded to random addresses. Also for PIE-linked binaries, the location | 328 | loaded to random addresses. Also for PIE-linked binaries, the |
327 | of code start is randomized. | 329 | location of code start is randomized. This is the default if the |
330 | CONFIG_COMPAT_BRK option is enabled. | ||
328 | 331 | ||
329 | With heap randomization, the situation is a little bit more | 332 | 2 - Additionally enable heap randomization. This is the default if |
330 | complicated. | 333 | CONFIG_COMPAT_BRK is disabled. |
331 | There a few legacy applications out there (such as some ancient | 334 | |
335 | There are a few legacy applications out there (such as some ancient | ||
332 | versions of libc.so.5 from 1996) that assume that brk area starts | 336 | versions of libc.so.5 from 1996) that assume that brk area starts |
333 | just after the end of the code+bss. These applications break when | 337 | just after the end of the code+bss. These applications break when |
334 | start of the brk area is randomized. There are however no known | 338 | start of the brk area is randomized. There are however no known |
335 | non-legacy applications that would be broken this way, so for most | 339 | non-legacy applications that would be broken this way, so for most |
336 | systems it is safe to choose full randomization. However there is | 340 | systems it is safe to choose full randomization. |
337 | a CONFIG_COMPAT_BRK option for systems with ancient and/or broken | 341 | |
338 | binaries, that makes heap non-randomized, but keeps all other | 342 | Systems with ancient and/or broken binaries should be configured |
339 | parts of process address space randomized if randomize_va_space | 343 | with CONFIG_COMPAT_BRK enabled, which excludes the heap from process |
340 | sysctl is turned on. | 344 | address space randomization. |
341 | 345 | ||
342 | ============================================================== | 346 | ============================================================== |
343 | 347 | ||
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index c4de6359d440..e6fb1ec2744b 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -585,7 +585,9 @@ caching of directory and inode objects. | |||
585 | At the default value of vfs_cache_pressure=100 the kernel will attempt to | 585 | At the default value of vfs_cache_pressure=100 the kernel will attempt to |
586 | reclaim dentries and inodes at a "fair" rate with respect to pagecache and | 586 | reclaim dentries and inodes at a "fair" rate with respect to pagecache and |
587 | swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer | 587 | swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer |
588 | to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 | 588 | to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will |
589 | never reclaim dentries and inodes due to memory pressure and this can easily | ||
590 | lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100 | ||
589 | causes the kernel to prefer to reclaim dentries and inodes. | 591 | causes the kernel to prefer to reclaim dentries and inodes. |
590 | 592 | ||
591 | ============================================================== | 593 | ============================================================== |
diff --git a/Documentation/trace/events-kmem.txt b/Documentation/trace/events-kmem.txt new file mode 100644 index 000000000000..6ef2a8652e17 --- /dev/null +++ b/Documentation/trace/events-kmem.txt | |||
@@ -0,0 +1,107 @@ | |||
1 | Subsystem Trace Points: kmem | ||
2 | |||
3 | The tracing system kmem captures events related to object and page allocation | ||
4 | within the kernel. Broadly speaking there are five major subheadings. | ||
5 | |||
6 | o Slab allocation of small objects of unknown type (kmalloc) | ||
7 | o Slab allocation of small objects of known type | ||
8 | o Page allocation | ||
9 | o Per-CPU Allocator Activity | ||
10 | o External Fragmentation | ||
11 | |||
12 | This document describes what each of the tracepoints is and why they | ||
13 | might be useful. | ||
14 | |||
15 | 1. Slab allocation of small objects of unknown type | ||
16 | =================================================== | ||
17 | kmalloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s | ||
18 | kmalloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d | ||
19 | kfree call_site=%lx ptr=%p | ||
20 | |||
21 | Heavy activity for these events may indicate that a specific cache is | ||
22 | justified, particularly if kmalloc slab pages are getting significantly | ||
23 | internally fragmented as a result of the allocation pattern. By correlating | ||
24 | kmalloc with kfree, it may be possible to identify memory leaks and where | ||
25 | the allocation sites were. | ||
26 | |||
27 | |||
28 | 2. Slab allocation of small objects of known type | ||
29 | ================================================= | ||
30 | kmem_cache_alloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s | ||
31 | kmem_cache_alloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d | ||
32 | kmem_cache_free call_site=%lx ptr=%p | ||
33 | |||
34 | These events are similar in usage to the kmalloc-related events except that | ||
35 | it is likely easier to pin the event down to a specific cache. At the time | ||
36 | of writing, no information is available on what slab is being allocated from, | ||
37 | but the call_site can usually be used to extrapolate that information. | ||
38 | |||
39 | 3. Page allocation | ||
40 | ================== | ||
41 | mm_page_alloc page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s | ||
42 | mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d | ||
43 | mm_page_free_direct page=%p pfn=%lu order=%d | ||
44 | mm_pagevec_free page=%p pfn=%lu order=%d cold=%d | ||
45 | |||
46 | These four events deal with page allocation and freeing. mm_page_alloc is | ||
47 | a simple indicator of page allocator activity. Pages may be allocated from | ||
48 | the per-CPU allocator (high performance) or the buddy allocator. | ||
49 | |||
50 | If pages are allocated directly from the buddy allocator, the | ||
51 | mm_page_alloc_zone_locked event is triggered. This event is important as high | ||
52 | amounts of activity imply high activity on the zone->lock. Taking this lock | ||
53 | impairs performance by disabling interrupts, dirtying cache lines between | ||
54 | CPUs and serialising many CPUs. | ||
55 | |||
56 | When a page is freed directly by the caller, the mm_page_free_direct event | ||
57 | is triggered. Significant amounts of activity here could indicate that the | ||
58 | callers should be batching their activities. | ||
59 | |||
60 | When pages are freed using a pagevec, the mm_pagevec_free event is | ||
61 | triggered. Broadly speaking, pages are taken off the LRU lock in bulk and | ||
62 | freed in batch with a pagevec. Significant amounts of activity here could | ||
63 | indicate that the system is under memory pressure and can also indicate | ||
64 | contention on the zone->lru_lock. | ||
65 | |||
66 | 4. Per-CPU Allocator Activity | ||
67 | ============================= | ||
68 | mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d | ||
69 | mm_page_pcpu_drain page=%p pfn=%lu order=%d cpu=%d migratetype=%d | ||
70 | |||
71 | In front of the page allocator is a per-cpu page allocator. It exists only | ||
72 | for order-0 pages, reduces contention on the zone->lock and reduces the | ||
73 | amount of writing on struct page. | ||
74 | |||
75 | When a per-CPU list is empty or pages of the wrong type are allocated, | ||
76 | the zone->lock will be taken once and the per-CPU list refilled. The event | ||
77 | triggered is mm_page_alloc_zone_locked for each page allocated with the | ||
78 | event indicating whether it is for a percpu_refill or not. | ||
79 | |||
80 | When the per-CPU list is too full, a number of pages are freed, each one | ||
81 | of which triggers a mm_page_pcpu_drain event. | ||
82 | |||
83 | The individual nature of the events is so that pages can be tracked | ||
84 | between allocation and freeing. A number of drain or refill pages that occur | ||
85 | consecutively imply the zone->lock being taken once. Large amounts of PCP | ||
86 | refills and drains could imply an imbalance between CPUs where too much work | ||
87 | is being concentrated in one place. It could also indicate that the per-CPU | ||
88 | lists should be a larger size. Finally, large amounts of refills on one CPU | ||
89 | and drains on another could be a factor in causing large amounts of cache | ||
90 | line bounces due to writes between CPUs and worth investigating if pages | ||
91 | can be allocated and freed on the same CPU through some algorithm change. | ||
92 | |||
93 | 5. External Fragmentation | ||
94 | ========================= | ||
95 | mm_page_alloc_extfrag page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d | ||
96 | |||
97 | External fragmentation affects whether a high-order allocation will be | ||
98 | successful or not. For some types of hardware, this is important although | ||
99 | it is avoided where possible. If the system is using huge pages and needs | ||
100 | to be able to resize the pool over the lifetime of the system, this value | ||
101 | is important. | ||
102 | |||
103 | Large numbers of this event imply that memory is fragmenting and | ||
104 | high-order allocations will start failing at some time in the future. One | ||
105 | means of reducing the occurrence of this event is to increase the size of | ||
106 | min_free_kbytes in increments of 3*pageblock_size*nr_online_nodes where | ||
107 | pageblock_size is usually the size of the default hugepage size. | ||
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index 78c45a87be57..02ac6ed38b2d 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt | |||
@@ -72,7 +72,7 @@ To enable all events in sched subsystem: | |||
72 | 72 | ||
73 | # echo 1 > /sys/kernel/debug/tracing/events/sched/enable | 73 | # echo 1 > /sys/kernel/debug/tracing/events/sched/enable |
74 | 74 | ||
75 | To eanble all events: | 75 | To enable all events: |
76 | 76 | ||
77 | # echo 1 > /sys/kernel/debug/tracing/events/enable | 77 | # echo 1 > /sys/kernel/debug/tracing/events/enable |
78 | 78 | ||
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 1b6292bbdd6d..957b22fde2df 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt | |||
@@ -133,7 +133,7 @@ of ftrace. Here is a list of some of the key files: | |||
133 | than requested, the rest of the page will be used, | 133 | than requested, the rest of the page will be used, |
134 | making the actual allocation bigger than requested. | 134 | making the actual allocation bigger than requested. |
135 | ( Note, the size may not be a multiple of the page size | 135 | ( Note, the size may not be a multiple of the page size |
136 | due to buffer managment overhead. ) | 136 | due to buffer management overhead. ) |
137 | 137 | ||
138 | This can only be updated when the current_tracer | 138 | This can only be updated when the current_tracer |
139 | is set to "nop". | 139 | is set to "nop". |
diff --git a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl new file mode 100644 index 000000000000..7df50e8cf4d9 --- /dev/null +++ b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl | |||
@@ -0,0 +1,418 @@ | |||
1 | #!/usr/bin/perl | ||
2 | # This is a POC (proof of concept or piece of crap, take your pick) for reading the | ||
3 | # text representation of trace output related to page allocation. It makes an attempt | ||
4 | # to extract some high-level information on what is going on. The accuracy of the parser | ||
5 | # may vary considerably | ||
6 | # | ||
7 | # Example usage: trace-pagealloc-postprocess.pl < /sys/kernel/debug/tracing/trace_pipe | ||
8 | # other options | ||
9 | # --prepend-parent Report on the parent proc and PID | ||
10 | # --read-procstat If the trace lacks process info, get it from /proc | ||
11 | # --ignore-pid Aggregate processes of the same name together | ||
12 | # | ||
13 | # Copyright (c) IBM Corporation 2009 | ||
14 | # Author: Mel Gorman <mel@csn.ul.ie> | ||
15 | use strict; | ||
16 | use Getopt::Long; | ||
17 | |||
18 | # Tracepoint events | ||
19 | use constant MM_PAGE_ALLOC => 1; | ||
20 | use constant MM_PAGE_FREE_DIRECT => 2; | ||
21 | use constant MM_PAGEVEC_FREE => 3; | ||
22 | use constant MM_PAGE_PCPU_DRAIN => 4; | ||
23 | use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5; | ||
24 | use constant MM_PAGE_ALLOC_EXTFRAG => 6; | ||
25 | use constant EVENT_UNKNOWN => 7; | ||
26 | |||
27 | # Constants used to track state | ||
28 | use constant STATE_PCPU_PAGES_DRAINED => 8; | ||
29 | use constant STATE_PCPU_PAGES_REFILLED => 9; | ||
30 | |||
31 | # High-level events extrapolated from tracepoints | ||
32 | use constant HIGH_PCPU_DRAINS => 10; | ||
33 | use constant HIGH_PCPU_REFILLS => 11; | ||
34 | use constant HIGH_EXT_FRAGMENT => 12; | ||
35 | use constant HIGH_EXT_FRAGMENT_SEVERE => 13; | ||
36 | use constant HIGH_EXT_FRAGMENT_MODERATE => 14; | ||
37 | use constant HIGH_EXT_FRAGMENT_CHANGED => 15; | ||
38 | |||
39 | my %perprocesspid; | ||
40 | my %perprocess; | ||
41 | my $opt_ignorepid; | ||
42 | my $opt_read_procstat; | ||
43 | my $opt_prepend_parent; | ||
44 | |||
45 | # Catch sigint and exit on request | ||
46 | my $sigint_report = 0; | ||
47 | my $sigint_exit = 0; | ||
48 | my $sigint_pending = 0; | ||
49 | my $sigint_received = 0; | ||
50 | sub sigint_handler { | ||
51 | my $current_time = time; | ||
52 | if ($current_time - 2 > $sigint_received) { | ||
53 | print "SIGINT received, report pending. Hit ctrl-c again to exit\n"; | ||
54 | $sigint_report = 1; | ||
55 | } else { | ||
56 | if (!$sigint_exit) { | ||
57 | print "Second SIGINT received quickly, exiting\n"; | ||
58 | } | ||
59 | $sigint_exit++; | ||
60 | } | ||
61 | |||
62 | if ($sigint_exit > 3) { | ||
63 | print "Many SIGINTs received, exiting now without report\n"; | ||
64 | exit; | ||
65 | } | ||
66 | |||
67 | $sigint_received = $current_time; | ||
68 | $sigint_pending = 1; | ||
69 | } | ||
70 | $SIG{INT} = "sigint_handler"; | ||
71 | |||
72 | # Parse command line options | ||
73 | GetOptions( | ||
74 | 'ignore-pid' => \$opt_ignorepid, | ||
75 | 'read-procstat' => \$opt_read_procstat, | ||
76 | 'prepend-parent' => \$opt_prepend_parent, | ||
77 | ); | ||
78 | |||
79 | # Defaults for dynamically discovered regex's | ||
80 | my $regex_fragdetails_default = 'page=([0-9a-f]*) pfn=([0-9]*) alloc_order=([-0-9]*) fallback_order=([-0-9]*) pageblock_order=([-0-9]*) alloc_migratetype=([-0-9]*) fallback_migratetype=([-0-9]*) fragmenting=([-0-9]) change_ownership=([-0-9])'; | ||
81 | |||
82 | # Dynamically discovered regex | ||
83 | my $regex_fragdetails; | ||
84 | |||
85 | # Static regex used. Specified like this for readability and for use with /o | ||
86 | # (process_pid) (cpus ) ( time ) (tpoint ) (details) | ||
87 | my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)'; | ||
88 | my $regex_statname = '[-0-9]*\s\((.*)\).*'; | ||
89 | my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*'; | ||
90 | |||
91 | sub generate_traceevent_regex { | ||
92 | my $event = shift; | ||
93 | my $default = shift; | ||
94 | my $regex; | ||
95 | |||
96 | # Read the event format or use the default | ||
97 | if (!open (FORMAT, "/sys/kernel/debug/tracing/events/$event/format")) { | ||
98 | $regex = $default; | ||
99 | } else { | ||
100 | my $line; | ||
101 | while (!eof(FORMAT)) { | ||
102 | $line = <FORMAT>; | ||
103 | if ($line =~ /^print fmt:\s"(.*)",.*/) { | ||
104 | $regex = $1; | ||
105 | $regex =~ s/%p/\([0-9a-f]*\)/g; | ||
106 | $regex =~ s/%d/\([-0-9]*\)/g; | ||
107 | $regex =~ s/%lu/\([0-9]*\)/g; | ||
108 | } | ||
109 | } | ||
110 | } | ||
111 | |||
112 | # Verify fields are in the right order | ||
113 | my $tuple; | ||
114 | foreach $tuple (split /\s/, $regex) { | ||
115 | my ($key, $value) = split(/=/, $tuple); | ||
116 | my $expected = shift; | ||
117 | if ($key ne $expected) { | ||
118 | print("WARNING: Format not as expected '$key' != '$expected'"); | ||
119 | $regex =~ s/$key=\((.*)\)/$key=$1/; | ||
120 | } | ||
121 | } | ||
122 | |||
123 | if (defined shift) { | ||
124 | die("Fewer fields than expected in format"); | ||
125 | } | ||
126 | |||
127 | return $regex; | ||
128 | } | ||
129 | $regex_fragdetails = generate_traceevent_regex("kmem/mm_page_alloc_extfrag", | ||
130 | $regex_fragdetails_default, | ||
131 | "page", "pfn", | ||
132 | "alloc_order", "fallback_order", "pageblock_order", | ||
133 | "alloc_migratetype", "fallback_migratetype", | ||
134 | "fragmenting", "change_ownership"); | ||
135 | |||
136 | sub read_statline($) { | ||
137 | my $pid = $_[0]; | ||
138 | my $statline; | ||
139 | |||
140 | if (open(STAT, "/proc/$pid/stat")) { | ||
141 | $statline = <STAT>; | ||
142 | close(STAT); | ||
143 | } | ||
144 | |||
145 | if ($statline eq '') { | ||
146 | $statline = "-1 (UNKNOWN_PROCESS_NAME) R 0"; | ||
147 | } | ||
148 | |||
149 | return $statline; | ||
150 | } | ||
151 | |||
152 | sub guess_process_pid($$) { | ||
153 | my $pid = $_[0]; | ||
154 | my $statline = $_[1]; | ||
155 | |||
156 | if ($pid == 0) { | ||
157 | return "swapper-0"; | ||
158 | } | ||
159 | |||
160 | if ($statline !~ /$regex_statname/o) { | ||
161 | die("Failed to math stat line for process name :: $statline"); | ||
162 | } | ||
163 | return "$1-$pid"; | ||
164 | } | ||
165 | |||
166 | sub parent_info($$) { | ||
167 | my $pid = $_[0]; | ||
168 | my $statline = $_[1]; | ||
169 | my $ppid; | ||
170 | |||
171 | if ($pid == 0) { | ||
172 | return "NOPARENT-0"; | ||
173 | } | ||
174 | |||
175 | if ($statline !~ /$regex_statppid/o) { | ||
176 | die("Failed to match stat line process ppid:: $statline"); | ||
177 | } | ||
178 | |||
179 | # Read the ppid stat line | ||
180 | $ppid = $1; | ||
181 | return guess_process_pid($ppid, read_statline($ppid)); | ||
182 | } | ||
183 | |||
184 | sub process_events { | ||
185 | my $traceevent; | ||
186 | my $process_pid; | ||
187 | my $cpus; | ||
188 | my $timestamp; | ||
189 | my $tracepoint; | ||
190 | my $details; | ||
191 | my $statline; | ||
192 | |||
193 | # Read each line of the event log | ||
194 | EVENT_PROCESS: | ||
195 | while ($traceevent = <STDIN>) { | ||
196 | if ($traceevent =~ /$regex_traceevent/o) { | ||
197 | $process_pid = $1; | ||
198 | $tracepoint = $4; | ||
199 | |||
200 | if ($opt_read_procstat || $opt_prepend_parent) { | ||
201 | $process_pid =~ /(.*)-([0-9]*)$/; | ||
202 | my $process = $1; | ||
203 | my $pid = $2; | ||
204 | |||
205 | $statline = read_statline($pid); | ||
206 | |||
207 | if ($opt_read_procstat && $process eq '') { | ||
208 | $process_pid = guess_process_pid($pid, $statline); | ||
209 | } | ||
210 | |||
211 | if ($opt_prepend_parent) { | ||
212 | $process_pid = parent_info($pid, $statline) . " :: $process_pid"; | ||
213 | } | ||
214 | } | ||
215 | |||
216 | # Unnecessary in this script. Uncomment if required | ||
217 | # $cpus = $2; | ||
218 | # $timestamp = $3; | ||
219 | } else { | ||
220 | next; | ||
221 | } | ||
222 | |||
223 | # Perl Switch() sucks majorly | ||
224 | if ($tracepoint eq "mm_page_alloc") { | ||
225 | $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++; | ||
226 | } elsif ($tracepoint eq "mm_page_free_direct") { | ||
227 | $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++; | ||
228 | } elsif ($tracepoint eq "mm_pagevec_free") { | ||
229 | $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++; | ||
230 | } elsif ($tracepoint eq "mm_page_pcpu_drain") { | ||
231 | $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++; | ||
232 | $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++; | ||
233 | } elsif ($tracepoint eq "mm_page_alloc_zone_locked") { | ||
234 | $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}++; | ||
235 | $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED}++; | ||
236 | } elsif ($tracepoint eq "mm_page_alloc_extfrag") { | ||
237 | |||
238 | # Extract the details of the event now | ||
239 | $details = $5; | ||
240 | |||
241 | my ($page, $pfn); | ||
242 | my ($alloc_order, $fallback_order, $pageblock_order); | ||
243 | my ($alloc_migratetype, $fallback_migratetype); | ||
244 | my ($fragmenting, $change_ownership); | ||
245 | |||
246 | if ($details !~ /$regex_fragdetails/o) { | ||
247 | print "WARNING: Failed to parse mm_page_alloc_extfrag as expected\n"; | ||
248 | next; | ||
249 | } | ||
250 | |||
251 | $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}++; | ||
252 | $page = $1; | ||
253 | $pfn = $2; | ||
254 | $alloc_order = $3; | ||
255 | $fallback_order = $4; | ||
256 | $pageblock_order = $5; | ||
257 | $alloc_migratetype = $6; | ||
258 | $fallback_migratetype = $7; | ||
259 | $fragmenting = $8; | ||
260 | $change_ownership = $9; | ||
261 | |||
262 | if ($fragmenting) { | ||
263 | $perprocesspid{$process_pid}->{HIGH_EXT_FRAG}++; | ||
264 | if ($fallback_order <= 3) { | ||
265 | $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}++; | ||
266 | } else { | ||
267 | $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}++; | ||
268 | } | ||
269 | } | ||
270 | if ($change_ownership) { | ||
271 | $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}++; | ||
272 | } | ||
273 | } else { | ||
274 | $perprocesspid{$process_pid}->{EVENT_UNKNOWN}++; | ||
275 | } | ||
276 | |||
277 | # Catch a full pcpu drain event | ||
278 | if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} && | ||
279 | $tracepoint ne "mm_page_pcpu_drain") { | ||
280 | |||
281 | $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}++; | ||
282 | $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0; | ||
283 | } | ||
284 | |||
285 | # Catch a full pcpu refill event | ||
286 | if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} && | ||
287 | $tracepoint ne "mm_page_alloc_zone_locked") { | ||
288 | $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}++; | ||
289 | $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0; | ||
290 | } | ||
291 | |||
292 | if ($sigint_pending) { | ||
293 | last EVENT_PROCESS; | ||
294 | } | ||
295 | } | ||
296 | } | ||
297 | |||
298 | sub dump_stats { | ||
299 | my $hashref = shift; | ||
300 | my %stats = %$hashref; | ||
301 | |||
302 | # Dump per-process stats | ||
303 | my $process_pid; | ||
304 | my $max_strlen = 0; | ||
305 | |||
306 | # Get the maximum process name | ||
307 | foreach $process_pid (keys %perprocesspid) { | ||
308 | my $len = length($process_pid); | ||
309 | if ($len > $max_strlen) { | ||
310 | $max_strlen = $len; | ||
311 | } | ||
312 | } | ||
313 | $max_strlen += 2; | ||
314 | |||
315 | printf("\n"); | ||
316 | printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n", | ||
317 | "Process", "Pages", "Pages", "Pages", "Pages", "PCPU", "PCPU", "PCPU", "Fragment", "Fragment", "MigType", "Fragment", "Fragment", "Unknown"); | ||
318 | printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n", | ||
319 | "details", "allocd", "allocd", "freed", "freed", "pages", "drains", "refills", "Fallback", "Causing", "Changed", "Severe", "Moderate", ""); | ||
320 | |||
321 | printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n", | ||
322 | "", "", "under lock", "direct", "pagevec", "drain", "", "", "", "", "", "", "", ""); | ||
323 | |||
324 | foreach $process_pid (keys %stats) { | ||
325 | # Dump final aggregates | ||
326 | if ($stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED}) { | ||
327 | $stats{$process_pid}->{HIGH_PCPU_DRAINS}++; | ||
328 | $stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0; | ||
329 | } | ||
330 | if ($stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED}) { | ||
331 | $stats{$process_pid}->{HIGH_PCPU_REFILLS}++; | ||
332 | $stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0; | ||
333 | } | ||
334 | |||
335 | printf("%-" . $max_strlen . "s %8d %10d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d\n", | ||
336 | $process_pid, | ||
337 | $stats{$process_pid}->{MM_PAGE_ALLOC}, | ||
338 | $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}, | ||
339 | $stats{$process_pid}->{MM_PAGE_FREE_DIRECT}, | ||
340 | $stats{$process_pid}->{MM_PAGEVEC_FREE}, | ||
341 | $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN}, | ||
342 | $stats{$process_pid}->{HIGH_PCPU_DRAINS}, | ||
343 | $stats{$process_pid}->{HIGH_PCPU_REFILLS}, | ||
344 | $stats{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}, | ||
345 | $stats{$process_pid}->{HIGH_EXT_FRAG}, | ||
346 | $stats{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}, | ||
347 | $stats{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}, | ||
348 | $stats{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}, | ||
349 | $stats{$process_pid}->{EVENT_UNKNOWN}); | ||
350 | } | ||
351 | } | ||
352 | |||
353 | sub aggregate_perprocesspid() { | ||
354 | my $process_pid; | ||
355 | my $process; | ||
356 | undef %perprocess; | ||
357 | |||
358 | foreach $process_pid (keys %perprocesspid) { | ||
359 | $process = $process_pid; | ||
360 | $process =~ s/-([0-9])*$//; | ||
361 | if ($process eq '') { | ||
362 | $process = "NO_PROCESS_NAME"; | ||
363 | } | ||
364 | |||
365 | $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}; | ||
366 | $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}; | ||
367 | $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}; | ||
368 | $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}; | ||
369 | $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}; | ||
370 | $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}; | ||
371 | $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}; | ||
372 | $perprocess{$process}->{MM_PAGE_ALLOC_EXTFRAG} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}; | ||
373 | $perprocess{$process}->{HIGH_EXT_FRAG} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAG}; | ||
374 | $perprocess{$process}->{HIGH_EXT_FRAGMENT_CHANGED} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}; | ||
375 | $perprocess{$process}->{HIGH_EXT_FRAGMENT_SEVERE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}; | ||
376 | $perprocess{$process}->{HIGH_EXT_FRAGMENT_MODERATE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}; | ||
377 | $perprocess{$process}->{EVENT_UNKNOWN} += $perprocesspid{$process_pid}->{EVENT_UNKNOWN}; | ||
378 | } | ||
379 | } | ||
380 | |||
381 | sub report() { | ||
382 | if (!$opt_ignorepid) { | ||
383 | dump_stats(\%perprocesspid); | ||
384 | } else { | ||
385 | aggregate_perprocesspid(); | ||
386 | dump_stats(\%perprocess); | ||
387 | } | ||
388 | } | ||
389 | |||
390 | # Process events or signals until neither is available | ||
391 | sub signal_loop() { | ||
392 | my $sigint_processed; | ||
393 | do { | ||
394 | $sigint_processed = 0; | ||
395 | process_events(); | ||
396 | |||
397 | # Handle pending signals if any | ||
398 | if ($sigint_pending) { | ||
399 | my $current_time = time; | ||
400 | |||
401 | if ($sigint_exit) { | ||
402 | print "Received exit signal\n"; | ||
403 | $sigint_pending = 0; | ||
404 | } | ||
405 | if ($sigint_report) { | ||
406 | if ($current_time >= $sigint_received + 2) { | ||
407 | report(); | ||
408 | $sigint_report = 0; | ||
409 | $sigint_pending = 0; | ||
410 | $sigint_processed = 1; | ||
411 | } | ||
412 | } | ||
413 | } | ||
414 | } while ($sigint_pending || $sigint_processed); | ||
415 | } | ||
416 | |||
417 | signal_loop(); | ||
418 | report(); | ||
diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt new file mode 100644 index 000000000000..5eb4e487e667 --- /dev/null +++ b/Documentation/trace/tracepoint-analysis.txt | |||
@@ -0,0 +1,327 @@ | |||
1 | Notes on Analysing Behaviour Using Events and Tracepoints | ||
2 | |||
3 | Documentation written by Mel Gorman | ||
4 | PCL information heavily based on email from Ingo Molnar | ||
5 | |||
6 | 1. Introduction | ||
7 | =============== | ||
8 | |||
9 | Tracepoints (see Documentation/trace/tracepoints.txt) can be used without | ||
10 | creating custom kernel modules to register probe functions using the event | ||
11 | tracing infrastructure. | ||
12 | |||
13 | Simplistically, tracepoints will represent an important event that can | ||
14 | be taken in conjunction with other tracepoints to build a "Big Picture" of | ||
15 | what is going on within the system. There are a large number of methods for | ||
16 | gathering and interpreting these events. Lacking any current Best Practices, | ||
17 | this document describes some of the methods that can be used. | ||
18 | |||
19 | This document assumes that debugfs is mounted on /sys/kernel/debug and that | ||
20 | the appropriate tracing options have been configured into the kernel. It is | ||
21 | assumed that the PCL tool tools/perf has been installed and is in your path. | ||
22 | |||
23 | 2. Listing Available Events | ||
24 | =========================== | ||
25 | |||
26 | 2.1 Standard Utilities | ||
27 | ---------------------- | ||
28 | |||
29 | All possible events are visible from /sys/kernel/debug/tracing/events. Simply | ||
30 | calling | ||
31 | |||
32 | $ find /sys/kernel/debug/tracing/events -type d | ||
33 | |||
34 | will give a fair indication of the number of events available. | ||
35 | |||
36 | 2.2 PCL | ||
37 | ------- | ||
38 | |||
39 | Discovery and enumeration of all counters and events, including tracepoints, | ||
40 | are available with the perf tool. Getting a list of available events is a | ||
41 | simple case of | ||
42 | |||
43 | $ perf list 2>&1 | grep Tracepoint | ||
44 | ext4:ext4_free_inode [Tracepoint event] | ||
45 | ext4:ext4_request_inode [Tracepoint event] | ||
46 | ext4:ext4_allocate_inode [Tracepoint event] | ||
47 | ext4:ext4_write_begin [Tracepoint event] | ||
48 | ext4:ext4_ordered_write_end [Tracepoint event] | ||
49 | [ .... remaining output snipped .... ] | ||
50 | |||
51 | |||
52 | 2. Enabling Events | ||
53 | ================== | ||
54 | |||
55 | 2.1 System-Wide Event Enabling | ||
56 | ------------------------------ | ||
57 | |||
58 | See Documentation/trace/events.txt for a proper description on how events | ||
59 | can be enabled system-wide. A short example of enabling all events related | ||
60 | to page allocation would look something like | ||
61 | |||
62 | $ for i in `find /sys/kernel/debug/tracing/events -name "enable" | grep mm_`; do echo 1 > $i; done | ||
63 | |||
64 | 2.2 System-Wide Event Enabling with SystemTap | ||
65 | --------------------------------------------- | ||
66 | |||
67 | In SystemTap, tracepoints are accessible using the kernel.trace() function | ||
68 | call. The following is an example that reports every 5 seconds what processes | ||
69 | were allocating the pages. | ||
70 | |||
71 | global page_allocs | ||
72 | |||
73 | probe kernel.trace("mm_page_alloc") { | ||
74 | page_allocs[execname()]++ | ||
75 | } | ||
76 | |||
77 | function print_count() { | ||
78 | printf ("%-25s %-s\n", "#Pages Allocated", "Process Name") | ||
79 | foreach (proc in page_allocs-) | ||
80 | printf("%-25d %s\n", page_allocs[proc], proc) | ||
81 | printf ("\n") | ||
82 | delete page_allocs | ||
83 | } | ||
84 | |||
85 | probe timer.s(5) { | ||
86 | print_count() | ||
87 | } | ||
88 | |||
89 | 2.3 System-Wide Event Enabling with PCL | ||
90 | --------------------------------------- | ||
91 | |||
92 | By specifying the -a switch and analysing sleep, the system-wide events | ||
93 | for a duration of time can be examined. | ||
94 | |||
95 | $ perf stat -a \ | ||
96 | -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ | ||
97 | -e kmem:mm_pagevec_free \ | ||
98 | sleep 10 | ||
99 | Performance counter stats for 'sleep 10': | ||
100 | |||
101 | 9630 kmem:mm_page_alloc | ||
102 | 2143 kmem:mm_page_free_direct | ||
103 | 7424 kmem:mm_pagevec_free | ||
104 | |||
105 | 10.002577764 seconds time elapsed | ||
106 | |||
107 | Similarly, one could execute a shell and exit it as desired to get a report | ||
108 | at that point. | ||
109 | |||
110 | 2.4 Local Event Enabling | ||
111 | ------------------------ | ||
112 | |||
113 | Documentation/trace/ftrace.txt describes how to enable events on a per-thread | ||
114 | basis using set_ftrace_pid. | ||
115 | |||
116 | 2.5 Local Event Enablement with PCL | ||
117 | ----------------------------------- | ||
118 | |||
119 | Events can be activated and tracked for the duration of a process on a local | ||
120 | basis using PCL as follows. | ||
121 | |||
122 | $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ | ||
123 | -e kmem:mm_pagevec_free ./hackbench 10 | ||
124 | Time: 0.909 | ||
125 | |||
126 | Performance counter stats for './hackbench 10': | ||
127 | |||
128 | 17803 kmem:mm_page_alloc | ||
129 | 12398 kmem:mm_page_free_direct | ||
130 | 4827 kmem:mm_pagevec_free | ||
131 | |||
132 | 0.973913387 seconds time elapsed | ||
133 | |||
134 | 3. Event Filtering | ||
135 | ================== | ||
136 | |||
137 | Documentation/trace/ftrace.txt covers in-depth how to filter events in | ||
138 | ftrace. Obviously using grep and awk on trace_pipe is an option as well | ||
139 | as any script reading trace_pipe. | ||
140 | |||
141 | 4. Analysing Event Variances with PCL | ||
142 | ===================================== | ||
143 | |||
144 | Any workload can exhibit variances between runs and it can be important | ||
145 | to know what the standard deviation is. By and large, this is left to the | ||
146 | performance analyst to do by hand. In the event that the discrete event | ||
147 | occurrences are useful to the performance analyst, then perf can be used. | ||
148 | |||
149 | $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct | ||
150 | -e kmem:mm_pagevec_free ./hackbench 10 | ||
151 | Time: 0.890 | ||
152 | Time: 0.895 | ||
153 | Time: 0.915 | ||
154 | Time: 1.001 | ||
155 | Time: 0.899 | ||
156 | |||
157 | Performance counter stats for './hackbench 10' (5 runs): | ||
158 | |||
159 | 16630 kmem:mm_page_alloc ( +- 3.542% ) | ||
160 | 11486 kmem:mm_page_free_direct ( +- 4.771% ) | ||
161 | 4730 kmem:mm_pagevec_free ( +- 2.325% ) | ||
162 | |||
163 | 0.982653002 seconds time elapsed ( +- 1.448% ) | ||
164 | |||
165 | In the event that some higher-level event is required that depends on some | ||
166 | aggregation of discrete events, then a script would need to be developed. | ||
167 | |||
168 | Using --repeat, it is also possible to view how events are fluctuating over | ||
169 | time on a system wide basis using -a and sleep. | ||
170 | |||
171 | $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ | ||
172 | -e kmem:mm_pagevec_free \ | ||
173 | -a --repeat 10 \ | ||
174 | sleep 1 | ||
175 | Performance counter stats for 'sleep 1' (10 runs): | ||
176 | |||
177 | 1066 kmem:mm_page_alloc ( +- 26.148% ) | ||
178 | 182 kmem:mm_page_free_direct ( +- 5.464% ) | ||
179 | 890 kmem:mm_pagevec_free ( +- 30.079% ) | ||
180 | |||
181 | 1.002251757 seconds time elapsed ( +- 0.005% ) | ||
182 | |||
183 | 5. Higher-Level Analysis with Helper Scripts | ||
184 | ============================================ | ||
185 | |||
186 | When events are enabled the events that are triggering can be read from | ||
187 | /sys/kernel/debug/tracing/trace_pipe in human-readable format although binary | ||
188 | options exist as well. By post-processing the output, further information can | ||
189 | be gathered on-line as appropriate. Examples of post-processing might include | ||
190 | |||
191 | o Reading information from /proc for the PID that triggered the event | ||
192 | o Deriving a higher-level event from a series of lower-level events. | ||
193 | o Calculate latencies between two events | ||
194 | |||
195 | Documentation/trace/postprocess/trace-pagealloc-postprocess.pl is an example | ||
196 | script that can read trace_pipe from STDIN or a copy of a trace. When used | ||
197 | on-line, it can be interrupted once to generate a report without exiting | ||
198 | and twice to exit. | ||
199 | |||
200 | Simplistically, the script just reads STDIN and counts up events but it | ||
201 | also can do more such as | ||
202 | |||
203 | o Derive high-level events from many low-level events. If a number of pages | ||
204 | are freed to the main allocator from the per-CPU lists, it recognises | ||
205 | that as one per-CPU drain even though there is no specific tracepoint | ||
206 | for that event | ||
207 | o It can aggregate based on PID or individual process number | ||
208 | o In the event memory is getting externally fragmented, it reports | ||
209 | on whether the fragmentation event was severe or moderate. | ||
210 | o When receiving an event about a PID, it can record who the parent was so | ||
211 | that if large numbers of events are coming from very short-lived | ||
212 | processes, the parent process responsible for creating all the helpers | ||
213 | can be identified | ||
214 | |||
215 | 6. Lower-Level Analysis with PCL | ||
216 | ================================ | ||
217 | |||
218 | There may also be a requirement to identify what functions within a program | ||
219 | were generating events within the kernel. To begin this sort of analysis, the | ||
220 | data must be recorded. At the time of writing, this required root | ||
221 | |||
222 | $ perf record -c 1 \ | ||
223 | -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ | ||
224 | -e kmem:mm_pagevec_free \ | ||
225 | ./hackbench 10 | ||
226 | Time: 0.894 | ||
227 | [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ] | ||
228 | |||
229 | Note the use of '-c 1' to set the event period to sample. The default sample | ||
230 | period is quite high to minimise overhead but the information collected can be | ||
231 | very coarse as a result. | ||
232 | |||
233 | This record outputted a file called perf.data which can be analysed using | ||
234 | perf report. | ||
235 | |||
236 | $ perf report | ||
237 | # Samples: 30922 | ||
238 | # | ||
239 | # Overhead Command Shared Object | ||
240 | # ........ ......... ................................ | ||
241 | # | ||
242 | 87.27% hackbench [vdso] | ||
243 | 6.85% hackbench /lib/i686/cmov/libc-2.9.so | ||
244 | 2.62% hackbench /lib/ld-2.9.so | ||
245 | 1.52% perf [vdso] | ||
246 | 1.22% hackbench ./hackbench | ||
247 | 0.48% hackbench [kernel] | ||
248 | 0.02% perf /lib/i686/cmov/libc-2.9.so | ||
249 | 0.01% perf /usr/bin/perf | ||
250 | 0.01% perf /lib/ld-2.9.so | ||
251 | 0.00% hackbench /lib/i686/cmov/libpthread-2.9.so | ||
252 | # | ||
253 | # (For more details, try: perf report --sort comm,dso,symbol) | ||
254 | # | ||
255 | |||
256 | According to this, the vast majority of events were triggered | ||
257 | within the VDSO. With simple binaries, this will often be the case so let's | ||
258 | take a slightly different example. In the course of writing this, it was | ||
259 | noticed that X was generating an insane amount of page allocations so let's look | ||
260 | at it | ||
261 | |||
262 | $ perf record -c 1 -f \ | ||
263 | -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ | ||
264 | -e kmem:mm_pagevec_free \ | ||
265 | -p `pidof X` | ||
266 | |||
267 | This was interrupted after a few seconds and | ||
268 | |||
269 | $ perf report | ||
270 | # Samples: 27666 | ||
271 | # | ||
272 | # Overhead Command Shared Object | ||
273 | # ........ ....... ....................................... | ||
274 | # | ||
275 | 51.95% Xorg [vdso] | ||
276 | 47.95% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 | ||
277 | 0.09% Xorg /lib/i686/cmov/libc-2.9.so | ||
278 | 0.01% Xorg [kernel] | ||
279 | # | ||
280 | # (For more details, try: perf report --sort comm,dso,symbol) | ||
281 | # | ||
282 | |||
283 | So, almost half of the events are occurring in a library. To get an idea of which | ||
284 | symbol: | ||
285 | |||
286 | $ perf report --sort comm,dso,symbol | ||
287 | # Samples: 27666 | ||
288 | # | ||
289 | # Overhead Command Shared Object Symbol | ||
290 | # ........ ....... ....................................... ...... | ||
291 | # | ||
292 | 51.95% Xorg [vdso] [.] 0x000000ffffe424 | ||
293 | 47.93% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixmanFillsse2 | ||
294 | 0.09% Xorg /lib/i686/cmov/libc-2.9.so [.] _int_malloc | ||
295 | 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixman_region32_copy_f | ||
296 | 0.01% Xorg [kernel] [k] read_hpet | ||
297 | 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] get_fast_path | ||
298 | 0.00% Xorg [kernel] [k] ftrace_trace_userstack | ||
299 | |||
300 | To see where within the function pixmanFillsse2 things are going wrong | ||
301 | |||
302 | $ perf annotate pixmanFillsse2 | ||
303 | [ ... ] | ||
304 | 0.00 : 34eeb: 0f 18 08 prefetcht0 (%eax) | ||
305 | : } | ||
306 | : | ||
307 | : extern __inline void __attribute__((__gnu_inline__, __always_inline__, _ | ||
308 | : _mm_store_si128 (__m128i *__P, __m128i __B) : { | ||
309 | : *__P = __B; | ||
310 | 12.40 : 34eee: 66 0f 7f 80 40 ff ff movdqa %xmm0,-0xc0(%eax) | ||
311 | 0.00 : 34ef5: ff | ||
312 | 12.40 : 34ef6: 66 0f 7f 80 50 ff ff movdqa %xmm0,-0xb0(%eax) | ||
313 | 0.00 : 34efd: ff | ||
314 | 12.39 : 34efe: 66 0f 7f 80 60 ff ff movdqa %xmm0,-0xa0(%eax) | ||
315 | 0.00 : 34f05: ff | ||
316 | 12.67 : 34f06: 66 0f 7f 80 70 ff ff movdqa %xmm0,-0x90(%eax) | ||
317 | 0.00 : 34f0d: ff | ||
318 | 12.58 : 34f0e: 66 0f 7f 40 80 movdqa %xmm0,-0x80(%eax) | ||
319 | 12.31 : 34f13: 66 0f 7f 40 90 movdqa %xmm0,-0x70(%eax) | ||
320 | 12.40 : 34f18: 66 0f 7f 40 a0 movdqa %xmm0,-0x60(%eax) | ||
321 | 12.31 : 34f1d: 66 0f 7f 40 b0 movdqa %xmm0,-0x50(%eax) | ||
322 | |||
323 | At a glance, it looks like the time is being spent copying pixmaps to | ||
324 | the card. Further investigation would be needed to determine why pixmaps | ||
325 | are being copied around so much but a starting point would be to take an | ||
326 | ancient build of libpixman out of the library path where it was totally | ||
327 | forgotten about from months ago! | ||
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 2f77ced35df7..e57d6a9dd32b 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX | |||
@@ -6,6 +6,8 @@ balance | |||
6 | - various information on memory balancing. | 6 | - various information on memory balancing. |
7 | hugetlbpage.txt | 7 | hugetlbpage.txt |
8 | - a brief summary of hugetlbpage support in the Linux kernel. | 8 | - a brief summary of hugetlbpage support in the Linux kernel. |
9 | ksm.txt | ||
10 | - how to use the Kernel Samepage Merging feature. | ||
9 | locking | 11 | locking |
10 | - info on how locking and synchronization is done in the Linux vm code. | 12 | - info on how locking and synchronization is done in the Linux vm code. |
11 | numa | 13 | numa |
@@ -20,3 +22,5 @@ slabinfo.c | |||
20 | - source code for a tool to get reports about slabs. | 22 | - source code for a tool to get reports about slabs. |
21 | slub.txt | 23 | slub.txt |
22 | - a short users guide for SLUB. | 24 | - a short users guide for SLUB. |
25 | map_hugetlb.c | ||
26 | - an example program that uses the MAP_HUGETLB mmap flag. | ||
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index ea8714fcc3ad..82a7bd1800b2 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt | |||
@@ -18,13 +18,13 @@ First the Linux kernel needs to be built with the CONFIG_HUGETLBFS | |||
18 | automatically when CONFIG_HUGETLBFS is selected) configuration | 18 | automatically when CONFIG_HUGETLBFS is selected) configuration |
19 | options. | 19 | options. |
20 | 20 | ||
21 | The kernel built with hugepage support should show the number of configured | 21 | The kernel built with huge page support should show the number of configured |
22 | hugepages in the system by running the "cat /proc/meminfo" command. | 22 | huge pages in the system by running the "cat /proc/meminfo" command. |
23 | 23 | ||
24 | /proc/meminfo also provides information about the total number of hugetlb | 24 | /proc/meminfo also provides information about the total number of hugetlb |
25 | pages configured in the kernel. It also displays information about the | 25 | pages configured in the kernel. It also displays information about the |
26 | number of free hugetlb pages at any time. It also displays information about | 26 | number of free hugetlb pages at any time. It also displays information about |
27 | the configured hugepage size - this is needed for generating the proper | 27 | the configured huge page size - this is needed for generating the proper |
28 | alignment and size of the arguments to the above system calls. | 28 | alignment and size of the arguments to the above system calls. |
29 | 29 | ||
30 | The output of "cat /proc/meminfo" will have lines like: | 30 | The output of "cat /proc/meminfo" will have lines like: |
@@ -37,25 +37,27 @@ HugePages_Surp: yyy | |||
37 | Hugepagesize: zzz kB | 37 | Hugepagesize: zzz kB |
38 | 38 | ||
39 | where: | 39 | where: |
40 | HugePages_Total is the size of the pool of hugepages. | 40 | HugePages_Total is the size of the pool of huge pages. |
41 | HugePages_Free is the number of hugepages in the pool that are not yet | 41 | HugePages_Free is the number of huge pages in the pool that are not yet |
42 | allocated. | 42 | allocated. |
43 | HugePages_Rsvd is short for "reserved," and is the number of hugepages | 43 | HugePages_Rsvd is short for "reserved," and is the number of huge pages for |
44 | for which a commitment to allocate from the pool has been made, but no | 44 | which a commitment to allocate from the pool has been made, |
45 | allocation has yet been made. It's vaguely analogous to overcommit. | 45 | but no allocation has yet been made. Reserved huge pages |
46 | HugePages_Surp is short for "surplus," and is the number of hugepages in | 46 | guarantee that an application will be able to allocate a |
47 | the pool above the value in /proc/sys/vm/nr_hugepages. The maximum | 47 | huge page from the pool of huge pages at fault time. |
48 | number of surplus hugepages is controlled by | 48 | HugePages_Surp is short for "surplus," and is the number of huge pages in |
49 | /proc/sys/vm/nr_overcommit_hugepages. | 49 | the pool above the value in /proc/sys/vm/nr_hugepages. The |
50 | maximum number of surplus huge pages is controlled by | ||
51 | /proc/sys/vm/nr_overcommit_hugepages. | ||
50 | 52 | ||
51 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured | 53 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured |
52 | in the kernel. | 54 | in the kernel. |
53 | 55 | ||
54 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb | 56 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb |
55 | pages in the kernel. Super user can dynamically request more (or free some | 57 | pages in the kernel. Super user can dynamically request more (or free some |
56 | pre-configured) hugepages. | 58 | pre-configured) huge pages. |
57 | The allocation (or deallocation) of hugetlb pages is possible only if there are | 59 | The allocation (or deallocation) of hugetlb pages is possible only if there are |
58 | enough physically contiguous free pages in system (freeing of hugepages is | 60 | enough physically contiguous free pages in system (freeing of huge pages is |
59 | possible only if there are enough hugetlb pages free that can be transferred | 61 | possible only if there are enough hugetlb pages free that can be transferred |
60 | back to regular memory pool). | 62 | back to regular memory pool). |
61 | 63 | ||
@@ -67,43 +69,82 @@ use either the mmap system call or shared memory system calls to start using | |||
67 | the huge pages. It is required that the system administrator preallocate | 69 | the huge pages. It is required that the system administrator preallocate |
68 | enough memory for huge page purposes. | 70 | enough memory for huge page purposes. |
69 | 71 | ||
70 | Use the following command to dynamically allocate/deallocate hugepages: | 72 | The administrator can preallocate huge pages on the kernel boot command line by |
73 | specifying the "hugepages=N" parameter, where 'N' = the number of huge pages | ||
74 | requested. This is the most reliable method for preallocating huge pages as | ||
75 | memory has not yet become fragmented. | ||
76 | |||
77 | Some platforms support multiple huge page sizes. To preallocate huge pages | ||
78 | of a specific size, one must precede the huge pages boot command parameters | ||
79 | with a huge page size selection parameter "hugepagesz=<size>". <size> must | ||
80 | be specified in bytes with optional scale suffix [kKmMgG]. The default huge | ||
81 | page size may be selected with the "default_hugepagesz=<size>" boot parameter. | ||
82 | |||
83 | /proc/sys/vm/nr_hugepages indicates the current number of configured [default | ||
84 | size] hugetlb pages in the kernel. Super user can dynamically request more | ||
85 | (or free some pre-configured) huge pages. | ||
86 | |||
87 | Use the following command to dynamically allocate/deallocate default sized | ||
88 | huge pages: | ||
71 | 89 | ||
72 | echo 20 > /proc/sys/vm/nr_hugepages | 90 | echo 20 > /proc/sys/vm/nr_hugepages |
73 | 91 | ||
74 | This command will try to configure 20 hugepages in the system. The success | 92 | This command will try to configure 20 default sized huge pages in the system. |
75 | or failure of allocation depends on the amount of physically contiguous | 93 | On a NUMA platform, the kernel will attempt to distribute the huge page pool |
76 | memory that is preset in system at this time. System administrators may want | 94 | over all the on-line nodes. These huge pages, allocated when nr_hugepages |
77 | to put this command in one of the local rc init files. This will enable the | 95 | is increased, are called "persistent huge pages". |
78 | kernel to request huge pages early in the boot process (when the possibility | 96 | |
79 | of getting physical contiguous pages is still very high). In either | 97 | The success or failure of huge page allocation depends on the amount of |
80 | case, administrators will want to verify the number of hugepages actually | 98 | physically contiguous memory that is present in the system at the time of the |
81 | allocated by checking the sysctl or meminfo. | 99 | allocation attempt. If the kernel is unable to allocate huge pages from |
82 | 100 | some nodes in a NUMA system, it will attempt to make up the difference by | |
83 | /proc/sys/vm/nr_overcommit_hugepages indicates how large the pool of | 101 | allocating extra pages on other nodes with sufficient available contiguous |
84 | hugepages can grow, if more hugepages than /proc/sys/vm/nr_hugepages are | 102 | memory, if any. |
85 | requested by applications. echo'ing any non-zero value into this file | 103 | |
86 | indicates that the hugetlb subsystem is allowed to try to obtain | 104 | System administrators may want to put this command in one of the local rc init |
87 | hugepages from the buddy allocator, if the normal pool is exhausted. As | 105 | files. This will enable the kernel to request huge pages early in the boot |
88 | these surplus hugepages go out of use, they are freed back to the buddy | 106 | process when the possibility of getting physical contiguous pages is still |
107 | very high. Administrators can verify the number of huge pages actually | ||
108 | allocated by checking the sysctl or meminfo. To check the per node | ||
109 | distribution of huge pages in a NUMA system, use: | ||
110 | |||
111 | cat /sys/devices/system/node/node*/meminfo | fgrep Huge | ||
112 | |||
113 | /proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of | ||
114 | huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are | ||
115 | requested by applications. Writing any non-zero value into this file | ||
116 | indicates that the hugetlb subsystem is allowed to try to obtain "surplus" | ||
117 | huge pages from the buddy allocator, when the normal pool is exhausted. As | ||
118 | these surplus huge pages go out of use, they are freed back to the buddy | ||
89 | allocator. | 119 | allocator. |
90 | 120 | ||
121 | When increasing the huge page pool size via nr_hugepages, any surplus | ||
122 | pages will first be promoted to persistent huge pages. Then, additional | ||
123 | huge pages will be allocated, if necessary and if possible, to fulfill | ||
124 | the new huge page pool size. | ||
125 | |||
126 | The administrator may shrink the pool of preallocated huge pages for | ||
127 | the default huge page size by setting the nr_hugepages sysctl to a | ||
128 | smaller value. The kernel will attempt to balance the freeing of huge pages | ||
129 | across all on-line nodes. Any free huge pages on the selected nodes will | ||
130 | be freed back to the buddy allocator. | ||
131 | |||
91 | Caveat: Shrinking the pool via nr_hugepages such that it becomes less | 132 | Caveat: Shrinking the pool via nr_hugepages such that it becomes less |
92 | than the number of hugepages in use will convert the balance to surplus | 133 | than the number of huge pages in use will convert the balance to surplus |
93 | huge pages even if it would exceed the overcommit value. As long as | 134 | huge pages even if it would exceed the overcommit value. As long as |
94 | this condition holds, however, no more surplus huge pages will be | 135 | this condition holds, however, no more surplus huge pages will be |
95 | allowed on the system until one of the two sysctls are increased | 136 | allowed on the system until one of the two sysctls are increased |
96 | sufficiently, or the surplus huge pages go out of use and are freed. | 137 | sufficiently, or the surplus huge pages go out of use and are freed. |
97 | 138 | ||
98 | With support for multiple hugepage pools at run-time available, much of | 139 | With support for multiple huge page pools at run-time available, much of |
99 | the hugepage userspace interface has been duplicated in sysfs. The above | 140 | the huge page userspace interface has been duplicated in sysfs. The above |
100 | information applies to the default hugepage size (which will be | 141 | information applies to the default huge page size which will be |
101 | controlled by the proc interfaces for backwards compatibility). The root | 142 | controlled by the /proc interfaces for backwards compatibility. The root |
102 | hugepage control directory is | 143 | huge page control directory in sysfs is: |
103 | 144 | ||
104 | /sys/kernel/mm/hugepages | 145 | /sys/kernel/mm/hugepages |
105 | 146 | ||
106 | For each hugepage size supported by the running kernel, a subdirectory | 147 | For each huge page size supported by the running kernel, a subdirectory |
107 | will exist, of the form | 148 | will exist, of the form |
108 | 149 | ||
109 | hugepages-${size}kB | 150 | hugepages-${size}kB |
@@ -116,9 +157,9 @@ Inside each of these directories, the same set of files will exist: | |||
116 | resv_hugepages | 157 | resv_hugepages |
117 | surplus_hugepages | 158 | surplus_hugepages |
118 | 159 | ||
119 | which function as described above for the default hugepage-sized case. | 160 | which function as described above for the default huge page-sized case. |
120 | 161 | ||
121 | If the user applications are going to request hugepages using mmap system | 162 | If the user applications are going to request huge pages using mmap system |
122 | call, then it is required that system administrator mount a file system of | 163 | call, then it is required that system administrator mount a file system of |
123 | type hugetlbfs: | 164 | type hugetlbfs: |
124 | 165 | ||
@@ -127,7 +168,7 @@ type hugetlbfs: | |||
127 | none /mnt/huge | 168 | none /mnt/huge |
128 | 169 | ||
129 | This command mounts a (pseudo) filesystem of type hugetlbfs on the directory | 170 | This command mounts a (pseudo) filesystem of type hugetlbfs on the directory |
130 | /mnt/huge. Any files created on /mnt/huge uses hugepages. The uid and gid | 171 | /mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid |
131 | options sets the owner and group of the root of the file system. By default | 172 | options sets the owner and group of the root of the file system. By default |
132 | the uid and gid of the current process are taken. The mode option sets the | 173 | the uid and gid of the current process are taken. The mode option sets the |
133 | mode of root of file system to value & 0777. This value is given in octal. | 174 | mode of root of file system to value & 0777. This value is given in octal. |
@@ -146,24 +187,26 @@ Regular chown, chgrp, and chmod commands (with right permissions) could be | |||
146 | used to change the file attributes on hugetlbfs. | 187 | used to change the file attributes on hugetlbfs. |
147 | 188 | ||
148 | Also, it is important to note that no such mount command is required if the | 189 | Also, it is important to note that no such mount command is required if the |
149 | applications are going to use only shmat/shmget system calls. Users who | 190 | applications are going to use only shmat/shmget system calls or mmap with |
150 | wish to use hugetlb page via shared memory segment should be a member of | 191 | MAP_HUGETLB. Users who wish to use hugetlb page via shared memory segment |
151 | a supplementary group and system admin needs to configure that gid into | 192 | should be a member of a supplementary group and system admin needs to |
152 | /proc/sys/vm/hugetlb_shm_group. It is possible for same or different | 193 | configure that gid into /proc/sys/vm/hugetlb_shm_group. It is possible for |
153 | applications to use any combination of mmaps and shm* calls, though the | 194 | same or different applications to use any combination of mmaps and shm* |
154 | mount of filesystem will be required for using mmap calls. | 195 | calls, though the mount of filesystem will be required for using mmap calls |
196 | without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see | ||
197 | map_hugetlb.c. | ||
155 | 198 | ||
156 | ******************************************************************* | 199 | ******************************************************************* |
157 | 200 | ||
158 | /* | 201 | /* |
159 | * Example of using hugepage memory in a user application using Sys V shared | 202 | * Example of using huge page memory in a user application using Sys V shared |
160 | * memory system calls. In this example the app is requesting 256MB of | 203 | * memory system calls. In this example the app is requesting 256MB of |
161 | * memory that is backed by huge pages. The application uses the flag | 204 | * memory that is backed by huge pages. The application uses the flag |
162 | * SHM_HUGETLB in the shmget system call to inform the kernel that it is | 205 | * SHM_HUGETLB in the shmget system call to inform the kernel that it is |
163 | * requesting hugepages. | 206 | * requesting huge pages. |
164 | * | 207 | * |
165 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for | 208 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for |
166 | * hugepages. That means the addresses starting with 0x800000... will need | 209 | * huge pages. That means the addresses starting with 0x800000... will need |
167 | * to be specified. Specifying a fixed address is not required on ppc64, | 210 | * to be specified. Specifying a fixed address is not required on ppc64, |
168 | * i386 or x86_64. | 211 | * i386 or x86_64. |
169 | * | 212 | * |
@@ -252,14 +295,14 @@ int main(void) | |||
252 | ******************************************************************* | 295 | ******************************************************************* |
253 | 296 | ||
254 | /* | 297 | /* |
255 | * Example of using hugepage memory in a user application using the mmap | 298 | * Example of using huge page memory in a user application using the mmap |
256 | * system call. Before running this application, make sure that the | 299 | * system call. Before running this application, make sure that the |
257 | * administrator has mounted the hugetlbfs filesystem (on some directory | 300 | * administrator has mounted the hugetlbfs filesystem (on some directory |
258 | * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this | 301 | * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this |
259 | * example, the app is requesting memory of size 256MB that is backed by | 302 | * example, the app is requesting memory of size 256MB that is backed by |
260 | * huge pages. | 303 | * huge pages. |
261 | * | 304 | * |
262 | * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. | 305 | * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages. |
263 | * That means the addresses starting with 0x800000... will need to be | 306 | * That means the addresses starting with 0x800000... will need to be |
264 | * specified. Specifying a fixed address is not required on ppc64, i386 | 307 | * specified. Specifying a fixed address is not required on ppc64, i386 |
265 | * or x86_64. | 308 | * or x86_64. |
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt new file mode 100644 index 000000000000..72a22f65960e --- /dev/null +++ b/Documentation/vm/ksm.txt | |||
@@ -0,0 +1,89 @@ | |||
1 | How to use the Kernel Samepage Merging feature | ||
2 | ---------------------------------------------- | ||
3 | |||
4 | KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, | ||
5 | added to the Linux kernel in 2.6.32. See mm/ksm.c for its implementation, | ||
6 | and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/ | ||
7 | |||
8 | The KSM daemon ksmd periodically scans those areas of user memory which | ||
9 | have been registered with it, looking for pages of identical content which | ||
10 | can be replaced by a single write-protected page (which is automatically | ||
11 | copied if a process later wants to update its content). | ||
12 | |||
13 | KSM was originally developed for use with KVM (where it was known as | ||
14 | Kernel Shared Memory), to fit more virtual machines into physical memory, | ||
15 | by sharing the data common between them. But it can be useful to any | ||
16 | application which generates many instances of the same data. | ||
17 | |||
18 | KSM only merges anonymous (private) pages, never pagecache (file) pages. | ||
19 | KSM's merged pages are at present locked into kernel memory for as long | ||
20 | as they are shared: so cannot be swapped out like the user pages they | ||
21 | replace (but swapping KSM pages should follow soon in a later release). | ||
22 | |||
23 | KSM only operates on those areas of address space which an application | ||
24 | has advised to be likely candidates for merging, by using the madvise(2) | ||
25 | system call: int madvise(addr, length, MADV_MERGEABLE). | ||
26 | |||
27 | The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel | ||
28 | that advice and restore unshared pages: whereupon KSM unmerges whatever | ||
29 | it merged in that range. Note: this unmerging call may suddenly require | ||
30 | more memory than is available - possibly failing with EAGAIN, but more | ||
31 | probably arousing the Out-Of-Memory killer. | ||
32 | |||
33 | If KSM is not configured into the running kernel, madvise MADV_MERGEABLE | ||
34 | and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was | ||
35 | built with CONFIG_KSM=y, those calls will normally succeed: even if the | ||
36 | KSM daemon is not currently running, MADV_MERGEABLE still registers | ||
37 | the range for whenever the KSM daemon is started; even if the range | ||
38 | cannot contain any pages which KSM could actually merge; even if | ||
39 | MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. | ||
40 | |||
41 | Like other madvise calls, they are intended for use on mapped areas of | ||
42 | the user address space: they will report ENOMEM if the specified range | ||
43 | includes unmapped gaps (though working on the intervening mapped areas), | ||
44 | and might fail with EAGAIN if not enough memory for internal structures. | ||
45 | |||
46 | Applications should be considerate in their use of MADV_MERGEABLE, | ||
47 | restricting its use to areas likely to benefit. KSM's scans may use | ||
48 | a lot of processing power, and its kernel-resident pages are a limited | ||
49 | resource. Some installations will disable KSM for these reasons. | ||
50 | |||
51 | The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, | ||
52 | readable by all but writable only by root: | ||
53 | |||
54 | max_kernel_pages - set to maximum number of kernel pages that KSM may use | ||
55 | e.g. "echo 2000 > /sys/kernel/mm/ksm/max_kernel_pages" | ||
56 | Value 0 imposes no limit on the kernel pages KSM may use; | ||
57 | but note that any process using MADV_MERGEABLE can cause | ||
58 | KSM to allocate these pages, unswappable until it exits. | ||
59 | Default: 2000 (chosen for demonstration purposes) | ||
60 | |||
61 | pages_to_scan - how many present pages to scan before ksmd goes to sleep | ||
62 | e.g. "echo 200 > /sys/kernel/mm/ksm/pages_to_scan" | ||
63 | Default: 200 (chosen for demonstration purposes) | ||
64 | |||
65 | sleep_millisecs - how many milliseconds ksmd should sleep before next scan | ||
66 | e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" | ||
67 | Default: 20 (chosen for demonstration purposes) | ||
68 | |||
69 | run - set 0 to stop ksmd from running but keep merged pages, | ||
70 | set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", | ||
71 | set 2 to stop ksmd and unmerge all pages currently merged, | ||
72 | but leave mergeable areas registered for next run | ||
73 | Default: 1 (for immediate use by apps which register) | ||
74 | |||
75 | The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: | ||
76 | |||
77 | pages_shared - how many shared unswappable kernel pages KSM is using | ||
78 | pages_sharing - how many more sites are sharing them i.e. how much saved | ||
79 | pages_unshared - how many pages unique but repeatedly checked for merging | ||
80 | pages_volatile - how many pages changing too fast to be placed in a tree | ||
81 | full_scans - how many times all mergeable areas have been scanned | ||
82 | |||
83 | A high ratio of pages_sharing to pages_shared indicates good sharing, but | ||
84 | a high ratio of pages_unshared to pages_sharing indicates wasted effort. | ||
85 | pages_volatile embraces several different kinds of activity, but a high | ||
86 | proportion there would also indicate poor use of madvise MADV_MERGEABLE. | ||
87 | |||
88 | Izik Eidus, | ||
89 | Hugh Dickins, 30 July 2009 | ||
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c new file mode 100644 index 000000000000..e2bdae37f499 --- /dev/null +++ b/Documentation/vm/map_hugetlb.c | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * Example of using hugepage memory in a user application using the mmap | ||
3 | * system call with MAP_HUGETLB flag. Before running this program make | ||
4 | * sure the administrator has allocated enough default sized huge pages | ||
5 | * to cover the 256 MB allocation. | ||
6 | * | ||
7 | * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. | ||
8 | * That means the addresses starting with 0x800000... will need to be | ||
9 | * specified. Specifying a fixed address is not required on ppc64, i386 | ||
10 | * or x86_64. | ||
11 | */ | ||
12 | #include <stdlib.h> | ||
13 | #include <stdio.h> | ||
14 | #include <unistd.h> | ||
15 | #include <sys/mman.h> | ||
16 | #include <fcntl.h> | ||
17 | |||
/* Size of the example mapping: 256 MB. */
#define LENGTH (256UL*1024*1024)
#define PROTECTION (PROT_READ | PROT_WRITE)

/*
 * Fallback for C libraries whose headers do not provide MAP_HUGETLB yet.
 * The value is architecture specific; 0x40000 is the asm-generic one
 * (used by x86 among others).  0x40 must not be used here: on x86 that
 * bit is MAP_32BIT, so the mapping would silently not use huge pages.
 */
#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000
#endif

/* Only ia64 requires a fixed address for the mapping */
#ifdef __ia64__
#define ADDR ((void *)0x8000000000000000UL)
#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
#else
#define ADDR ((void *)0x0UL)
#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
#endif
33 | |||
/*
 * Print the first unsigned int stored at addr, in hexadecimal.
 * addr must point to at least sizeof(unsigned int) readable bytes.
 */
void check_bytes(char *addr)
{
	const unsigned int *first_word = (const unsigned int *)addr;

	printf("First hex is %x\n", *first_word);
}
38 | |||
39 | void write_bytes(char *addr) | ||
40 | { | ||
41 | unsigned long i; | ||
42 | |||
43 | for (i = 0; i < LENGTH; i++) | ||
44 | *(addr + i) = (char)i; | ||
45 | } | ||
46 | |||
47 | void read_bytes(char *addr) | ||
48 | { | ||
49 | unsigned long i; | ||
50 | |||
51 | check_bytes(addr); | ||
52 | for (i = 0; i < LENGTH; i++) | ||
53 | if (*(addr + i) != (char)i) { | ||
54 | printf("Mismatch at %lu\n", i); | ||
55 | break; | ||
56 | } | ||
57 | } | ||
58 | |||
59 | int main(void) | ||
60 | { | ||
61 | void *addr; | ||
62 | |||
63 | addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); | ||
64 | if (addr == MAP_FAILED) { | ||
65 | perror("mmap"); | ||
66 | exit(1); | ||
67 | } | ||
68 | |||
69 | printf("Returned address is %p\n", addr); | ||
70 | check_bytes(addr); | ||
71 | write_bytes(addr); | ||
72 | read_bytes(addr); | ||
73 | |||
74 | munmap(addr, LENGTH); | ||
75 | |||
76 | return 0; | ||
77 | } | ||