aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/DocBook/mtdnand.tmpl2
-rw-r--r--Documentation/DocBook/scsi.tmpl2
-rw-r--r--Documentation/SubmittingPatches2
-rw-r--r--Documentation/filesystems/nfs41-server.txt54
-rw-r--r--Documentation/filesystems/nfsroot.txt2
-rw-r--r--Documentation/filesystems/proc.txt22
-rw-r--r--Documentation/gcov.txt2
-rw-r--r--Documentation/hwmon/hpfall.c115
-rw-r--r--Documentation/hwmon/pc874272
-rw-r--r--Documentation/kernel-parameters.txt2
-rw-r--r--Documentation/kmemcheck.txt21
-rw-r--r--Documentation/memory.txt31
-rw-r--r--Documentation/networking/regulatory.txt2
-rw-r--r--Documentation/numastat.txt8
-rw-r--r--Documentation/powerpc/dts-bindings/marvell.txt2
-rw-r--r--Documentation/scsi/ChangeLog.megaraid2
-rw-r--r--Documentation/scsi/scsi_fc_transport.txt2
-rw-r--r--Documentation/sound/alsa/HD-Audio-Models.txt2
-rw-r--r--Documentation/sysctl/kernel.txt30
-rw-r--r--Documentation/sysctl/vm.txt4
-rw-r--r--Documentation/trace/events-kmem.txt107
-rw-r--r--Documentation/trace/events.txt2
-rw-r--r--Documentation/trace/ftrace.txt2
-rw-r--r--Documentation/trace/postprocess/trace-pagealloc-postprocess.pl418
-rw-r--r--Documentation/trace/tracepoint-analysis.txt327
-rw-r--r--Documentation/vm/00-INDEX4
-rw-r--r--Documentation/vm/hugetlbpage.txt147
-rw-r--r--Documentation/vm/ksm.txt89
-rw-r--r--Documentation/vm/map_hugetlb.c77
29 files changed, 1310 insertions, 172 deletions
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
index 8e145857fc9d..df0d089d0fb9 100644
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -568,7 +568,7 @@ static void board_select_chip (struct mtd_info *mtd, int chip)
568 <para> 568 <para>
569 The blocks in which the tables are stored are protected against 569 The blocks in which the tables are stored are protected against
570 accidental access by marking them bad in the memory bad block 570 accidental access by marking them bad in the memory bad block
571 table. The bad block table managment functions are allowed 571 table. The bad block table management functions are allowed
572 to circumvent this protection. 572 to circumvent this protection.
573 </para> 573 </para>
574 <para> 574 <para>
diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl
index 10a150ae2a7e..d87f4569e768 100644
--- a/Documentation/DocBook/scsi.tmpl
+++ b/Documentation/DocBook/scsi.tmpl
@@ -317,7 +317,7 @@
317 <para> 317 <para>
318 The SAS transport class contains common code to deal with SAS HBAs, 318 The SAS transport class contains common code to deal with SAS HBAs,
319 an approximated representation of SAS topologies in the driver model, 319 an approximated representation of SAS topologies in the driver model,
320 and various sysfs attributes to expose these topologies and managment 320 and various sysfs attributes to expose these topologies and management
321 interfaces to userspace. 321 interfaces to userspace.
322 </para> 322 </para>
323 <para> 323 <para>
diff --git a/Documentation/SubmittingPatches b/Documentation/SubmittingPatches
index 5c555a8b39e5..b7f9d3b4bbf6 100644
--- a/Documentation/SubmittingPatches
+++ b/Documentation/SubmittingPatches
@@ -183,7 +183,7 @@ the MAN-PAGES maintainer (as listed in the MAINTAINERS file)
183a man-pages patch, or at least a notification of the change, 183a man-pages patch, or at least a notification of the change,
184so that some information makes its way into the manual pages. 184so that some information makes its way into the manual pages.
185 185
186Even if the maintainer did not respond in step #4, make sure to ALWAYS 186Even if the maintainer did not respond in step #5, make sure to ALWAYS
187copy the maintainer when you change their code. 187copy the maintainer when you change their code.
188 188
189For small patches you may want to CC the Trivial Patch Monkey 189For small patches you may want to CC the Trivial Patch Monkey
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt
index 05d81cbcb2e1..5920fe26e6ff 100644
--- a/Documentation/filesystems/nfs41-server.txt
+++ b/Documentation/filesystems/nfs41-server.txt
@@ -11,6 +11,11 @@ the /proc/fs/nfsd/versions control file. Note that to write this
11control file, the nfsd service must be taken down. Use your user-mode 11control file, the nfsd service must be taken down. Use your user-mode
12nfs-utils to set this up; see rpc.nfsd(8) 12nfs-utils to set this up; see rpc.nfsd(8)
13 13
14(Warning: older servers will interpret "+4.1" and "-4.1" as "+4" and
15"-4", respectively. Therefore, code meant to work on both new and old
16kernels must turn 4.1 on or off *before* turning support for version 4
17on or off; rpc.nfsd does this correctly.)
18
14The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based 19The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
15on the latest NFSv4.1 Internet Draft: 20on the latest NFSv4.1 Internet Draft:
16http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 21http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
@@ -25,6 +30,49 @@ are still under development out of tree.
25See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design 30See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design
26for more information. 31for more information.
27 32
33The current implementation is intended for developers only: while it
34does support ordinary file operations on clients we have tested against
35(including the linux client), it is incomplete in ways which may limit
36features unexpectedly, cause known bugs in rare cases, or cause
37interoperability problems with future clients. Known issues:
38
39 - gss support is questionable: currently mounts with kerberos
40 from a linux client are possible, but we aren't really
41 conformant with the spec (for example, we don't use kerberos
42 on the backchannel correctly).
43 - no trunking support: no clients currently take advantage of
44 trunking, but this is a mandatory feature, and its use is
45 recommended to clients in a number of places. (E.g. to ensure
46 timely renewal in case an existing connection's retry timeouts
47 have gotten too long; see section 8.3 of the draft.)
48 Therefore, lack of this feature may cause future clients to
49 fail.
50 - Incomplete backchannel support: incomplete backchannel gss
51 support and no support for BACKCHANNEL_CTL mean that
52 callbacks (hence delegations and layouts) may not be
53 available and clients confused by the incomplete
54 implementation may fail.
55 - Server reboot recovery is unsupported; if the server reboots,
56 clients may fail.
57 - We do not support SSV, which provides security for shared
58 client-server state (thus preventing unauthorized tampering
59 with locks and opens, for example). It is mandatory for
60 servers to support this, though no clients use it yet.
61 - Mandatory operations which we do not support, such as
62 DESTROY_CLIENTID, FREE_STATEID, SECINFO_NO_NAME, and
63 TEST_STATEID, are not currently used by clients, but will be
64 (and the spec recommends their uses in common cases), and
65 clients should not be expected to know how to recover from the
66 case where they are not supported. This will eventually cause
67 interoperability failures.
68
69In addition, some limitations are inherited from the current NFSv4
70implementation:
71
72 - Incomplete delegation enforcement: if a file is renamed or
73 unlinked, a client holding a delegation may continue to
74 indefinitely allow opens of the file under the old name.
75
28The table below, taken from the NFSv4.1 document, lists 76The table below, taken from the NFSv4.1 document, lists
29the operations that are mandatory to implement (REQ), optional 77the operations that are mandatory to implement (REQ), optional
30(OPT), and NFSv4.0 operations that are required not to implement (MNI) 78(OPT), and NFSv4.0 operations that are required not to implement (MNI)
@@ -142,6 +190,12 @@ NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 |
142 190
143Implementation notes: 191Implementation notes:
144 192
193DELEGPURGE:
194* mandatory only for servers that support CLAIM_DELEGATE_PREV and/or
195 CLAIM_DELEG_PREV_FH (which allows clients to keep delegations that
196 persist across client reboots). Thus we need not implement this for
197 now.
198
145EXCHANGE_ID: 199EXCHANGE_ID:
146* only SP4_NONE state protection supported 200* only SP4_NONE state protection supported
147* implementation ids are ignored 201* implementation ids are ignored
diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfsroot.txt
index 68baddf3c3e0..3ba0b945aaf8 100644
--- a/Documentation/filesystems/nfsroot.txt
+++ b/Documentation/filesystems/nfsroot.txt
@@ -105,7 +105,7 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
105 the client address and this parameter is NOT empty only 105 the client address and this parameter is NOT empty only
106 replies from the specified server are accepted. 106 replies from the specified server are accepted.
107 107
108 Only required for for NFS root. That is autoconfiguration 108 Only required for NFS root. That is autoconfiguration
109 will not be triggered if it is missing and NFS root is not 109 will not be triggered if it is missing and NFS root is not
110 in operation. 110 in operation.
111 111
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index ffead13f9443..75988ba26a51 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -375,6 +375,19 @@ of memory currently marked as referenced or accessed.
375This file is only present if the CONFIG_MMU kernel configuration option is 375This file is only present if the CONFIG_MMU kernel configuration option is
376enabled. 376enabled.
377 377
378The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
379bits on both physical and virtual pages associated with a process.
380To clear the bits for all the pages associated with the process
381 > echo 1 > /proc/PID/clear_refs
382
383To clear the bits for the anonymous pages associated with the process
384 > echo 2 > /proc/PID/clear_refs
385
386To clear the bits for the file mapped pages associated with the process
387 > echo 3 > /proc/PID/clear_refs
388Any other value written to /proc/PID/clear_refs will have no effect.
389
390
3781.2 Kernel data 3911.2 Kernel data
379--------------- 392---------------
380 393
@@ -1032,9 +1045,9 @@ Various pieces of information about kernel activity are available in the
1032since the system first booted. For a quick look, simply cat the file: 1045since the system first booted. For a quick look, simply cat the file:
1033 1046
1034 > cat /proc/stat 1047 > cat /proc/stat
1035 cpu 2255 34 2290 22625563 6290 127 456 0 1048 cpu 2255 34 2290 22625563 6290 127 456 0 0
1036 cpu0 1132 34 1441 11311718 3675 127 438 0 1049 cpu0 1132 34 1441 11311718 3675 127 438 0 0
1037 cpu1 1123 0 849 11313845 2614 0 18 0 1050 cpu1 1123 0 849 11313845 2614 0 18 0 0
1038 intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...] 1051 intr 114930548 113199788 3 0 5 263 0 4 [... lots more numbers ...]
1039 ctxt 1990473 1052 ctxt 1990473
1040 btime 1062191376 1053 btime 1062191376
@@ -1056,6 +1069,7 @@ second). The meanings of the columns are as follows, from left to right:
1056- irq: servicing interrupts 1069- irq: servicing interrupts
1057- softirq: servicing softirqs 1070- softirq: servicing softirqs
1058- steal: involuntary wait 1071- steal: involuntary wait
1072- guest: running a guest
1059 1073
1060The "intr" line gives counts of interrupts serviced since boot time, for each 1074The "intr" line gives counts of interrupts serviced since boot time, for each
1061of the possible system interrupts. The first column is the total of all 1075of the possible system interrupts. The first column is the total of all
@@ -1191,7 +1205,7 @@ The following heuristics are then applied:
1191 * if the task was reniced, its score doubles 1205 * if the task was reniced, its score doubles
1192 * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE 1206 * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE
1193 or CAP_SYS_RAWIO) have their score divided by 4 1207 or CAP_SYS_RAWIO) have their score divided by 4
1194 * if oom condition happened in one cpuset and checked task does not belong 1208 * if oom condition happened in one cpuset and checked process does not belong
1195 to it, its score is divided by 8 1209 to it, its score is divided by 8
1196 * the resulting score is multiplied by two to the power of oom_adj, i.e. 1210 * the resulting score is multiplied by two to the power of oom_adj, i.e.
1197 points <<= oom_adj when it is positive and 1211 points <<= oom_adj when it is positive and
diff --git a/Documentation/gcov.txt b/Documentation/gcov.txt
index 40ec63352760..e7ca6478cd93 100644
--- a/Documentation/gcov.txt
+++ b/Documentation/gcov.txt
@@ -47,7 +47,7 @@ Possible uses:
47 47
48Configure the kernel with: 48Configure the kernel with:
49 49
50 CONFIG_DEBUGFS=y 50 CONFIG_DEBUG_FS=y
51 CONFIG_GCOV_KERNEL=y 51 CONFIG_GCOV_KERNEL=y
52 52
53and to get coverage data for the entire kernel: 53and to get coverage data for the entire kernel:
diff --git a/Documentation/hwmon/hpfall.c b/Documentation/hwmon/hpfall.c
index bbea1ccfd46a..681ec22b9d0e 100644
--- a/Documentation/hwmon/hpfall.c
+++ b/Documentation/hwmon/hpfall.c
@@ -16,6 +16,34 @@
16#include <stdint.h> 16#include <stdint.h>
17#include <errno.h> 17#include <errno.h>
18#include <signal.h> 18#include <signal.h>
19#include <sys/mman.h>
20#include <sched.h>
21
22char unload_heads_path[64];
23
24int set_unload_heads_path(char *device)
25{
26 char devname[64];
27
28 if (strlen(device) <= 5 || strncmp(device, "/dev/", 5) != 0)
29 return -EINVAL;
30 strncpy(devname, device + 5, sizeof(devname));
31
32 snprintf(unload_heads_path, sizeof(unload_heads_path),
33 "/sys/block/%s/device/unload_heads", devname);
34 return 0;
35}
36int valid_disk(void)
37{
38 int fd = open(unload_heads_path, O_RDONLY);
39 if (fd < 0) {
40 perror(unload_heads_path);
41 return 0;
42 }
43
44 close(fd);
45 return 1;
46}
19 47
20void write_int(char *path, int i) 48void write_int(char *path, int i)
21{ 49{
@@ -40,7 +68,7 @@ void set_led(int on)
40 68
41void protect(int seconds) 69void protect(int seconds)
42{ 70{
43 write_int("/sys/block/sda/device/unload_heads", seconds*1000); 71 write_int(unload_heads_path, seconds*1000);
44} 72}
45 73
46int on_ac(void) 74int on_ac(void)
@@ -57,45 +85,62 @@ void ignore_me(void)
57{ 85{
58 protect(0); 86 protect(0);
59 set_led(0); 87 set_led(0);
60
61} 88}
62 89
63int main(int argc, char* argv[]) 90int main(int argc, char **argv)
64{ 91{
65 int fd, ret; 92 int fd, ret;
93 struct sched_param param;
94
95 if (argc == 1)
96 ret = set_unload_heads_path("/dev/sda");
97 else if (argc == 2)
98 ret = set_unload_heads_path(argv[1]);
99 else
100 ret = -EINVAL;
101
102 if (ret || !valid_disk()) {
103 fprintf(stderr, "usage: %s <device> (default: /dev/sda)\n",
104 argv[0]);
105 exit(1);
106 }
107
108 fd = open("/dev/freefall", O_RDONLY);
109 if (fd < 0) {
110 perror("/dev/freefall");
111 return EXIT_FAILURE;
112 }
66 113
67 fd = open("/dev/freefall", O_RDONLY); 114 daemon(0, 0);
68 if (fd < 0) { 115 param.sched_priority = sched_get_priority_max(SCHED_FIFO);
69 perror("open"); 116 sched_setscheduler(0, SCHED_FIFO, &param);
70 return EXIT_FAILURE; 117 mlockall(MCL_CURRENT|MCL_FUTURE);
71 }
72 118
73 signal(SIGALRM, ignore_me); 119 signal(SIGALRM, ignore_me);
74 120
75 for (;;) { 121 for (;;) {
76 unsigned char count; 122 unsigned char count;
77 123
78 ret = read(fd, &count, sizeof(count)); 124 ret = read(fd, &count, sizeof(count));
79 alarm(0); 125 alarm(0);
80 if ((ret == -1) && (errno == EINTR)) { 126 if ((ret == -1) && (errno == EINTR)) {
81 /* Alarm expired, time to unpark the heads */ 127 /* Alarm expired, time to unpark the heads */
82 continue; 128 continue;
83 } 129 }
84 130
85 if (ret != sizeof(count)) { 131 if (ret != sizeof(count)) {
86 perror("read"); 132 perror("read");
87 break; 133 break;
88 } 134 }
89 135
90 protect(21); 136 protect(21);
91 set_led(1); 137 set_led(1);
92 if (1 || on_ac() || lid_open()) { 138 if (1 || on_ac() || lid_open())
93 alarm(2); 139 alarm(2);
94 } else { 140 else
95 alarm(20); 141 alarm(20);
96 } 142 }
97 } 143
98 144 close(fd);
99 close(fd); 145 return EXIT_SUCCESS;
100 return EXIT_SUCCESS;
101} 146}
diff --git a/Documentation/hwmon/pc87427 b/Documentation/hwmon/pc87427
index d1ebbe510f35..db5cc1227a83 100644
--- a/Documentation/hwmon/pc87427
+++ b/Documentation/hwmon/pc87427
@@ -34,5 +34,5 @@ Fan rotation speeds are reported as 14-bit values from a gated clock
34signal. Speeds down to 83 RPM can be measured. 34signal. Speeds down to 83 RPM can be measured.
35 35
36An alarm is triggered if the rotation speed drops below a programmable 36An alarm is triggered if the rotation speed drops below a programmable
37limit. Another alarm is triggered if the speed is too low to to be measured 37limit. Another alarm is triggered if the speed is too low to be measured
38(including stalled or missing fan). 38(including stalled or missing fan).
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 0f17d16dc101..c363840cdcea 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -933,7 +933,7 @@ and is between 256 and 4096 characters. It is defined in the file
933 1 -- enable informational integrity auditing messages. 933 1 -- enable informational integrity auditing messages.
934 934
935 ima_hash= [IMA] 935 ima_hash= [IMA]
936 Formt: { "sha1" | "md5" } 936 Format: { "sha1" | "md5" }
937 default: "sha1" 937 default: "sha1"
938 938
939 ima_tcb [IMA] 939 ima_tcb [IMA]
diff --git a/Documentation/kmemcheck.txt b/Documentation/kmemcheck.txt
index 363044609dad..c28f82895d6b 100644
--- a/Documentation/kmemcheck.txt
+++ b/Documentation/kmemcheck.txt
@@ -43,26 +43,7 @@ feature.
431. Downloading 431. Downloading
44============== 44==============
45 45
46kmemcheck can only be downloaded using git. If you want to write patches 46As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel.
47against the current code, you should use the kmemcheck development branch of
48the tip tree. It is also possible to use the linux-next tree, which also
49includes the latest version of kmemcheck.
50
51Assuming that you've already cloned the linux-2.6.git repository, all you
52have to do is add the -tip tree as a remote, like this:
53
54 $ git remote add tip git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git
55
56To actually download the tree, fetch the remote:
57
58 $ git fetch tip
59
60And to check out a new local branch with the kmemcheck code:
61
62 $ git checkout -b kmemcheck tip/kmemcheck
63
64General instructions for the -tip tree can be found here:
65http://people.redhat.com/mingo/tip.git/readme.txt
66 47
67 48
682. Configuring and compiling 492. Configuring and compiling
diff --git a/Documentation/memory.txt b/Documentation/memory.txt
index 2b3dedd39538..802efe58647c 100644
--- a/Documentation/memory.txt
+++ b/Documentation/memory.txt
@@ -1,18 +1,7 @@
1There are several classic problems related to memory on Linux 1There are several classic problems related to memory on Linux
2systems. 2systems.
3 3
4 1) There are some buggy motherboards which cannot properly 4 1) There are some motherboards that will not cache above
5 deal with the memory above 16MB. Consider exchanging
6 your motherboard.
7
8 2) You cannot do DMA on the ISA bus to addresses above
9 16M. Most device drivers under Linux allow the use
10 of bounce buffers which work around this problem. Drivers
11 that don't use bounce buffers will be unstable with
12 more than 16M installed. Drivers that use bounce buffers
13 will be OK, but may have slightly higher overhead.
14
15 3) There are some motherboards that will not cache above
16 a certain quantity of memory. If you have one of these 5 a certain quantity of memory. If you have one of these
17 motherboards, your system will be SLOWER, not faster 6 motherboards, your system will be SLOWER, not faster
18 as you add more memory. Consider exchanging your 7 as you add more memory. Consider exchanging your
@@ -24,7 +13,7 @@ It can also tell Linux to use less memory than is actually installed.
24If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid 13If you use "mem=" on a machine with PCI, consider using "memmap=" to avoid
25physical address space collisions. 14physical address space collisions.
26 15
27See the documentation of your boot loader (LILO, loadlin, etc.) about 16See the documentation of your boot loader (LILO, grub, loadlin, etc.) about
28how to pass options to the kernel. 17how to pass options to the kernel.
29 18
30There are other memory problems which Linux cannot deal with. Random 19There are other memory problems which Linux cannot deal with. Random
@@ -42,19 +31,3 @@ Try:
42 with the vendor. Consider testing it with memtest86 yourself. 31 with the vendor. Consider testing it with memtest86 yourself.
43 32
44 * Exchanging your CPU, cache, or motherboard for one that works. 33 * Exchanging your CPU, cache, or motherboard for one that works.
45
46 * Disabling the cache from the BIOS.
47
48 * Try passing the "mem=4M" option to the kernel to limit
49 Linux to using a very small amount of memory. Use "memmap="-option
50 together with "mem=" on systems with PCI to avoid physical address
51 space collisions.
52
53
54Other tricks:
55
56 * Try passing the "no-387" option to the kernel to ignore
57 a buggy FPU.
58
59 * Try passing the "no-hlt" option to disable the potentially
60 buggy HLT instruction in your CPU.
diff --git a/Documentation/networking/regulatory.txt b/Documentation/networking/regulatory.txt
index eaa1a25946c1..ee31369e9e5b 100644
--- a/Documentation/networking/regulatory.txt
+++ b/Documentation/networking/regulatory.txt
@@ -96,7 +96,7 @@ Example code - drivers hinting an alpha2:
96 96
97This example comes from the zd1211rw device driver. You can start 97This example comes from the zd1211rw device driver. You can start
98by having a mapping of your device's EEPROM country/regulatory 98by having a mapping of your device's EEPROM country/regulatory
99domain value to to a specific alpha2 as follows: 99domain value to a specific alpha2 as follows:
100 100
101static struct zd_reg_alpha2_map reg_alpha2_map[] = { 101static struct zd_reg_alpha2_map reg_alpha2_map[] = {
102 { ZD_REGDOMAIN_FCC, "US" }, 102 { ZD_REGDOMAIN_FCC, "US" },
diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt
index 80133ace1eb2..9fcc9a608dc0 100644
--- a/Documentation/numastat.txt
+++ b/Documentation/numastat.txt
@@ -7,10 +7,10 @@ All units are pages. Hugepages have separate counters.
7 7
8numa_hit A process wanted to allocate memory from this node, 8numa_hit A process wanted to allocate memory from this node,
9 and succeeded. 9 and succeeded.
10numa_miss A process wanted to allocate memory from this node, 10numa_miss A process wanted to allocate memory from another node,
11 but ended up with memory from another. 11 but ended up with memory from this node.
12numa_foreign A process wanted to allocate on another node, 12numa_foreign A process wanted to allocate on this node,
13 but ended up with memory from this one. 13 but ended up with memory from another one.
14local_node A process ran on this node and got memory from it. 14local_node A process ran on this node and got memory from it.
15other_node A process ran on this node and got memory from another node. 15other_node A process ran on this node and got memory from another node.
16interleave_hit Interleaving wanted to allocate from this node 16interleave_hit Interleaving wanted to allocate from this node
diff --git a/Documentation/powerpc/dts-bindings/marvell.txt b/Documentation/powerpc/dts-bindings/marvell.txt
index 3708a2fd4747..f1533d91953a 100644
--- a/Documentation/powerpc/dts-bindings/marvell.txt
+++ b/Documentation/powerpc/dts-bindings/marvell.txt
@@ -32,7 +32,7 @@ prefixed with the string "marvell,", for Marvell Technology Group Ltd.
32 devices. This field represents the number of cells needed to 32 devices. This field represents the number of cells needed to
33 represent the address of the memory-mapped registers of devices 33 represent the address of the memory-mapped registers of devices
34 within the system controller chip. 34 within the system controller chip.
35 - #size-cells : Size representation for for the memory-mapped 35 - #size-cells : Size representation for the memory-mapped
36 registers within the system controller chip. 36 registers within the system controller chip.
37 - #interrupt-cells : Defines the width of cells used to represent 37 - #interrupt-cells : Defines the width of cells used to represent
38 interrupts. 38 interrupts.
diff --git a/Documentation/scsi/ChangeLog.megaraid b/Documentation/scsi/ChangeLog.megaraid
index eaa4801f2ce6..38e9e7cadc90 100644
--- a/Documentation/scsi/ChangeLog.megaraid
+++ b/Documentation/scsi/ChangeLog.megaraid
@@ -514,7 +514,7 @@ iv. Remove yield() while mailbox handshake in synchronous commands
514 514
515v. Remove redundant __megaraid_busywait_mbox routine 515v. Remove redundant __megaraid_busywait_mbox routine
516 516
517vi. Fix bug in the managment module, which causes a system lockup when the 517vi. Fix bug in the management module, which causes a system lockup when the
518 IO module is loaded and then unloaded, followed by executing any 518 IO module is loaded and then unloaded, followed by executing any
519 management utility. The current version of management module does not 519 management utility. The current version of management module does not
520 handle the adapter unregister properly. 520 handle the adapter unregister properly.
diff --git a/Documentation/scsi/scsi_fc_transport.txt b/Documentation/scsi/scsi_fc_transport.txt
index d7f181701dc2..aec6549ab097 100644
--- a/Documentation/scsi/scsi_fc_transport.txt
+++ b/Documentation/scsi/scsi_fc_transport.txt
@@ -378,7 +378,7 @@ Vport Disable/Enable:
378 int vport_disable(struct fc_vport *vport, bool disable) 378 int vport_disable(struct fc_vport *vport, bool disable)
379 379
380 where: 380 where:
381 vport: Is vport to to be enabled or disabled 381 vport: Is vport to be enabled or disabled
382 disable: If "true", the vport is to be disabled. 382 disable: If "true", the vport is to be disabled.
383 If "false", the vport is to be enabled. 383 If "false", the vport is to be enabled.
384 384
diff --git a/Documentation/sound/alsa/HD-Audio-Models.txt b/Documentation/sound/alsa/HD-Audio-Models.txt
index 97eebd63bedc..f1708b79f963 100644
--- a/Documentation/sound/alsa/HD-Audio-Models.txt
+++ b/Documentation/sound/alsa/HD-Audio-Models.txt
@@ -387,7 +387,7 @@ STAC92HD73*
387STAC92HD83* 387STAC92HD83*
388=========== 388===========
389 ref Reference board 389 ref Reference board
390 mic-ref Reference board with power managment for ports 390 mic-ref Reference board with power management for ports
391 dell-s14 Dell laptop 391 dell-s14 Dell laptop
392 auto BIOS setup (default) 392 auto BIOS setup (default)
393 393
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 2dbff53369d0..3e5b63ebb821 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -319,25 +319,29 @@ This option can be used to select the type of process address
319space randomization that is used in the system, for architectures 319space randomization that is used in the system, for architectures
320that support this feature. 320that support this feature.
321 321
3220 - Turn the process address space randomization off by default. 3220 - Turn the process address space randomization off. This is the
323 default for architectures that do not support this feature anyways,
324 and kernels that are booted with the "norandmaps" parameter.
323 325
3241 - Make the addresses of mmap base, stack and VDSO page randomized. 3261 - Make the addresses of mmap base, stack and VDSO page randomized.
325 This, among other things, implies that shared libraries will be 327 This, among other things, implies that shared libraries will be
326 loaded to random addresses. Also for PIE-linked binaries, the location 328 loaded to random addresses. Also for PIE-linked binaries, the
327 of code start is randomized. 329 location of code start is randomized. This is the default if the
330 CONFIG_COMPAT_BRK option is enabled.
328 331
329 With heap randomization, the situation is a little bit more 3322 - Additionally enable heap randomization. This is the default if
330 complicated. 333 CONFIG_COMPAT_BRK is disabled.
331 There a few legacy applications out there (such as some ancient 334
335 There are a few legacy applications out there (such as some ancient
332 versions of libc.so.5 from 1996) that assume that brk area starts 336 versions of libc.so.5 from 1996) that assume that brk area starts
333 just after the end of the code+bss. These applications break when 337 just after the end of the code+bss. These applications break when
334 start of the brk area is randomized. There are however no known 338 start of the brk area is randomized. There are however no known
335 non-legacy applications that would be broken this way, so for most 339 non-legacy applications that would be broken this way, so for most
336 systems it is safe to choose full randomization. However there is 340 systems it is safe to choose full randomization.
337 a CONFIG_COMPAT_BRK option for systems with ancient and/or broken 341
338 binaries, that makes heap non-randomized, but keeps all other 342 Systems with ancient and/or broken binaries should be configured
339 parts of process address space randomized if randomize_va_space 343 with CONFIG_COMPAT_BRK enabled, which excludes the heap from process
340 sysctl is turned on. 344 address space randomization.
341 345
342============================================================== 346==============================================================
343 347
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index c4de6359d440..e6fb1ec2744b 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -585,7 +585,9 @@ caching of directory and inode objects.
585At the default value of vfs_cache_pressure=100 the kernel will attempt to 585At the default value of vfs_cache_pressure=100 the kernel will attempt to
586reclaim dentries and inodes at a "fair" rate with respect to pagecache and 586reclaim dentries and inodes at a "fair" rate with respect to pagecache and
587swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer 587swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer
588to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 588to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
589never reclaim dentries and inodes due to memory pressure and this can easily
590lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
589causes the kernel to prefer to reclaim dentries and inodes. 591causes the kernel to prefer to reclaim dentries and inodes.
590 592
591============================================================== 593==============================================================
diff --git a/Documentation/trace/events-kmem.txt b/Documentation/trace/events-kmem.txt
new file mode 100644
index 000000000000..6ef2a8652e17
--- /dev/null
+++ b/Documentation/trace/events-kmem.txt
@@ -0,0 +1,107 @@
1 Subsystem Trace Points: kmem
2
3The tracing system kmem captures events related to object and page allocation
4within the kernel. Broadly speaking there are five major subheadings.
5
6 o Slab allocation of small objects of unknown type (kmalloc)
7 o Slab allocation of small objects of known type
8 o Page allocation
9 o Per-CPU Allocator Activity
10 o External Fragmentation
11
12This document will describe what each of the tracepoints are and why they
13might be useful.
14
151. Slab allocation of small objects of unknown type
16===================================================
17kmalloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s
18kmalloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d
19kfree call_site=%lx ptr=%p
20
21Heavy activity for these events may indicate that a specific cache is
22justified, particularly if kmalloc slab pages are getting significantly
23internally fragmented as a result of the allocation pattern. By correlating
24kmalloc with kfree, it may be possible to identify memory leaks and where
25the allocation sites were.
26
27
282. Slab allocation of small objects of known type
29=================================================
30kmem_cache_alloc call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s
31kmem_cache_alloc_node call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d
32kmem_cache_free call_site=%lx ptr=%p
33
34These events are similar in usage to the kmalloc-related events except that
35it is likely easier to pin the event down to a specific cache. At the time
36of writing, no information is available on what slab is being allocated from,
37but the call_site can usually be used to extrapolate that information.
38
393. Page allocation
40==================
41mm_page_alloc page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s
42mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d
43mm_page_free_direct page=%p pfn=%lu order=%d
44mm_pagevec_free page=%p pfn=%lu order=%d cold=%d
45
46These four events deal with page allocation and freeing. mm_page_alloc is
47a simple indicator of page allocator activity. Pages may be allocated from
48the per-CPU allocator (high performance) or the buddy allocator.
49
50If pages are allocated directly from the buddy allocator, the
51mm_page_alloc_zone_locked event is triggered. This event is important as high
52amounts of activity imply high activity on the zone->lock. Taking this lock
53impairs performance by disabling interrupts, dirtying cache lines between
54CPUs and serialising many CPUs.
55
56When a page is freed directly by the caller, the mm_page_free_direct event
57is triggered. Significant amounts of activity here could indicate that the
58callers should be batching their activities.
59
60When pages are freed using a pagevec, the mm_pagevec_free event is
61triggered. Broadly speaking, pages are taken off the LRU lock in bulk and
62freed in batch with a pagevec. Significant amounts of activity here could
63indicate that the system is under memory pressure and can also indicate
64contention on the zone->lru_lock.
65
664. Per-CPU Allocator Activity
67=============================
68mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d
69mm_page_pcpu_drain page=%p pfn=%lu order=%d cpu=%d migratetype=%d
70
71In front of the page allocator is a per-cpu page allocator. It exists only
72for order-0 pages, reduces contention on the zone->lock and reduces the
73amount of writing on struct page.
74
75When a per-CPU list is empty or pages of the wrong type are allocated,
76the zone->lock will be taken once and the per-CPU list refilled. The event
77triggered is mm_page_alloc_zone_locked for each page allocated with the
78event indicating whether it is for a percpu_refill or not.
79
80When the per-CPU list is too full, a number of pages are freed, each one
81of which triggers a mm_page_pcpu_drain event.
82
83The individual nature of the events is so that pages can be tracked
84between allocation and freeing. A number of drain or refill pages that occur
85consecutively imply the zone->lock being taken once. Large amounts of PCP
86refills and drains could imply an imbalance between CPUs where too much work
87is being concentrated in one place. It could also indicate that the per-CPU
88lists should be a larger size. Finally, large amounts of refills on one CPU
89and drains on another could be a factor in causing large amounts of cache
90line bounces due to writes between CPUs and worth investigating if pages
91can be allocated and freed on the same CPU through some algorithm change.
92
935. External Fragmentation
94=========================
95mm_page_alloc_extfrag page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d
96
97External fragmentation affects whether a high-order allocation will be
98successful or not. For some types of hardware, this is important although
99it is avoided where possible. If the system is using huge pages and needs
100to be able to resize the pool over the lifetime of the system, this value
101is important.
102
103Large numbers of this event imply that memory is fragmenting and
104high-order allocations will start failing at some time in the future. One
105means of reducing the occurrence of this event is to increase the size of
106min_free_kbytes in increments of 3*pageblock_size*nr_online_nodes where
107pageblock_size is usually the size of the default hugepage size.
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 78c45a87be57..02ac6ed38b2d 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -72,7 +72,7 @@ To enable all events in sched subsystem:
72 72
73 # echo 1 > /sys/kernel/debug/tracing/events/sched/enable 73 # echo 1 > /sys/kernel/debug/tracing/events/sched/enable
74 74
75To eanble all events: 75To enable all events:
76 76
77 # echo 1 > /sys/kernel/debug/tracing/events/enable 77 # echo 1 > /sys/kernel/debug/tracing/events/enable
78 78
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 1b6292bbdd6d..957b22fde2df 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -133,7 +133,7 @@ of ftrace. Here is a list of some of the key files:
133 than requested, the rest of the page will be used, 133 than requested, the rest of the page will be used,
134 making the actual allocation bigger than requested. 134 making the actual allocation bigger than requested.
135 ( Note, the size may not be a multiple of the page size 135 ( Note, the size may not be a multiple of the page size
136 due to buffer managment overhead. ) 136 due to buffer management overhead. )
137 137
138 This can only be updated when the current_tracer 138 This can only be updated when the current_tracer
139 is set to "nop". 139 is set to "nop".
diff --git a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl
new file mode 100644
index 000000000000..7df50e8cf4d9
--- /dev/null
+++ b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl
@@ -0,0 +1,418 @@
1#!/usr/bin/perl
2# This is a POC (proof of concept or piece of crap, take your pick) for reading the
3# text representation of trace output related to page allocation. It makes an attempt
4# to extract some high-level information on what is going on. The accuracy of the parser
5# may vary considerably
6#
7# Example usage: trace-pagealloc-postprocess.pl < /sys/kernel/debug/tracing/trace_pipe
8# other options
9# --prepend-parent Report on the parent proc and PID
10# --read-procstat If the trace lacks process info, get it from /proc
11# --ignore-pid Aggregate processes of the same name together
12#
13# Copyright (c) IBM Corporation 2009
14# Author: Mel Gorman <mel@csn.ul.ie>
15use strict;
16use Getopt::Long;
17
18# Tracepoint events
19use constant MM_PAGE_ALLOC => 1;
20use constant MM_PAGE_FREE_DIRECT => 2;
21use constant MM_PAGEVEC_FREE => 3;
22use constant MM_PAGE_PCPU_DRAIN => 4;
23use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5;
24use constant MM_PAGE_ALLOC_EXTFRAG => 6;
25use constant EVENT_UNKNOWN => 7;
26
27# Constants used to track state
28use constant STATE_PCPU_PAGES_DRAINED => 8;
29use constant STATE_PCPU_PAGES_REFILLED => 9;
30
31# High-level events extrapolated from tracepoints
32use constant HIGH_PCPU_DRAINS => 10;
33use constant HIGH_PCPU_REFILLS => 11;
34use constant HIGH_EXT_FRAGMENT => 12;
35use constant HIGH_EXT_FRAGMENT_SEVERE => 13;
36use constant HIGH_EXT_FRAGMENT_MODERATE => 14;
37use constant HIGH_EXT_FRAGMENT_CHANGED => 15;
38
39my %perprocesspid;
40my %perprocess;
41my $opt_ignorepid;
42my $opt_read_procstat;
43my $opt_prepend_parent;
44
45# Catch sigint and exit on request
46my $sigint_report = 0;
47my $sigint_exit = 0;
48my $sigint_pending = 0;
49my $sigint_received = 0;
50sub sigint_handler {
51 my $current_time = time;
52 if ($current_time - 2 > $sigint_received) {
53 print "SIGINT received, report pending. Hit ctrl-c again to exit\n";
54 $sigint_report = 1;
55 } else {
56 if (!$sigint_exit) {
57 print "Second SIGINT received quickly, exiting\n";
58 }
59 $sigint_exit++;
60 }
61
62 if ($sigint_exit > 3) {
63 print "Many SIGINTs received, exiting now without report\n";
64 exit;
65 }
66
67 $sigint_received = $current_time;
68 $sigint_pending = 1;
69}
70$SIG{INT} = "sigint_handler";
71
72# Parse command line options
73GetOptions(
74 'ignore-pid' => \$opt_ignorepid,
75 'read-procstat' => \$opt_read_procstat,
76 'prepend-parent' => \$opt_prepend_parent,
77);
78
79# Defaults for dynamically discovered regex's
80my $regex_fragdetails_default = 'page=([0-9a-f]*) pfn=([0-9]*) alloc_order=([-0-9]*) fallback_order=([-0-9]*) pageblock_order=([-0-9]*) alloc_migratetype=([-0-9]*) fallback_migratetype=([-0-9]*) fragmenting=([-0-9]) change_ownership=([-0-9])';
81
82# Dynamically discovered regex
83my $regex_fragdetails;
84
85# Static regex used. Specified like this for readability and for use with /o
86# (process_pid) (cpus ) ( time ) (tpoint ) (details)
87my $regex_traceevent = '\s*([a-zA-Z0-9-]*)\s*(\[[0-9]*\])\s*([0-9.]*):\s*([a-zA-Z_]*):\s*(.*)';
88my $regex_statname = '[-0-9]*\s\((.*)\).*';
89my $regex_statppid = '[-0-9]*\s\(.*\)\s[A-Za-z]\s([0-9]*).*';
90
91sub generate_traceevent_regex {
92 my $event = shift;
93 my $default = shift;
94 my $regex;
95
96 # Read the event format or use the default
97 if (!open (FORMAT, "/sys/kernel/debug/tracing/events/$event/format")) {
98 $regex = $default;
99 } else {
100 my $line;
101 while (!eof(FORMAT)) {
102 $line = <FORMAT>;
103 if ($line =~ /^print fmt:\s"(.*)",.*/) {
104 $regex = $1;
105 $regex =~ s/%p/\([0-9a-f]*\)/g;
106 $regex =~ s/%d/\([-0-9]*\)/g;
107 $regex =~ s/%lu/\([0-9]*\)/g;
108 }
109 }
110 }
111
112 # Verify fields are in the right order
113 my $tuple;
114 foreach $tuple (split /\s/, $regex) {
115 my ($key, $value) = split(/=/, $tuple);
116 my $expected = shift;
117 if ($key ne $expected) {
118 print("WARNING: Format not as expected '$key' != '$expected'");
119 $regex =~ s/$key=\((.*)\)/$key=$1/;
120 }
121 }
122
123 if (defined shift) {
124 die("Fewer fields than expected in format");
125 }
126
127 return $regex;
128}
129$regex_fragdetails = generate_traceevent_regex("kmem/mm_page_alloc_extfrag",
130 $regex_fragdetails_default,
131 "page", "pfn",
132 "alloc_order", "fallback_order", "pageblock_order",
133 "alloc_migratetype", "fallback_migratetype",
134 "fragmenting", "change_ownership");
135
136sub read_statline($) {
137 my $pid = $_[0];
138 my $statline;
139
140 if (open(STAT, "/proc/$pid/stat")) {
141 $statline = <STAT>;
142 close(STAT);
143 }
144
145 if ($statline eq '') {
146 $statline = "-1 (UNKNOWN_PROCESS_NAME) R 0";
147 }
148
149 return $statline;
150}
151
152sub guess_process_pid($$) {
153 my $pid = $_[0];
154 my $statline = $_[1];
155
156 if ($pid == 0) {
157 return "swapper-0";
158 }
159
160 if ($statline !~ /$regex_statname/o) {
161 die("Failed to math stat line for process name :: $statline");
162 }
163 return "$1-$pid";
164}
165
166sub parent_info($$) {
167 my $pid = $_[0];
168 my $statline = $_[1];
169 my $ppid;
170
171 if ($pid == 0) {
172 return "NOPARENT-0";
173 }
174
175 if ($statline !~ /$regex_statppid/o) {
176 die("Failed to match stat line process ppid:: $statline");
177 }
178
179 # Read the ppid stat line
180 $ppid = $1;
181 return guess_process_pid($ppid, read_statline($ppid));
182}
183
184sub process_events {
185 my $traceevent;
186 my $process_pid;
187 my $cpus;
188 my $timestamp;
189 my $tracepoint;
190 my $details;
191 my $statline;
192
193 # Read each line of the event log
194EVENT_PROCESS:
195 while ($traceevent = <STDIN>) {
196 if ($traceevent =~ /$regex_traceevent/o) {
197 $process_pid = $1;
198 $tracepoint = $4;
199
200 if ($opt_read_procstat || $opt_prepend_parent) {
201 $process_pid =~ /(.*)-([0-9]*)$/;
202 my $process = $1;
203 my $pid = $2;
204
205 $statline = read_statline($pid);
206
207 if ($opt_read_procstat && $process eq '') {
208 $process_pid = guess_process_pid($pid, $statline);
209 }
210
211 if ($opt_prepend_parent) {
212 $process_pid = parent_info($pid, $statline) . " :: $process_pid";
213 }
214 }
215
216 # Unnecessary in this script. Uncomment if required
217 # $cpus = $2;
218 # $timestamp = $3;
219 } else {
220 next;
221 }
222
223 # Perl Switch() sucks majorly
224 if ($tracepoint eq "mm_page_alloc") {
225 $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++;
226 } elsif ($tracepoint eq "mm_page_free_direct") {
227 $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++;
228 } elsif ($tracepoint eq "mm_pagevec_free") {
229 $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++;
230 } elsif ($tracepoint eq "mm_page_pcpu_drain") {
231 $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++;
232 $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++;
233 } elsif ($tracepoint eq "mm_page_alloc_zone_locked") {
234 $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}++;
235 $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED}++;
236 } elsif ($tracepoint eq "mm_page_alloc_extfrag") {
237
238 # Extract the details of the event now
239 $details = $5;
240
241 my ($page, $pfn);
242 my ($alloc_order, $fallback_order, $pageblock_order);
243 my ($alloc_migratetype, $fallback_migratetype);
244 my ($fragmenting, $change_ownership);
245
246 if ($details !~ /$regex_fragdetails/o) {
247 print "WARNING: Failed to parse mm_page_alloc_extfrag as expected\n";
248 next;
249 }
250
251 $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG}++;
252 $page = $1;
253 $pfn = $2;
254 $alloc_order = $3;
255 $fallback_order = $4;
256 $pageblock_order = $5;
257 $alloc_migratetype = $6;
258 $fallback_migratetype = $7;
259 $fragmenting = $8;
260 $change_ownership = $9;
261
262 if ($fragmenting) {
263 $perprocesspid{$process_pid}->{HIGH_EXT_FRAG}++;
264 if ($fallback_order <= 3) {
265 $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE}++;
266 } else {
267 $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE}++;
268 }
269 }
270 if ($change_ownership) {
271 $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED}++;
272 }
273 } else {
274 $perprocesspid{$process_pid}->{EVENT_UNKNOWN}++;
275 }
276
277 # Catch a full pcpu drain event
278 if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} &&
279 $tracepoint ne "mm_page_pcpu_drain") {
280
281 $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}++;
282 $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0;
283 }
284
285 # Catch a full pcpu refill event
286 if ($perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} &&
287 $tracepoint ne "mm_page_alloc_zone_locked") {
288 $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}++;
289 $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0;
290 }
291
292 if ($sigint_pending) {
293 last EVENT_PROCESS;
294 }
295 }
296}
297
298sub dump_stats {
299 my $hashref = shift;
300 my %stats = %$hashref;
301
302 # Dump per-process stats
303 my $process_pid;
304 my $max_strlen = 0;
305
306 # Get the maximum process name
307 foreach $process_pid (keys %perprocesspid) {
308 my $len = length($process_pid);
309 if ($len > $max_strlen) {
310 $max_strlen = $len;
311 }
312 }
313 $max_strlen += 2;
314
315 printf("\n");
316 printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n",
317 "Process", "Pages", "Pages", "Pages", "Pages", "PCPU", "PCPU", "PCPU", "Fragment", "Fragment", "MigType", "Fragment", "Fragment", "Unknown");
318 printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n",
319 "details", "allocd", "allocd", "freed", "freed", "pages", "drains", "refills", "Fallback", "Causing", "Changed", "Severe", "Moderate", "");
320
321 printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s\n",
322 "", "", "under lock", "direct", "pagevec", "drain", "", "", "", "", "", "", "", "");
323
324 foreach $process_pid (keys %stats) {
325 # Dump final aggregates
326 if ($stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED}) {
327 $stats{$process_pid}->{HIGH_PCPU_DRAINS}++;
328 $stats{$process_pid}->{STATE_PCPU_PAGES_DRAINED} = 0;
329 }
330 if ($stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED}) {
331 $stats{$process_pid}->{HIGH_PCPU_REFILLS}++;
332 $stats{$process_pid}->{STATE_PCPU_PAGES_REFILLED} = 0;
333 }
334
335 printf("%-" . $max_strlen . "s %8d %10d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d\n",
336 $process_pid,
337 $stats{$process_pid}->{MM_PAGE_ALLOC},
338 $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED},
339 $stats{$process_pid}->{MM_PAGE_FREE_DIRECT},
340 $stats{$process_pid}->{MM_PAGEVEC_FREE},
341 $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN},
342 $stats{$process_pid}->{HIGH_PCPU_DRAINS},
343 $stats{$process_pid}->{HIGH_PCPU_REFILLS},
344 $stats{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG},
345 $stats{$process_pid}->{HIGH_EXT_FRAG},
346 $stats{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED},
347 $stats{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE},
348 $stats{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE},
349 $stats{$process_pid}->{EVENT_UNKNOWN});
350 }
351}
352
353sub aggregate_perprocesspid() {
354 my $process_pid;
355 my $process;
356 undef %perprocess;
357
358 foreach $process_pid (keys %perprocesspid) {
359 $process = $process_pid;
360 $process =~ s/-([0-9])*$//;
361 if ($process eq '') {
362 $process = "NO_PROCESS_NAME";
363 }
364
365 $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC};
366 $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED};
367 $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT};
368 $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE};
369 $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN};
370 $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS};
371 $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS};
372 $perprocess{$process}->{MM_PAGE_ALLOC_EXTFRAG} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_EXTFRAG};
373 $perprocess{$process}->{HIGH_EXT_FRAG} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAG};
374 $perprocess{$process}->{HIGH_EXT_FRAGMENT_CHANGED} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_CHANGED};
375 $perprocess{$process}->{HIGH_EXT_FRAGMENT_SEVERE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_SEVERE};
376 $perprocess{$process}->{HIGH_EXT_FRAGMENT_MODERATE} += $perprocesspid{$process_pid}->{HIGH_EXT_FRAGMENT_MODERATE};
377 $perprocess{$process}->{EVENT_UNKNOWN} += $perprocesspid{$process_pid}->{EVENT_UNKNOWN};
378 }
379}
380
381sub report() {
382 if (!$opt_ignorepid) {
383 dump_stats(\%perprocesspid);
384 } else {
385 aggregate_perprocesspid();
386 dump_stats(\%perprocess);
387 }
388}
389
390# Process events or signals until neither is available
391sub signal_loop() {
392 my $sigint_processed;
393 do {
394 $sigint_processed = 0;
395 process_events();
396
397 # Handle pending signals if any
398 if ($sigint_pending) {
399 my $current_time = time;
400
401 if ($sigint_exit) {
402 print "Received exit signal\n";
403 $sigint_pending = 0;
404 }
405 if ($sigint_report) {
406 if ($current_time >= $sigint_received + 2) {
407 report();
408 $sigint_report = 0;
409 $sigint_pending = 0;
410 $sigint_processed = 1;
411 }
412 }
413 }
414 } while ($sigint_pending || $sigint_processed);
415}
416
417signal_loop();
418report();
diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt
new file mode 100644
index 000000000000..5eb4e487e667
--- /dev/null
+++ b/Documentation/trace/tracepoint-analysis.txt
@@ -0,0 +1,327 @@
1 Notes on Analysing Behaviour Using Events and Tracepoints
2
3 Documentation written by Mel Gorman
4 PCL information heavily based on email from Ingo Molnar
5
61. Introduction
7===============
8
9Tracepoints (see Documentation/trace/tracepoints.txt) can be used without
10creating custom kernel modules to register probe functions using the event
11tracing infrastructure.
12
13Simplistically, tracepoints represent important events that can
14be taken in conjunction with other tracepoints to build a "Big Picture" of
15what is going on within the system. There are a large number of methods for
16gathering and interpreting these events. Lacking any current Best Practises,
17this document describes some of the methods that can be used.
18
19This document assumes that debugfs is mounted on /sys/kernel/debug and that
20the appropriate tracing options have been configured into the kernel. It is
21assumed that the PCL tool tools/perf has been installed and is in your path.
22
232. Listing Available Events
24===========================
25
262.1 Standard Utilities
27----------------------
28
29All possible events are visible from /sys/kernel/debug/tracing/events. Simply
30calling
31
32 $ find /sys/kernel/debug/tracing/events -type d
33
34will give a fair indication of the number of events available.
35
362.2 PCL
37-------
38
39Discovery and enumeration of all counters and events, including tracepoints
40are available with the perf tool. Getting a list of available events is a
41simple case of
42
43 $ perf list 2>&1 | grep Tracepoint
44 ext4:ext4_free_inode [Tracepoint event]
45 ext4:ext4_request_inode [Tracepoint event]
46 ext4:ext4_allocate_inode [Tracepoint event]
47 ext4:ext4_write_begin [Tracepoint event]
48 ext4:ext4_ordered_write_end [Tracepoint event]
49 [ .... remaining output snipped .... ]
50
51
522. Enabling Events
53==================
54
552.1 System-Wide Event Enabling
56------------------------------
57
58See Documentation/trace/events.txt for a proper description on how events
59can be enabled system-wide. A short example of enabling all events related
60to page allocation would look something like
61
62 $ for i in `find /sys/kernel/debug/tracing/events -name "enable" | grep mm_`; do echo 1 > $i; done
63
642.2 System-Wide Event Enabling with SystemTap
65---------------------------------------------
66
67In SystemTap, tracepoints are accessible using the kernel.trace() function
68call. The following is an example that reports every 5 seconds what processes
69were allocating the pages.
70
71 global page_allocs
72
73 probe kernel.trace("mm_page_alloc") {
74 page_allocs[execname()]++
75 }
76
77 function print_count() {
78 printf ("%-25s %-s\n", "#Pages Allocated", "Process Name")
79 foreach (proc in page_allocs-)
80 printf("%-25d %s\n", page_allocs[proc], proc)
81 printf ("\n")
82 delete page_allocs
83 }
84
85 probe timer.s(5) {
86 print_count()
87 }
88
892.3 System-Wide Event Enabling with PCL
90---------------------------------------
91
92By specifying the -a switch and analysing sleep, the system-wide events
93for a duration of time can be examined.
94
95 $ perf stat -a \
96 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
97 -e kmem:mm_pagevec_free \
98 sleep 10
99 Performance counter stats for 'sleep 10':
100
101 9630 kmem:mm_page_alloc
102 2143 kmem:mm_page_free_direct
103 7424 kmem:mm_pagevec_free
104
105 10.002577764 seconds time elapsed
106
107Similarly, one could execute a shell and exit it as desired to get a report
108at that point.
109
1102.4 Local Event Enabling
111------------------------
112
113Documentation/trace/ftrace.txt describes how to enable events on a per-thread
114basis using set_ftrace_pid.
115
1162.5 Local Event Enablement with PCL
117-----------------------------------
118
119Events can be activated and tracked for the duration of a process on a local
120basis using PCL such as follows.
121
122 $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
123 -e kmem:mm_pagevec_free ./hackbench 10
124 Time: 0.909
125
126 Performance counter stats for './hackbench 10':
127
128 17803 kmem:mm_page_alloc
129 12398 kmem:mm_page_free_direct
130 4827 kmem:mm_pagevec_free
131
132 0.973913387 seconds time elapsed
133
1343. Event Filtering
135==================
136
137Documentation/trace/ftrace.txt covers in-depth how to filter events in
138ftrace. Obviously using grep and awk of trace_pipe is an option as well
139as any script reading trace_pipe.
140
1414. Analysing Event Variances with PCL
142=====================================
143
144Any workload can exhibit variances between runs and it can be important
145to know what the standard deviation is. By and large, this is left to the
146performance analyst to do it by hand. In the event that the discrete event
147occurrences are useful to the performance analyst, then perf can be used.
148
149 $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
150 -e kmem:mm_pagevec_free ./hackbench 10
151 Time: 0.890
152 Time: 0.895
153 Time: 0.915
154 Time: 1.001
155 Time: 0.899
156
157 Performance counter stats for './hackbench 10' (5 runs):
158
159 16630 kmem:mm_page_alloc ( +- 3.542% )
160 11486 kmem:mm_page_free_direct ( +- 4.771% )
161 4730 kmem:mm_pagevec_free ( +- 2.325% )
162
163 0.982653002 seconds time elapsed ( +- 1.448% )
164
165In the event that some higher-level event is required that depends on some
166aggregation of discrete events, then a script would need to be developed.
167
168Using --repeat, it is also possible to view how events are fluctuating over
169time on a system wide basis using -a and sleep.
170
171 $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
172 -e kmem:mm_pagevec_free \
173 -a --repeat 10 \
174 sleep 1
175 Performance counter stats for 'sleep 1' (10 runs):
176
177 1066 kmem:mm_page_alloc ( +- 26.148% )
178 182 kmem:mm_page_free_direct ( +- 5.464% )
179 890 kmem:mm_pagevec_free ( +- 30.079% )
180
181 1.002251757 seconds time elapsed ( +- 0.005% )
182
1835. Higher-Level Analysis with Helper Scripts
184============================================
185
186When events are enabled the events that are triggering can be read from
187/sys/kernel/debug/tracing/trace_pipe in human-readable format although binary
188options exist as well. By post-processing the output, further information can
189be gathered on-line as appropriate. Examples of post-processing might include
190
191 o Reading information from /proc for the PID that triggered the event
192 o Deriving a higher-level event from a series of lower-level events.
193 o Calculate latencies between two events
194
195Documentation/trace/postprocess/trace-pagealloc-postprocess.pl is an example
196script that can read trace_pipe from STDIN or a copy of a trace. When used
197on-line, it can be interrupted once to generate a report without exiting
198and twice to exit.
199
200Simplistically, the script just reads STDIN and counts up events but it
201also can do more such as
202
203 o Derive high-level events from many low-level events. If a number of pages
204 are freed to the main allocator from the per-CPU lists, it recognises
205 that as one per-CPU drain even though there is no specific tracepoint
206 for that event
207 o It can aggregate based on PID or individual process number
208 o In the event memory is getting externally fragmented, it reports
209 on whether the fragmentation event was severe or moderate.
210 o When receiving an event about a PID, it can record who the parent was so
211 that if large numbers of events are coming from very short-lived
212 processes, the parent process responsible for creating all the helpers
213 can be identified
214
2156. Lower-Level Analysis with PCL
216================================
217
218There may also be a requirement to identify what functions within a program
219were generating events within the kernel. To begin this sort of analysis, the
220data must be recorded. At the time of writing, this required root
221
222 $ perf record -c 1 \
223 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
224 -e kmem:mm_pagevec_free \
225 ./hackbench 10
226 Time: 0.894
227 [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ]
228
229Note the use of '-c 1' to set the event period to sample. The default sample
230period is quite high to minimise overhead but the information collected can be
231very coarse as a result.
232
233This record outputted a file called perf.data which can be analysed using
234perf report.
235
236 $ perf report
237 # Samples: 30922
238 #
239 # Overhead Command Shared Object
240 # ........ ......... ................................
241 #
242 87.27% hackbench [vdso]
243 6.85% hackbench /lib/i686/cmov/libc-2.9.so
244 2.62% hackbench /lib/ld-2.9.so
245 1.52% perf [vdso]
246 1.22% hackbench ./hackbench
247 0.48% hackbench [kernel]
248 0.02% perf /lib/i686/cmov/libc-2.9.so
249 0.01% perf /usr/bin/perf
250 0.01% perf /lib/ld-2.9.so
251 0.00% hackbench /lib/i686/cmov/libpthread-2.9.so
252 #
253 # (For more details, try: perf report --sort comm,dso,symbol)
254 #
255
256According to this, the vast majority of events were triggered
257within the VDSO. With simple binaries, this will often be the case so let's
258take a slightly different example. In the course of writing this, it was
259noticed that X was generating an insane amount of page allocations so let's look
260at it
261
262 $ perf record -c 1 -f \
263 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \
264 -e kmem:mm_pagevec_free \
265 -p `pidof X`
266
267This was interrupted after a few seconds and
268
269 $ perf report
270 # Samples: 27666
271 #
272 # Overhead Command Shared Object
273 # ........ ....... .......................................
274 #
275 51.95% Xorg [vdso]
276 47.95% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1
277 0.09% Xorg /lib/i686/cmov/libc-2.9.so
278 0.01% Xorg [kernel]
279 #
280 # (For more details, try: perf report --sort comm,dso,symbol)
281 #
282
283So, almost half of the events are occurring in a library. To get an idea of which
284symbol:
285
286 $ perf report --sort comm,dso,symbol
287 # Samples: 27666
288 #
289 # Overhead Command Shared Object Symbol
290 # ........ ....... ....................................... ......
291 #
292 51.95% Xorg [vdso] [.] 0x000000ffffe424
293 47.93% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixmanFillsse2
294 0.09% Xorg /lib/i686/cmov/libc-2.9.so [.] _int_malloc
295 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] pixman_region32_copy_f
296 0.01% Xorg [kernel] [k] read_hpet
297 0.01% Xorg /opt/gfx-test/lib/libpixman-1.so.0.13.1 [.] get_fast_path
298 0.00% Xorg [kernel] [k] ftrace_trace_userstack
299
300To see where within the function pixmanFillsse2 things are going wrong
301
302 $ perf annotate pixmanFillsse2
303 [ ... ]
304 0.00 : 34eeb: 0f 18 08 prefetcht0 (%eax)
305 : }
306 :
307 : extern __inline void __attribute__((__gnu_inline__, __always_inline__, _
308 : _mm_store_si128 (__m128i *__P, __m128i __B) : {
309 : *__P = __B;
310 12.40 : 34eee: 66 0f 7f 80 40 ff ff movdqa %xmm0,-0xc0(%eax)
311 0.00 : 34ef5: ff
312 12.40 : 34ef6: 66 0f 7f 80 50 ff ff movdqa %xmm0,-0xb0(%eax)
313 0.00 : 34efd: ff
314 12.39 : 34efe: 66 0f 7f 80 60 ff ff movdqa %xmm0,-0xa0(%eax)
315 0.00 : 34f05: ff
316 12.67 : 34f06: 66 0f 7f 80 70 ff ff movdqa %xmm0,-0x90(%eax)
317 0.00 : 34f0d: ff
318 12.58 : 34f0e: 66 0f 7f 40 80 movdqa %xmm0,-0x80(%eax)
319 12.31 : 34f13: 66 0f 7f 40 90 movdqa %xmm0,-0x70(%eax)
320 12.40 : 34f18: 66 0f 7f 40 a0 movdqa %xmm0,-0x60(%eax)
321 12.31 : 34f1d: 66 0f 7f 40 b0 movdqa %xmm0,-0x50(%eax)
322
323At a glance, it looks like the time is being spent copying pixmaps to
324the card. Further investigation would be needed to determine why pixmaps
325are being copied around so much but a starting point would be to take an
326ancient build of libpixman out of the library path where it was totally
327forgotten about from months ago!
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 2f77ced35df7..e57d6a9dd32b 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -6,6 +6,8 @@ balance
6 - various information on memory balancing. 6 - various information on memory balancing.
7hugetlbpage.txt 7hugetlbpage.txt
8 - a brief summary of hugetlbpage support in the Linux kernel. 8 - a brief summary of hugetlbpage support in the Linux kernel.
9ksm.txt
10 - how to use the Kernel Samepage Merging feature.
9locking 11locking
10 - info on how locking and synchronization is done in the Linux vm code. 12 - info on how locking and synchronization is done in the Linux vm code.
11numa 13numa
@@ -20,3 +22,5 @@ slabinfo.c
20 - source code for a tool to get reports about slabs. 22 - source code for a tool to get reports about slabs.
21slub.txt 23slub.txt
22 - a short users guide for SLUB. 24 - a short users guide for SLUB.
25map_hugetlb.c
26 - an example program that uses the MAP_HUGETLB mmap flag.
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index ea8714fcc3ad..82a7bd1800b2 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -18,13 +18,13 @@ First the Linux kernel needs to be built with the CONFIG_HUGETLBFS
18automatically when CONFIG_HUGETLBFS is selected) configuration 18automatically when CONFIG_HUGETLBFS is selected) configuration
19options. 19options.
20 20
21The kernel built with hugepage support should show the number of configured 21The kernel built with huge page support should show the number of configured
22hugepages in the system by running the "cat /proc/meminfo" command. 22huge pages in the system by running the "cat /proc/meminfo" command.
23 23
24/proc/meminfo also provides information about the total number of hugetlb 24/proc/meminfo also provides information about the total number of hugetlb
25pages configured in the kernel. It also displays information about the 25pages configured in the kernel. It also displays information about the
26number of free hugetlb pages at any time. It also displays information about 26number of free hugetlb pages at any time. It also displays information about
27the configured hugepage size - this is needed for generating the proper 27the configured huge page size - this is needed for generating the proper
28alignment and size of the arguments to the above system calls. 28alignment and size of the arguments to the above system calls.
29 29
30The output of "cat /proc/meminfo" will have lines like: 30The output of "cat /proc/meminfo" will have lines like:
@@ -37,25 +37,27 @@ HugePages_Surp: yyy
37Hugepagesize: zzz kB 37Hugepagesize: zzz kB
38 38
39where: 39where:
40HugePages_Total is the size of the pool of hugepages. 40HugePages_Total is the size of the pool of huge pages.
41HugePages_Free is the number of hugepages in the pool that are not yet 41HugePages_Free is the number of huge pages in the pool that are not yet
42allocated. 42 allocated.
43HugePages_Rsvd is short for "reserved," and is the number of hugepages 43HugePages_Rsvd is short for "reserved," and is the number of huge pages for
44for which a commitment to allocate from the pool has been made, but no 44 which a commitment to allocate from the pool has been made,
45allocation has yet been made. It's vaguely analogous to overcommit. 45 but no allocation has yet been made. Reserved huge pages
46HugePages_Surp is short for "surplus," and is the number of hugepages in 46 guarantee that an application will be able to allocate a
47the pool above the value in /proc/sys/vm/nr_hugepages. The maximum 47 huge page from the pool of huge pages at fault time.
48number of surplus hugepages is controlled by 48HugePages_Surp is short for "surplus," and is the number of huge pages in
49/proc/sys/vm/nr_overcommit_hugepages. 49 the pool above the value in /proc/sys/vm/nr_hugepages. The
50 maximum number of surplus huge pages is controlled by
51 /proc/sys/vm/nr_overcommit_hugepages.
50 52
51/proc/filesystems should also show a filesystem of type "hugetlbfs" configured 53/proc/filesystems should also show a filesystem of type "hugetlbfs" configured
52in the kernel. 54in the kernel.
53 55
54/proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb 56/proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb
55pages in the kernel. Super user can dynamically request more (or free some 57pages in the kernel. Super user can dynamically request more (or free some
56pre-configured) hugepages. 58pre-configured) huge pages.
57The allocation (or deallocation) of hugetlb pages is possible only if there are 59The allocation (or deallocation) of hugetlb pages is possible only if there are
58enough physically contiguous free pages in system (freeing of hugepages is 60enough physically contiguous free pages in system (freeing of huge pages is
59possible only if there are enough hugetlb pages free that can be transferred 61possible only if there are enough hugetlb pages free that can be transferred
60back to regular memory pool). 62back to regular memory pool).
61 63
@@ -67,43 +69,82 @@ use either the mmap system call or shared memory system calls to start using
67the huge pages. It is required that the system administrator preallocate 69the huge pages. It is required that the system administrator preallocate
68enough memory for huge page purposes. 70enough memory for huge page purposes.
69 71
70Use the following command to dynamically allocate/deallocate hugepages: 72The administrator can preallocate huge pages on the kernel boot command line by
73specifying the "hugepages=N" parameter, where 'N' = the number of huge pages
74requested. This is the most reliable method for preallocating huge pages as
75memory has not yet become fragmented.
76
77Some platforms support multiple huge page sizes. To preallocate huge pages
78of a specific size, one must precede the huge pages boot command parameters
79with a huge page size selection parameter "hugepagesz=<size>". <size> must
80be specified in bytes with optional scale suffix [kKmMgG]. The default huge
81page size may be selected with the "default_hugepagesz=<size>" boot parameter.
82
83/proc/sys/vm/nr_hugepages indicates the current number of configured [default
84size] hugetlb pages in the kernel. Super user can dynamically request more
85(or free some pre-configured) huge pages.
86
87Use the following command to dynamically allocate/deallocate default sized
88huge pages:
71 89
72 echo 20 > /proc/sys/vm/nr_hugepages 90 echo 20 > /proc/sys/vm/nr_hugepages
73 91
74This command will try to configure 20 hugepages in the system. The success 92This command will try to configure 20 default sized huge pages in the system.
75or failure of allocation depends on the amount of physically contiguous 93On a NUMA platform, the kernel will attempt to distribute the huge page pool
76memory that is preset in system at this time. System administrators may want 94over all the on-line nodes. These huge pages, allocated when nr_hugepages
77to put this command in one of the local rc init files. This will enable the 95is increased, are called "persistent huge pages".
78kernel to request huge pages early in the boot process (when the possibility 96
79of getting physical contiguous pages is still very high). In either 97The success or failure of huge page allocation depends on the amount of
80case, administrators will want to verify the number of hugepages actually 98physically contiguous memory that is present in the system at the time of the
81allocated by checking the sysctl or meminfo. 99allocation attempt. If the kernel is unable to allocate huge pages from
82 100some nodes in a NUMA system, it will attempt to make up the difference by
83/proc/sys/vm/nr_overcommit_hugepages indicates how large the pool of 101allocating extra pages on other nodes with sufficient available contiguous
84hugepages can grow, if more hugepages than /proc/sys/vm/nr_hugepages are 102memory, if any.
85requested by applications. echo'ing any non-zero value into this file 103
86indicates that the hugetlb subsystem is allowed to try to obtain 104System administrators may want to put this command in one of the local rc init
87hugepages from the buddy allocator, if the normal pool is exhausted. As 105files. This will enable the kernel to request huge pages early in the boot
88these surplus hugepages go out of use, they are freed back to the buddy 106process when the possibility of getting physical contiguous pages is still
107very high. Administrators can verify the number of huge pages actually
108allocated by checking the sysctl or meminfo. To check the per node
109distribution of huge pages in a NUMA system, use:
110
111 cat /sys/devices/system/node/node*/meminfo | fgrep Huge
112
113/proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of
114huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are
115requested by applications. Writing any non-zero value into this file
116indicates that the hugetlb subsystem is allowed to try to obtain "surplus"
117huge pages from the buddy allocator, when the normal pool is exhausted. As
118these surplus huge pages go out of use, they are freed back to the buddy
89allocator. 119allocator.
90 120
121When increasing the huge page pool size via nr_hugepages, any surplus
122pages will first be promoted to persistent huge pages. Then, additional
123huge pages will be allocated, if necessary and if possible, to fulfill
124the new huge page pool size.
125
126The administrator may shrink the pool of preallocated huge pages for
127the default huge page size by setting the nr_hugepages sysctl to a
128smaller value. The kernel will attempt to balance the freeing of huge pages
129across all on-line nodes. Any free huge pages on the selected nodes will
130be freed back to the buddy allocator.
131
91Caveat: Shrinking the pool via nr_hugepages such that it becomes less 132Caveat: Shrinking the pool via nr_hugepages such that it becomes less
92than the number of hugepages in use will convert the balance to surplus 133than the number of huge pages in use will convert the balance to surplus
93huge pages even if it would exceed the overcommit value. As long as 134huge pages even if it would exceed the overcommit value. As long as
94this condition holds, however, no more surplus huge pages will be 135this condition holds, however, no more surplus huge pages will be
95allowed on the system until one of the two sysctls is increased 136allowed on the system until one of the two sysctls is increased
96sufficiently, or the surplus huge pages go out of use and are freed. 137sufficiently, or the surplus huge pages go out of use and are freed.
97 138
98With support for multiple hugepage pools at run-time available, much of 139With support for multiple huge page pools at run-time available, much of
99the hugepage userspace interface has been duplicated in sysfs. The above 140the huge page userspace interface has been duplicated in sysfs. The above
100information applies to the default hugepage size (which will be 141information applies to the default huge page size which will be
101controlled by the proc interfaces for backwards compatibility). The root 142controlled by the /proc interfaces for backwards compatibility. The root
102hugepage control directory is 143huge page control directory in sysfs is:
103 144
104 /sys/kernel/mm/hugepages 145 /sys/kernel/mm/hugepages
105 146
106For each hugepage size supported by the running kernel, a subdirectory 147For each huge page size supported by the running kernel, a subdirectory
107will exist, of the form 148will exist, of the form
108 149
109 hugepages-${size}kB 150 hugepages-${size}kB
@@ -116,9 +157,9 @@ Inside each of these directories, the same set of files will exist:
116 resv_hugepages 157 resv_hugepages
117 surplus_hugepages 158 surplus_hugepages
118 159
119which function as described above for the default hugepage-sized case. 160which function as described above for the default huge page-sized case.
120 161
121If the user applications are going to request hugepages using mmap system 162If the user applications are going to request huge pages using mmap system
122call, then it is required that system administrator mount a file system of 163call, then it is required that system administrator mount a file system of
123type hugetlbfs: 164type hugetlbfs:
124 165
@@ -127,7 +168,7 @@ type hugetlbfs:
127 none /mnt/huge 168 none /mnt/huge
128 169
129This command mounts a (pseudo) filesystem of type hugetlbfs on the directory 170This command mounts a (pseudo) filesystem of type hugetlbfs on the directory
130/mnt/huge. Any files created on /mnt/huge uses hugepages. The uid and gid 171/mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid
131options sets the owner and group of the root of the file system. By default 172options sets the owner and group of the root of the file system. By default
132the uid and gid of the current process are taken. The mode option sets the 173the uid and gid of the current process are taken. The mode option sets the
133mode of root of file system to value & 0777. This value is given in octal. 174mode of root of file system to value & 0777. This value is given in octal.
@@ -146,24 +187,26 @@ Regular chown, chgrp, and chmod commands (with right permissions) could be
146used to change the file attributes on hugetlbfs. 187used to change the file attributes on hugetlbfs.
147 188
148Also, it is important to note that no such mount command is required if the 189Also, it is important to note that no such mount command is required if the
149applications are going to use only shmat/shmget system calls. Users who 190applications are going to use only shmat/shmget system calls or mmap with
150wish to use hugetlb page via shared memory segment should be a member of 191MAP_HUGETLB. Users who wish to use hugetlb page via shared memory segment
151a supplementary group and system admin needs to configure that gid into 192should be a member of a supplementary group and system admin needs to
152/proc/sys/vm/hugetlb_shm_group. It is possible for same or different 193configure that gid into /proc/sys/vm/hugetlb_shm_group. It is possible for
153applications to use any combination of mmaps and shm* calls, though the 194same or different applications to use any combination of mmaps and shm*
154mount of filesystem will be required for using mmap calls. 195calls, though the mount of filesystem will be required for using mmap calls
196without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see
197map_hugetlb.c.
155 198
156******************************************************************* 199*******************************************************************
157 200
158/* 201/*
159 * Example of using hugepage memory in a user application using Sys V shared 202 * Example of using huge page memory in a user application using Sys V shared
160 * memory system calls. In this example the app is requesting 256MB of 203 * memory system calls. In this example the app is requesting 256MB of
161 * memory that is backed by huge pages. The application uses the flag 204 * memory that is backed by huge pages. The application uses the flag
162 * SHM_HUGETLB in the shmget system call to inform the kernel that it is 205 * SHM_HUGETLB in the shmget system call to inform the kernel that it is
163 * requesting hugepages. 206 * requesting huge pages.
164 * 207 *
165 * For the ia64 architecture, the Linux kernel reserves Region number 4 for 208 * For the ia64 architecture, the Linux kernel reserves Region number 4 for
166 * hugepages. That means the addresses starting with 0x800000... will need 209 * huge pages. That means the addresses starting with 0x800000... will need
167 * to be specified. Specifying a fixed address is not required on ppc64, 210 * to be specified. Specifying a fixed address is not required on ppc64,
168 * i386 or x86_64. 211 * i386 or x86_64.
169 * 212 *
@@ -252,14 +295,14 @@ int main(void)
252******************************************************************* 295*******************************************************************
253 296
254/* 297/*
255 * Example of using hugepage memory in a user application using the mmap 298 * Example of using huge page memory in a user application using the mmap
256 * system call. Before running this application, make sure that the 299 * system call. Before running this application, make sure that the
257 * administrator has mounted the hugetlbfs filesystem (on some directory 300 * administrator has mounted the hugetlbfs filesystem (on some directory
258 * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this 301 * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
259 * example, the app is requesting memory of size 256MB that is backed by 302 * example, the app is requesting memory of size 256MB that is backed by
260 * huge pages. 303 * huge pages.
261 * 304 *
262 * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. 305 * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages.
263 * That means the addresses starting with 0x800000... will need to be 306 * That means the addresses starting with 0x800000... will need to be
264 * specified. Specifying a fixed address is not required on ppc64, i386 307 * specified. Specifying a fixed address is not required on ppc64, i386
265 * or x86_64. 308 * or x86_64.
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
new file mode 100644
index 000000000000..72a22f65960e
--- /dev/null
+++ b/Documentation/vm/ksm.txt
@@ -0,0 +1,89 @@
1How to use the Kernel Samepage Merging feature
2----------------------------------------------
3
4KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
5added to the Linux kernel in 2.6.32. See mm/ksm.c for its implementation,
6and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
7
8The KSM daemon ksmd periodically scans those areas of user memory which
9have been registered with it, looking for pages of identical content which
10can be replaced by a single write-protected page (which is automatically
11copied if a process later wants to update its content).
12
13KSM was originally developed for use with KVM (where it was known as
14Kernel Shared Memory), to fit more virtual machines into physical memory,
15by sharing the data common between them. But it can be useful to any
16application which generates many instances of the same data.
17
18KSM only merges anonymous (private) pages, never pagecache (file) pages.
19KSM's merged pages are at present locked into kernel memory for as long
20as they are shared: so cannot be swapped out like the user pages they
21replace (but swapping KSM pages should follow soon in a later release).
22
23KSM only operates on those areas of address space which an application
24has advised to be likely candidates for merging, by using the madvise(2)
25system call: int madvise(addr, length, MADV_MERGEABLE).
26
27The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel
28that advice and restore unshared pages: whereupon KSM unmerges whatever
29it merged in that range. Note: this unmerging call may suddenly require
30more memory than is available - possibly failing with EAGAIN, but more
31probably arousing the Out-Of-Memory killer.
32
33If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
34and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was
35built with CONFIG_KSM=y, those calls will normally succeed: even if the
36KSM daemon is not currently running, MADV_MERGEABLE still registers
37the range for whenever the KSM daemon is started; even if the range
38cannot contain any pages which KSM could actually merge; even if
39MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
40
41Like other madvise calls, they are intended for use on mapped areas of
42the user address space: they will report ENOMEM if the specified range
43includes unmapped gaps (though working on the intervening mapped areas),
44and might fail with EAGAIN if not enough memory for internal structures.
45
46Applications should be considerate in their use of MADV_MERGEABLE,
47restricting its use to areas likely to benefit. KSM's scans may use
48a lot of processing power, and its kernel-resident pages are a limited
49resource. Some installations will disable KSM for these reasons.
50
51The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/,
52readable by all but writable only by root:
53
54max_kernel_pages - set to maximum number of kernel pages that KSM may use
55 e.g. "echo 2000 > /sys/kernel/mm/ksm/max_kernel_pages"
56 Value 0 imposes no limit on the kernel pages KSM may use;
57 but note that any process using MADV_MERGEABLE can cause
58 KSM to allocate these pages, unswappable until it exits.
59 Default: 2000 (chosen for demonstration purposes)
60
61pages_to_scan - how many present pages to scan before ksmd goes to sleep
62 e.g. "echo 200 > /sys/kernel/mm/ksm/pages_to_scan"
63 Default: 200 (chosen for demonstration purposes)
64
65sleep_millisecs - how many milliseconds ksmd should sleep before next scan
66 e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
67 Default: 20 (chosen for demonstration purposes)
68
69run - set 0 to stop ksmd from running but keep merged pages,
70 set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
71 set 2 to stop ksmd and unmerge all pages currently merged,
72 but leave mergeable areas registered for next run
73 Default: 1 (for immediate use by apps which register)
74
75The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/:
76
77pages_shared - how many shared unswappable kernel pages KSM is using
78pages_sharing - how many more sites are sharing them i.e. how much saved
79pages_unshared - how many pages unique but repeatedly checked for merging
80pages_volatile - how many pages changing too fast to be placed in a tree
81full_scans - how many times all mergeable areas have been scanned
82
83A high ratio of pages_sharing to pages_shared indicates good sharing, but
84a high ratio of pages_unshared to pages_sharing indicates wasted effort.
85pages_volatile embraces several different kinds of activity, but a high
86proportion there would also indicate poor use of madvise MADV_MERGEABLE.
87
88Izik Eidus,
89Hugh Dickins, 30 July 2009
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c
new file mode 100644
index 000000000000..e2bdae37f499
--- /dev/null
+++ b/Documentation/vm/map_hugetlb.c
@@ -0,0 +1,77 @@
1/*
2 * Example of using hugepage memory in a user application using the mmap
3 * system call with MAP_HUGETLB flag. Before running this program make
4 * sure the administrator has allocated enough default sized huge pages
5 * to cover the 256 MB allocation.
6 *
7 * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
8 * That means the addresses starting with 0x800000... will need to be
9 * specified. Specifying a fixed address is not required on ppc64, i386
10 * or x86_64.
11 */
12#include <stdlib.h>
13#include <stdio.h>
14#include <unistd.h>
15#include <sys/mman.h>
16#include <fcntl.h>
17
18#define LENGTH (256UL*1024*1024)
19#define PROTECTION (PROT_READ | PROT_WRITE)
20
21#ifndef MAP_HUGETLB
22#define MAP_HUGETLB 0x40
23#endif
24
25/* Only ia64 requires this */
26#ifdef __ia64__
27#define ADDR (void *)(0x8000000000000000UL)
28#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
29#else
30#define ADDR (void *)(0x0UL)
31#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
32#endif
33
34void check_bytes(char *addr)
35{
36 printf("First hex is %x\n", *((unsigned int *)addr));
37}
38
39void write_bytes(char *addr)
40{
41 unsigned long i;
42
43 for (i = 0; i < LENGTH; i++)
44 *(addr + i) = (char)i;
45}
46
47void read_bytes(char *addr)
48{
49 unsigned long i;
50
51 check_bytes(addr);
52 for (i = 0; i < LENGTH; i++)
53 if (*(addr + i) != (char)i) {
54 printf("Mismatch at %lu\n", i);
55 break;
56 }
57}
58
59int main(void)
60{
61 void *addr;
62
63 addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0);
64 if (addr == MAP_FAILED) {
65 perror("mmap");
66 exit(1);
67 }
68
69 printf("Returned address is %p\n", addr);
70 check_bytes(addr);
71 write_bytes(addr);
72 read_bytes(addr);
73
74 munmap(addr, LENGTH);
75
76 return 0;
77}