diff options
author | Russell King <rmk+kernel@arm.linux.org.uk> | 2009-09-24 16:22:33 -0400 |
---|---|---|
committer | Russell King <rmk+kernel@arm.linux.org.uk> | 2009-09-24 16:22:33 -0400 |
commit | baea7b946f00a291b166ccae7fcfed6c01530cc6 (patch) | |
tree | 4aa275fbdbec9c7b9b4629e8bee2bbecd3c6a6af /Documentation/vm | |
parent | ae19ffbadc1b2100285a5b5b3d0a4e0a11390904 (diff) | |
parent | 94e0fb086fc5663c38bbc0fe86d698be8314f82f (diff) |
Merge branch 'origin' into for-linus
Conflicts:
MAINTAINERS
Diffstat (limited to 'Documentation/vm')
-rw-r--r-- | Documentation/vm/.gitignore | 1 | ||||
-rw-r--r-- | Documentation/vm/00-INDEX | 4 | ||||
-rw-r--r-- | Documentation/vm/hugetlbpage.txt | 147 | ||||
-rw-r--r-- | Documentation/vm/ksm.txt | 89 | ||||
-rw-r--r-- | Documentation/vm/locking | 2 | ||||
-rw-r--r-- | Documentation/vm/map_hugetlb.c | 77 | ||||
-rw-r--r-- | Documentation/vm/page-types.c | 248 | ||||
-rw-r--r-- | Documentation/vm/slabinfo.c | 68 |
8 files changed, 505 insertions, 131 deletions
diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore index 33e8a023df02..09b164a5700f 100644 --- a/Documentation/vm/.gitignore +++ b/Documentation/vm/.gitignore | |||
@@ -1 +1,2 @@ | |||
1 | page-types | ||
1 | slabinfo | 2 | slabinfo |
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 2f77ced35df7..e57d6a9dd32b 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX | |||
@@ -6,6 +6,8 @@ balance | |||
6 | - various information on memory balancing. | 6 | - various information on memory balancing. |
7 | hugetlbpage.txt | 7 | hugetlbpage.txt |
8 | - a brief summary of hugetlbpage support in the Linux kernel. | 8 | - a brief summary of hugetlbpage support in the Linux kernel. |
9 | ksm.txt | ||
10 | - how to use the Kernel Samepage Merging feature. | ||
9 | locking | 11 | locking |
10 | - info on how locking and synchronization is done in the Linux vm code. | 12 | - info on how locking and synchronization is done in the Linux vm code. |
11 | numa | 13 | numa |
@@ -20,3 +22,5 @@ slabinfo.c | |||
20 | - source code for a tool to get reports about slabs. | 22 | - source code for a tool to get reports about slabs. |
21 | slub.txt | 23 | slub.txt |
22 | - a short users guide for SLUB. | 24 | - a short users guide for SLUB. |
25 | map_hugetlb.c | ||
26 | - an example program that uses the MAP_HUGETLB mmap flag. | ||
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index ea8714fcc3ad..82a7bd1800b2 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt | |||
@@ -18,13 +18,13 @@ First the Linux kernel needs to be built with the CONFIG_HUGETLBFS | |||
18 | automatically when CONFIG_HUGETLBFS is selected) configuration | 18 | automatically when CONFIG_HUGETLBFS is selected) configuration |
19 | options. | 19 | options. |
20 | 20 | ||
21 | The kernel built with hugepage support should show the number of configured | 21 | The kernel built with huge page support should show the number of configured |
22 | hugepages in the system by running the "cat /proc/meminfo" command. | 22 | huge pages in the system by running the "cat /proc/meminfo" command. |
23 | 23 | ||
24 | /proc/meminfo also provides information about the total number of hugetlb | 24 | /proc/meminfo also provides information about the total number of hugetlb |
25 | pages configured in the kernel. It also displays information about the | 25 | pages configured in the kernel. It also displays information about the |
26 | number of free hugetlb pages at any time. It also displays information about | 26 | number of free hugetlb pages at any time. It also displays information about |
27 | the configured hugepage size - this is needed for generating the proper | 27 | the configured huge page size - this is needed for generating the proper |
28 | alignment and size of the arguments to the above system calls. | 28 | alignment and size of the arguments to the above system calls. |
29 | 29 | ||
30 | The output of "cat /proc/meminfo" will have lines like: | 30 | The output of "cat /proc/meminfo" will have lines like: |
@@ -37,25 +37,27 @@ HugePages_Surp: yyy | |||
37 | Hugepagesize: zzz kB | 37 | Hugepagesize: zzz kB |
38 | 38 | ||
39 | where: | 39 | where: |
40 | HugePages_Total is the size of the pool of hugepages. | 40 | HugePages_Total is the size of the pool of huge pages. |
41 | HugePages_Free is the number of hugepages in the pool that are not yet | 41 | HugePages_Free is the number of huge pages in the pool that are not yet |
42 | allocated. | 42 | allocated. |
43 | HugePages_Rsvd is short for "reserved," and is the number of hugepages | 43 | HugePages_Rsvd is short for "reserved," and is the number of huge pages for |
44 | for which a commitment to allocate from the pool has been made, but no | 44 | which a commitment to allocate from the pool has been made, |
45 | allocation has yet been made. It's vaguely analogous to overcommit. | 45 | but no allocation has yet been made. Reserved huge pages |
46 | HugePages_Surp is short for "surplus," and is the number of hugepages in | 46 | guarantee that an application will be able to allocate a |
47 | the pool above the value in /proc/sys/vm/nr_hugepages. The maximum | 47 | huge page from the pool of huge pages at fault time. |
48 | number of surplus hugepages is controlled by | 48 | HugePages_Surp is short for "surplus," and is the number of huge pages in |
49 | /proc/sys/vm/nr_overcommit_hugepages. | 49 | the pool above the value in /proc/sys/vm/nr_hugepages. The |
50 | maximum number of surplus huge pages is controlled by | ||
51 | /proc/sys/vm/nr_overcommit_hugepages. | ||
50 | 52 | ||
51 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured | 53 | /proc/filesystems should also show a filesystem of type "hugetlbfs" configured |
52 | in the kernel. | 54 | in the kernel. |
53 | 55 | ||
54 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb | 56 | /proc/sys/vm/nr_hugepages indicates the current number of configured hugetlb |
55 | pages in the kernel. Super user can dynamically request more (or free some | 57 | pages in the kernel. Super user can dynamically request more (or free some |
56 | pre-configured) hugepages. | 58 | pre-configured) huge pages. |
57 | The allocation (or deallocation) of hugetlb pages is possible only if there are | 59 | The allocation (or deallocation) of hugetlb pages is possible only if there are |
58 | enough physically contiguous free pages in system (freeing of hugepages is | 60 | enough physically contiguous free pages in system (freeing of huge pages is |
59 | possible only if there are enough hugetlb pages free that can be transferred | 61 | possible only if there are enough hugetlb pages free that can be transferred |
60 | back to regular memory pool). | 62 | back to regular memory pool). |
61 | 63 | ||
@@ -67,43 +69,82 @@ use either the mmap system call or shared memory system calls to start using | |||
67 | the huge pages. It is required that the system administrator preallocate | 69 | the huge pages. It is required that the system administrator preallocate |
68 | enough memory for huge page purposes. | 70 | enough memory for huge page purposes. |
69 | 71 | ||
70 | Use the following command to dynamically allocate/deallocate hugepages: | 72 | The administrator can preallocate huge pages on the kernel boot command line by |
73 | specifying the "hugepages=N" parameter, where 'N' = the number of huge pages | ||
74 | requested. This is the most reliable method for preallocating huge pages as | ||
75 | memory has not yet become fragmented. | ||
76 | |||
77 | Some platforms support multiple huge page sizes. To preallocate huge pages | ||
78 | of a specific size, one must precede the huge pages boot command parameters | ||
79 | with a huge page size selection parameter "hugepagesz=<size>". <size> must | ||
80 | be specified in bytes with optional scale suffix [kKmMgG]. The default huge | ||
81 | page size may be selected with the "default_hugepagesz=<size>" boot parameter. | ||
82 | |||
83 | /proc/sys/vm/nr_hugepages indicates the current number of configured [default | ||
84 | size] hugetlb pages in the kernel. Super user can dynamically request more | ||
85 | (or free some pre-configured) huge pages. | ||
86 | |||
87 | Use the following command to dynamically allocate/deallocate default sized | ||
88 | huge pages: | ||
71 | 89 | ||
72 | echo 20 > /proc/sys/vm/nr_hugepages | 90 | echo 20 > /proc/sys/vm/nr_hugepages |
73 | 91 | ||
74 | This command will try to configure 20 hugepages in the system. The success | 92 | This command will try to configure 20 default sized huge pages in the system. |
75 | or failure of allocation depends on the amount of physically contiguous | 93 | On a NUMA platform, the kernel will attempt to distribute the huge page pool |
76 | memory that is present in system at this time. System administrators may want | 94 | over all the on-line nodes. These huge pages, allocated when nr_hugepages |
77 | to put this command in one of the local rc init files. This will enable the | 95 | is increased, are called "persistent huge pages". |
78 | kernel to request huge pages early in the boot process (when the possibility | 96 | |
79 | of getting physical contiguous pages is still very high). In either | 97 | The success or failure of huge page allocation depends on the amount of |
80 | case, administrators will want to verify the number of hugepages actually | 98 | physically contiguous memory that is present in system at the time of the |
81 | allocated by checking the sysctl or meminfo. | 99 | allocation attempt. If the kernel is unable to allocate huge pages from |
82 | 100 | some nodes in a NUMA system, it will attempt to make up the difference by | |
83 | /proc/sys/vm/nr_overcommit_hugepages indicates how large the pool of | 101 | allocating extra pages on other nodes with sufficient available contiguous |
84 | hugepages can grow, if more hugepages than /proc/sys/vm/nr_hugepages are | 102 | memory, if any. |
85 | requested by applications. echo'ing any non-zero value into this file | 103 | |
86 | indicates that the hugetlb subsystem is allowed to try to obtain | 104 | System administrators may want to put this command in one of the local rc init |
87 | hugepages from the buddy allocator, if the normal pool is exhausted. As | 105 | files. This will enable the kernel to request huge pages early in the boot |
88 | these surplus hugepages go out of use, they are freed back to the buddy | 106 | process when the possibility of getting physical contiguous pages is still |
107 | very high. Administrators can verify the number of huge pages actually | ||
108 | allocated by checking the sysctl or meminfo. To check the per node | ||
109 | distribution of huge pages in a NUMA system, use: | ||
110 | |||
111 | cat /sys/devices/system/node/node*/meminfo | fgrep Huge | ||
112 | |||
113 | /proc/sys/vm/nr_overcommit_hugepages specifies how large the pool of | ||
114 | huge pages can grow, if more huge pages than /proc/sys/vm/nr_hugepages are | ||
115 | requested by applications. Writing any non-zero value into this file | ||
116 | indicates that the hugetlb subsystem is allowed to try to obtain "surplus" | ||
117 | huge pages from the buddy allocator, when the normal pool is exhausted. As | ||
118 | these surplus huge pages go out of use, they are freed back to the buddy | ||
89 | allocator. | 119 | allocator. |
90 | 120 | ||
121 | When increasing the huge page pool size via nr_hugepages, any surplus | ||
122 | pages will first be promoted to persistent huge pages. Then, additional | ||
123 | huge pages will be allocated, if necessary and if possible, to fulfill | ||
124 | the new huge page pool size. | ||
125 | |||
126 | The administrator may shrink the pool of preallocated huge pages for | ||
127 | the default huge page size by setting the nr_hugepages sysctl to a | ||
128 | smaller value. The kernel will attempt to balance the freeing of huge pages | ||
129 | across all on-line nodes. Any free huge pages on the selected nodes will | ||
130 | be freed back to the buddy allocator. | ||
131 | |||
91 | Caveat: Shrinking the pool via nr_hugepages such that it becomes less | 132 | Caveat: Shrinking the pool via nr_hugepages such that it becomes less |
92 | than the number of hugepages in use will convert the balance to surplus | 133 | than the number of huge pages in use will convert the balance to surplus |
93 | huge pages even if it would exceed the overcommit value. As long as | 134 | huge pages even if it would exceed the overcommit value. As long as |
94 | this condition holds, however, no more surplus huge pages will be | 135 | this condition holds, however, no more surplus huge pages will be |
95 | allowed on the system until one of the two sysctls is increased | 136 | allowed on the system until one of the two sysctls is increased |
96 | sufficiently, or the surplus huge pages go out of use and are freed. | 137 | sufficiently, or the surplus huge pages go out of use and are freed. |
97 | 138 | ||
98 | With support for multiple hugepage pools at run-time available, much of | 139 | With support for multiple huge page pools at run-time available, much of |
99 | the hugepage userspace interface has been duplicated in sysfs. The above | 140 | the huge page userspace interface has been duplicated in sysfs. The above |
100 | information applies to the default hugepage size (which will be | 141 | information applies to the default huge page size which will be |
101 | controlled by the proc interfaces for backwards compatibility). The root | 142 | controlled by the /proc interfaces for backwards compatibility. The root |
102 | hugepage control directory is | 143 | huge page control directory in sysfs is: |
103 | 144 | ||
104 | /sys/kernel/mm/hugepages | 145 | /sys/kernel/mm/hugepages |
105 | 146 | ||
106 | For each hugepage size supported by the running kernel, a subdirectory | 147 | For each huge page size supported by the running kernel, a subdirectory |
107 | will exist, of the form | 148 | will exist, of the form |
108 | 149 | ||
109 | hugepages-${size}kB | 150 | hugepages-${size}kB |
@@ -116,9 +157,9 @@ Inside each of these directories, the same set of files will exist: | |||
116 | resv_hugepages | 157 | resv_hugepages |
117 | surplus_hugepages | 158 | surplus_hugepages |
118 | 159 | ||
119 | which function as described above for the default hugepage-sized case. | 160 | which function as described above for the default huge page-sized case. |
120 | 161 | ||
121 | If the user applications are going to request hugepages using mmap system | 162 | If the user applications are going to request huge pages using mmap system |
122 | call, then it is required that system administrator mount a file system of | 163 | call, then it is required that system administrator mount a file system of |
123 | type hugetlbfs: | 164 | type hugetlbfs: |
124 | 165 | ||
@@ -127,7 +168,7 @@ type hugetlbfs: | |||
127 | none /mnt/huge | 168 | none /mnt/huge |
128 | 169 | ||
129 | This command mounts a (pseudo) filesystem of type hugetlbfs on the directory | 170 | This command mounts a (pseudo) filesystem of type hugetlbfs on the directory |
130 | /mnt/huge. Any files created on /mnt/huge use hugepages. The uid and gid | 171 | /mnt/huge. Any files created on /mnt/huge use huge pages. The uid and gid |
131 | options set the owner and group of the root of the file system. By default | 172 | options set the owner and group of the root of the file system. By default |
132 | the uid and gid of the current process are taken. The mode option sets the | 173 | the uid and gid of the current process are taken. The mode option sets the |
133 | mode of root of file system to value & 0777. This value is given in octal. | 174 | mode of root of file system to value & 0777. This value is given in octal. |
@@ -146,24 +187,26 @@ Regular chown, chgrp, and chmod commands (with right permissions) could be | |||
146 | used to change the file attributes on hugetlbfs. | 187 | used to change the file attributes on hugetlbfs. |
147 | 188 | ||
148 | Also, it is important to note that no such mount command is required if the | 189 | Also, it is important to note that no such mount command is required if the |
149 | applications are going to use only shmat/shmget system calls. Users who | 190 | applications are going to use only shmat/shmget system calls or mmap with |
150 | wish to use hugetlb page via shared memory segment should be a member of | 191 | MAP_HUGETLB. Users who wish to use hugetlb page via shared memory segment |
151 | a supplementary group and system admin needs to configure that gid into | 192 | should be a member of a supplementary group and system admin needs to |
152 | /proc/sys/vm/hugetlb_shm_group. It is possible for same or different | 193 | configure that gid into /proc/sys/vm/hugetlb_shm_group. It is possible for |
153 | applications to use any combination of mmaps and shm* calls, though the | 194 | same or different applications to use any combination of mmaps and shm* |
154 | mount of filesystem will be required for using mmap calls. | 195 | calls, though the mount of filesystem will be required for using mmap calls |
196 | without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see | ||
197 | map_hugetlb.c. | ||
155 | 198 | ||
156 | ******************************************************************* | 199 | ******************************************************************* |
157 | 200 | ||
158 | /* | 201 | /* |
159 | * Example of using hugepage memory in a user application using Sys V shared | 202 | * Example of using huge page memory in a user application using Sys V shared |
160 | * memory system calls. In this example the app is requesting 256MB of | 203 | * memory system calls. In this example the app is requesting 256MB of |
161 | * memory that is backed by huge pages. The application uses the flag | 204 | * memory that is backed by huge pages. The application uses the flag |
162 | * SHM_HUGETLB in the shmget system call to inform the kernel that it is | 205 | * SHM_HUGETLB in the shmget system call to inform the kernel that it is |
163 | * requesting hugepages. | 206 | * requesting huge pages. |
164 | * | 207 | * |
165 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for | 208 | * For the ia64 architecture, the Linux kernel reserves Region number 4 for |
166 | * hugepages. That means the addresses starting with 0x800000... will need | 209 | * huge pages. That means the addresses starting with 0x800000... will need |
167 | * to be specified. Specifying a fixed address is not required on ppc64, | 210 | * to be specified. Specifying a fixed address is not required on ppc64, |
168 | * i386 or x86_64. | 211 | * i386 or x86_64. |
169 | * | 212 | * |
@@ -252,14 +295,14 @@ int main(void) | |||
252 | ******************************************************************* | 295 | ******************************************************************* |
253 | 296 | ||
254 | /* | 297 | /* |
255 | * Example of using hugepage memory in a user application using the mmap | 298 | * Example of using huge page memory in a user application using the mmap |
256 | * system call. Before running this application, make sure that the | 299 | * system call. Before running this application, make sure that the |
257 | * administrator has mounted the hugetlbfs filesystem (on some directory | 300 | * administrator has mounted the hugetlbfs filesystem (on some directory |
258 | * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this | 301 | * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this |
259 | * example, the app is requesting memory of size 256MB that is backed by | 302 | * example, the app is requesting memory of size 256MB that is backed by |
260 | * huge pages. | 303 | * huge pages. |
261 | * | 304 | * |
262 | * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. | 305 | * For ia64 architecture, Linux kernel reserves Region number 4 for huge pages. |
263 | * That means the addresses starting with 0x800000... will need to be | 306 | * That means the addresses starting with 0x800000... will need to be |
264 | * specified. Specifying a fixed address is not required on ppc64, i386 | 307 | * specified. Specifying a fixed address is not required on ppc64, i386 |
265 | * or x86_64. | 308 | * or x86_64. |
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt new file mode 100644 index 000000000000..72a22f65960e --- /dev/null +++ b/Documentation/vm/ksm.txt | |||
@@ -0,0 +1,89 @@ | |||
1 | How to use the Kernel Samepage Merging feature | ||
2 | ---------------------------------------------- | ||
3 | |||
4 | KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y, | ||
5 | added to the Linux kernel in 2.6.32. See mm/ksm.c for its implementation, | ||
6 | and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/ | ||
7 | |||
8 | The KSM daemon ksmd periodically scans those areas of user memory which | ||
9 | have been registered with it, looking for pages of identical content which | ||
10 | can be replaced by a single write-protected page (which is automatically | ||
11 | copied if a process later wants to update its content). | ||
12 | |||
13 | KSM was originally developed for use with KVM (where it was known as | ||
14 | Kernel Shared Memory), to fit more virtual machines into physical memory, | ||
15 | by sharing the data common between them. But it can be useful to any | ||
16 | application which generates many instances of the same data. | ||
17 | |||
18 | KSM only merges anonymous (private) pages, never pagecache (file) pages. | ||
19 | KSM's merged pages are at present locked into kernel memory for as long | ||
20 | as they are shared: so cannot be swapped out like the user pages they | ||
21 | replace (but swapping KSM pages should follow soon in a later release). | ||
22 | |||
23 | KSM only operates on those areas of address space which an application | ||
24 | has advised to be likely candidates for merging, by using the madvise(2) | ||
25 | system call: int madvise(addr, length, MADV_MERGEABLE). | ||
26 | |||
27 | The app may call int madvise(addr, length, MADV_UNMERGEABLE) to cancel | ||
28 | that advice and restore unshared pages: whereupon KSM unmerges whatever | ||
29 | it merged in that range. Note: this unmerging call may suddenly require | ||
30 | more memory than is available - possibly failing with EAGAIN, but more | ||
31 | probably arousing the Out-Of-Memory killer. | ||
32 | |||
33 | If KSM is not configured into the running kernel, madvise MADV_MERGEABLE | ||
34 | and MADV_UNMERGEABLE simply fail with EINVAL. If the running kernel was | ||
35 | built with CONFIG_KSM=y, those calls will normally succeed: even if the | ||
36 | KSM daemon is not currently running, MADV_MERGEABLE still registers | ||
37 | the range for whenever the KSM daemon is started; even if the range | ||
38 | cannot contain any pages which KSM could actually merge; even if | ||
39 | MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. | ||
40 | |||
41 | Like other madvise calls, they are intended for use on mapped areas of | ||
42 | the user address space: they will report ENOMEM if the specified range | ||
43 | includes unmapped gaps (though working on the intervening mapped areas), | ||
44 | and might fail with EAGAIN if not enough memory for internal structures. | ||
45 | |||
46 | Applications should be considerate in their use of MADV_MERGEABLE, | ||
47 | restricting its use to areas likely to benefit. KSM's scans may use | ||
48 | a lot of processing power, and its kernel-resident pages are a limited | ||
49 | resource. Some installations will disable KSM for these reasons. | ||
50 | |||
51 | The KSM daemon is controlled by sysfs files in /sys/kernel/mm/ksm/, | ||
52 | readable by all but writable only by root: | ||
53 | |||
54 | max_kernel_pages - set to maximum number of kernel pages that KSM may use | ||
55 | e.g. "echo 2000 > /sys/kernel/mm/ksm/max_kernel_pages" | ||
56 | Value 0 imposes no limit on the kernel pages KSM may use; | ||
57 | but note that any process using MADV_MERGEABLE can cause | ||
58 | KSM to allocate these pages, unswappable until it exits. | ||
59 | Default: 2000 (chosen for demonstration purposes) | ||
60 | |||
61 | pages_to_scan - how many present pages to scan before ksmd goes to sleep | ||
62 | e.g. "echo 200 > /sys/kernel/mm/ksm/pages_to_scan" | ||
63 | Default: 200 (chosen for demonstration purposes) | ||
64 | |||
65 | sleep_millisecs - how many milliseconds ksmd should sleep before next scan | ||
66 | e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" | ||
67 | Default: 20 (chosen for demonstration purposes) | ||
68 | |||
69 | run - set 0 to stop ksmd from running but keep merged pages, | ||
70 | set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", | ||
71 | set 2 to stop ksmd and unmerge all pages currently merged, | ||
72 | but leave mergeable areas registered for next run | ||
73 | Default: 1 (for immediate use by apps which register) | ||
74 | |||
75 | The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: | ||
76 | |||
77 | pages_shared - how many shared unswappable kernel pages KSM is using | ||
78 | pages_sharing - how many more sites are sharing them i.e. how much saved | ||
79 | pages_unshared - how many pages unique but repeatedly checked for merging | ||
80 | pages_volatile - how many pages changing too fast to be placed in a tree | ||
81 | full_scans - how many times all mergeable areas have been scanned | ||
82 | |||
83 | A high ratio of pages_sharing to pages_shared indicates good sharing, but | ||
84 | a high ratio of pages_unshared to pages_sharing indicates wasted effort. | ||
85 | pages_volatile embraces several different kinds of activity, but a high | ||
86 | proportion there would also indicate poor use of madvise MADV_MERGEABLE. | ||
87 | |||
88 | Izik Eidus, | ||
89 | Hugh Dickins, 30 July 2009 | ||
diff --git a/Documentation/vm/locking b/Documentation/vm/locking index f366fa956179..25fadb448760 100644 --- a/Documentation/vm/locking +++ b/Documentation/vm/locking | |||
@@ -80,7 +80,7 @@ Note: PTL can also be used to guarantee that no new clones using the | |||
80 | mm start up ... this is a loose form of stability on mm_users. For | 80 | mm start up ... this is a loose form of stability on mm_users. For |
81 | example, it is used in copy_mm to protect against a racing tlb_gather_mmu | 81 | example, it is used in copy_mm to protect against a racing tlb_gather_mmu |
82 | single address space optimization, so that the zap_page_range (from | 82 | single address space optimization, so that the zap_page_range (from |
83 | vmtruncate) does not lose sending ipi's to cloned threads that might | 83 | truncate) does not lose sending ipi's to cloned threads that might |
84 | be spawned underneath it and go to user mode to drag in pte's into tlbs. | 84 | be spawned underneath it and go to user mode to drag in pte's into tlbs. |
85 | 85 | ||
86 | swap_lock | 86 | swap_lock |
diff --git a/Documentation/vm/map_hugetlb.c b/Documentation/vm/map_hugetlb.c new file mode 100644 index 000000000000..e2bdae37f499 --- /dev/null +++ b/Documentation/vm/map_hugetlb.c | |||
@@ -0,0 +1,77 @@ | |||
1 | /* | ||
2 | * Example of using hugepage memory in a user application using the mmap | ||
3 | * system call with MAP_HUGETLB flag. Before running this program make | ||
4 | * sure the administrator has allocated enough default sized huge pages | ||
5 | * to cover the 256 MB allocation. | ||
6 | * | ||
7 | * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. | ||
8 | * That means the addresses starting with 0x800000... will need to be | ||
9 | * specified. Specifying a fixed address is not required on ppc64, i386 | ||
10 | * or x86_64. | ||
11 | */ | ||
12 | #include <stdlib.h> | ||
13 | #include <stdio.h> | ||
14 | #include <unistd.h> | ||
15 | #include <sys/mman.h> | ||
16 | #include <fcntl.h> | ||
17 | |||
18 | #define LENGTH (256UL*1024*1024) | ||
19 | #define PROTECTION (PROT_READ | PROT_WRITE) | ||
20 | |||
21 | #ifndef MAP_HUGETLB | ||
22 | #define MAP_HUGETLB 0x40 | ||
23 | #endif | ||
24 | |||
25 | /* Only ia64 requires this */ | ||
26 | #ifdef __ia64__ | ||
27 | #define ADDR (void *)(0x8000000000000000UL) | ||
28 | #define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) | ||
29 | #else | ||
30 | #define ADDR (void *)(0x0UL) | ||
31 | #define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) | ||
32 | #endif | ||
33 | |||
34 | void check_bytes(char *addr) | ||
35 | { | ||
36 | printf("First hex is %x\n", *((unsigned int *)addr)); | ||
37 | } | ||
38 | |||
39 | void write_bytes(char *addr) | ||
40 | { | ||
41 | unsigned long i; | ||
42 | |||
43 | for (i = 0; i < LENGTH; i++) | ||
44 | *(addr + i) = (char)i; | ||
45 | } | ||
46 | |||
47 | void read_bytes(char *addr) | ||
48 | { | ||
49 | unsigned long i; | ||
50 | |||
51 | check_bytes(addr); | ||
52 | for (i = 0; i < LENGTH; i++) | ||
53 | if (*(addr + i) != (char)i) { | ||
54 | printf("Mismatch at %lu\n", i); | ||
55 | break; | ||
56 | } | ||
57 | } | ||
58 | |||
59 | int main(void) | ||
60 | { | ||
61 | void *addr; | ||
62 | |||
63 | addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); | ||
64 | if (addr == MAP_FAILED) { | ||
65 | perror("mmap"); | ||
66 | exit(1); | ||
67 | } | ||
68 | |||
69 | printf("Returned address is %p\n", addr); | ||
70 | check_bytes(addr); | ||
71 | write_bytes(addr); | ||
72 | read_bytes(addr); | ||
73 | |||
74 | munmap(addr, LENGTH); | ||
75 | |||
76 | return 0; | ||
77 | } | ||
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c index 0833f44ba16b..fa1a30d9e9d5 100644 --- a/Documentation/vm/page-types.c +++ b/Documentation/vm/page-types.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com> | 5 | * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com> |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #define _LARGEFILE64_SOURCE | ||
8 | #include <stdio.h> | 9 | #include <stdio.h> |
9 | #include <stdlib.h> | 10 | #include <stdlib.h> |
10 | #include <unistd.h> | 11 | #include <unistd.h> |
@@ -13,12 +14,33 @@ | |||
13 | #include <string.h> | 14 | #include <string.h> |
14 | #include <getopt.h> | 15 | #include <getopt.h> |
15 | #include <limits.h> | 16 | #include <limits.h> |
17 | #include <assert.h> | ||
16 | #include <sys/types.h> | 18 | #include <sys/types.h> |
17 | #include <sys/errno.h> | 19 | #include <sys/errno.h> |
18 | #include <sys/fcntl.h> | 20 | #include <sys/fcntl.h> |
19 | 21 | ||
20 | 22 | ||
21 | /* | 23 | /* |
24 | * pagemap kernel ABI bits | ||
25 | */ | ||
26 | |||
27 | #define PM_ENTRY_BYTES sizeof(uint64_t) | ||
28 | #define PM_STATUS_BITS 3 | ||
29 | #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) | ||
30 | #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) | ||
31 | #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) | ||
32 | #define PM_PSHIFT_BITS 6 | ||
33 | #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) | ||
34 | #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) | ||
35 | #define PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) | ||
36 | #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) | ||
37 | #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) | ||
38 | |||
39 | #define PM_PRESENT PM_STATUS(4LL) | ||
40 | #define PM_SWAP PM_STATUS(2LL) | ||
41 | |||
42 | |||
43 | /* | ||
22 | * kernel page flags | 44 | * kernel page flags |
23 | */ | 45 | */ |
24 | 46 | ||
@@ -126,6 +148,14 @@ static int nr_addr_ranges; | |||
126 | static unsigned long opt_offset[MAX_ADDR_RANGES]; | 148 | static unsigned long opt_offset[MAX_ADDR_RANGES]; |
127 | static unsigned long opt_size[MAX_ADDR_RANGES]; | 149 | static unsigned long opt_size[MAX_ADDR_RANGES]; |
128 | 150 | ||
151 | #define MAX_VMAS 10240 | ||
152 | static int nr_vmas; | ||
153 | static unsigned long pg_start[MAX_VMAS]; | ||
154 | static unsigned long pg_end[MAX_VMAS]; | ||
155 | static unsigned long voffset; | ||
156 | |||
157 | static int pagemap_fd; | ||
158 | |||
129 | #define MAX_BIT_FILTERS 64 | 159 | #define MAX_BIT_FILTERS 64 |
130 | static int nr_bit_filters; | 160 | static int nr_bit_filters; |
131 | static uint64_t opt_mask[MAX_BIT_FILTERS]; | 161 | static uint64_t opt_mask[MAX_BIT_FILTERS]; |
@@ -135,7 +165,6 @@ static int page_size; | |||
135 | 165 | ||
136 | #define PAGES_BATCH (64 << 10) /* 64k pages */ | 166 | #define PAGES_BATCH (64 << 10) /* 64k pages */ |
137 | static int kpageflags_fd; | 167 | static int kpageflags_fd; |
138 | static uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; | ||
139 | 168 | ||
140 | #define HASH_SHIFT 13 | 169 | #define HASH_SHIFT 13 |
141 | #define HASH_SIZE (1 << HASH_SHIFT) | 170 | #define HASH_SIZE (1 << HASH_SHIFT) |
@@ -158,12 +187,17 @@ static uint64_t page_flags[HASH_SIZE]; | |||
158 | type __min2 = (y); \ | 187 | type __min2 = (y); \ |
159 | __min1 < __min2 ? __min1 : __min2; }) | 188 | __min1 < __min2 ? __min1 : __min2; }) |
160 | 189 | ||
161 | unsigned long pages2mb(unsigned long pages) | 190 | #define max_t(type, x, y) ({ \ |
191 | type __max1 = (x); \ | ||
192 | type __max2 = (y); \ | ||
193 | __max1 > __max2 ? __max1 : __max2; }) | ||
194 | |||
195 | static unsigned long pages2mb(unsigned long pages) | ||
162 | { | 196 | { |
163 | return (pages * page_size) >> 20; | 197 | return (pages * page_size) >> 20; |
164 | } | 198 | } |
165 | 199 | ||
166 | void fatal(const char *x, ...) | 200 | static void fatal(const char *x, ...) |
167 | { | 201 | { |
168 | va_list ap; | 202 | va_list ap; |
169 | 203 | ||
@@ -178,7 +212,7 @@ void fatal(const char *x, ...) | |||
178 | * page flag names | 212 | * page flag names |
179 | */ | 213 | */ |
180 | 214 | ||
181 | char *page_flag_name(uint64_t flags) | 215 | static char *page_flag_name(uint64_t flags) |
182 | { | 216 | { |
183 | static char buf[65]; | 217 | static char buf[65]; |
184 | int present; | 218 | int present; |
@@ -197,7 +231,7 @@ char *page_flag_name(uint64_t flags) | |||
197 | return buf; | 231 | return buf; |
198 | } | 232 | } |
199 | 233 | ||
200 | char *page_flag_longname(uint64_t flags) | 234 | static char *page_flag_longname(uint64_t flags) |
201 | { | 235 | { |
202 | static char buf[1024]; | 236 | static char buf[1024]; |
203 | int i, n; | 237 | int i, n; |
@@ -221,32 +255,40 @@ char *page_flag_longname(uint64_t flags) | |||
221 | * page list and summary | 255 | * page list and summary |
222 | */ | 256 | */ |
223 | 257 | ||
224 | void show_page_range(unsigned long offset, uint64_t flags) | 258 | static void show_page_range(unsigned long offset, uint64_t flags) |
225 | { | 259 | { |
226 | static uint64_t flags0; | 260 | static uint64_t flags0; |
261 | static unsigned long voff; | ||
227 | static unsigned long index; | 262 | static unsigned long index; |
228 | static unsigned long count; | 263 | static unsigned long count; |
229 | 264 | ||
230 | if (flags == flags0 && offset == index + count) { | 265 | if (flags == flags0 && offset == index + count && |
266 | (!opt_pid || voffset == voff + count)) { | ||
231 | count++; | 267 | count++; |
232 | return; | 268 | return; |
233 | } | 269 | } |
234 | 270 | ||
235 | if (count) | 271 | if (count) { |
236 | printf("%lu\t%lu\t%s\n", | 272 | if (opt_pid) |
273 | printf("%lx\t", voff); | ||
274 | printf("%lx\t%lx\t%s\n", | ||
237 | index, count, page_flag_name(flags0)); | 275 | index, count, page_flag_name(flags0)); |
276 | } | ||
238 | 277 | ||
239 | flags0 = flags; | 278 | flags0 = flags; |
240 | index = offset; | 279 | index = offset; |
280 | voff = voffset; | ||
241 | count = 1; | 281 | count = 1; |
242 | } | 282 | } |
243 | 283 | ||
244 | void show_page(unsigned long offset, uint64_t flags) | 284 | static void show_page(unsigned long offset, uint64_t flags) |
245 | { | 285 | { |
246 | printf("%lu\t%s\n", offset, page_flag_name(flags)); | 286 | if (opt_pid) |
287 | printf("%lx\t", voffset); | ||
288 | printf("%lx\t%s\n", offset, page_flag_name(flags)); | ||
247 | } | 289 | } |
248 | 290 | ||
249 | void show_summary(void) | 291 | static void show_summary(void) |
250 | { | 292 | { |
251 | int i; | 293 | int i; |
252 | 294 | ||
@@ -272,7 +314,7 @@ void show_summary(void) | |||
272 | * page flag filters | 314 | * page flag filters |
273 | */ | 315 | */ |
274 | 316 | ||
275 | int bit_mask_ok(uint64_t flags) | 317 | static int bit_mask_ok(uint64_t flags) |
276 | { | 318 | { |
277 | int i; | 319 | int i; |
278 | 320 | ||
@@ -289,7 +331,7 @@ int bit_mask_ok(uint64_t flags) | |||
289 | return 1; | 331 | return 1; |
290 | } | 332 | } |
291 | 333 | ||
292 | uint64_t expand_overloaded_flags(uint64_t flags) | 334 | static uint64_t expand_overloaded_flags(uint64_t flags) |
293 | { | 335 | { |
294 | /* SLOB/SLUB overload several page flags */ | 336 | /* SLOB/SLUB overload several page flags */ |
295 | if (flags & BIT(SLAB)) { | 337 | if (flags & BIT(SLAB)) { |
@@ -308,7 +350,7 @@ uint64_t expand_overloaded_flags(uint64_t flags) | |||
308 | return flags; | 350 | return flags; |
309 | } | 351 | } |
310 | 352 | ||
311 | uint64_t well_known_flags(uint64_t flags) | 353 | static uint64_t well_known_flags(uint64_t flags) |
312 | { | 354 | { |
313 | /* hide flags intended only for kernel hacker */ | 355 | /* hide flags intended only for kernel hacker */ |
314 | flags &= ~KPF_HACKERS_BITS; | 356 | flags &= ~KPF_HACKERS_BITS; |
@@ -325,7 +367,7 @@ uint64_t well_known_flags(uint64_t flags) | |||
325 | * page frame walker | 367 | * page frame walker |
326 | */ | 368 | */ |
327 | 369 | ||
328 | int hash_slot(uint64_t flags) | 370 | static int hash_slot(uint64_t flags) |
329 | { | 371 | { |
330 | int k = HASH_KEY(flags); | 372 | int k = HASH_KEY(flags); |
331 | int i; | 373 | int i; |
@@ -352,7 +394,7 @@ int hash_slot(uint64_t flags) | |||
352 | exit(EXIT_FAILURE); | 394 | exit(EXIT_FAILURE); |
353 | } | 395 | } |
354 | 396 | ||
355 | void add_page(unsigned long offset, uint64_t flags) | 397 | static void add_page(unsigned long offset, uint64_t flags) |
356 | { | 398 | { |
357 | flags = expand_overloaded_flags(flags); | 399 | flags = expand_overloaded_flags(flags); |
358 | 400 | ||
@@ -371,7 +413,7 @@ void add_page(unsigned long offset, uint64_t flags) | |||
371 | total_pages++; | 413 | total_pages++; |
372 | } | 414 | } |
373 | 415 | ||
374 | void walk_pfn(unsigned long index, unsigned long count) | 416 | static void walk_pfn(unsigned long index, unsigned long count) |
375 | { | 417 | { |
376 | unsigned long batch; | 418 | unsigned long batch; |
377 | unsigned long n; | 419 | unsigned long n; |
@@ -383,6 +425,8 @@ void walk_pfn(unsigned long index, unsigned long count) | |||
383 | lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); | 425 | lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); |
384 | 426 | ||
385 | while (count) { | 427 | while (count) { |
428 | uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; | ||
429 | |||
386 | batch = min_t(unsigned long, count, PAGES_BATCH); | 430 | batch = min_t(unsigned long, count, PAGES_BATCH); |
387 | n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); | 431 | n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); |
388 | if (n == 0) | 432 | if (n == 0) |
@@ -404,7 +448,82 @@ void walk_pfn(unsigned long index, unsigned long count) | |||
404 | } | 448 | } |
405 | } | 449 | } |
406 | 450 | ||
407 | void walk_addr_ranges(void) | 451 | |
452 | #define PAGEMAP_BATCH 4096 | ||
453 | static unsigned long task_pfn(unsigned long pgoff) | ||
454 | { | ||
455 | static uint64_t buf[PAGEMAP_BATCH]; | ||
456 | static unsigned long start; | ||
457 | static long count; | ||
458 | uint64_t pfn; | ||
459 | |||
460 | if (pgoff < start || pgoff >= start + count) { | ||
461 | if (lseek64(pagemap_fd, | ||
462 | (uint64_t)pgoff * PM_ENTRY_BYTES, | ||
463 | SEEK_SET) < 0) { | ||
464 | perror("pagemap seek"); | ||
465 | exit(EXIT_FAILURE); | ||
466 | } | ||
467 | count = read(pagemap_fd, buf, sizeof(buf)); | ||
468 | if (count == 0) | ||
469 | return 0; | ||
470 | if (count < 0) { | ||
471 | perror("pagemap read"); | ||
472 | exit(EXIT_FAILURE); | ||
473 | } | ||
474 | if (count % PM_ENTRY_BYTES) { | ||
475 | fatal("pagemap read not aligned.\n"); | ||
476 | exit(EXIT_FAILURE); | ||
477 | } | ||
478 | count /= PM_ENTRY_BYTES; | ||
479 | start = pgoff; | ||
480 | } | ||
481 | |||
482 | pfn = buf[pgoff - start]; | ||
483 | if (pfn & PM_PRESENT) | ||
484 | pfn = PM_PFRAME(pfn); | ||
485 | else | ||
486 | pfn = 0; | ||
487 | |||
488 | return pfn; | ||
489 | } | ||
490 | |||
491 | static void walk_task(unsigned long index, unsigned long count) | ||
492 | { | ||
493 | int i = 0; | ||
494 | const unsigned long end = index + count; | ||
495 | |||
496 | while (index < end) { | ||
497 | |||
498 | while (pg_end[i] <= index) | ||
499 | if (++i >= nr_vmas) | ||
500 | return; | ||
501 | if (pg_start[i] >= end) | ||
502 | return; | ||
503 | |||
504 | voffset = max_t(unsigned long, pg_start[i], index); | ||
505 | index = min_t(unsigned long, pg_end[i], end); | ||
506 | |||
507 | assert(voffset < index); | ||
508 | for (; voffset < index; voffset++) { | ||
509 | unsigned long pfn = task_pfn(voffset); | ||
510 | if (pfn) | ||
511 | walk_pfn(pfn, 1); | ||
512 | } | ||
513 | } | ||
514 | } | ||
515 | |||
516 | static void add_addr_range(unsigned long offset, unsigned long size) | ||
517 | { | ||
518 | if (nr_addr_ranges >= MAX_ADDR_RANGES) | ||
519 | fatal("too many addr ranges\n"); | ||
520 | |||
521 | opt_offset[nr_addr_ranges] = offset; | ||
522 | opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset); | ||
523 | nr_addr_ranges++; | ||
524 | } | ||
525 | |||
526 | static void walk_addr_ranges(void) | ||
408 | { | 527 | { |
409 | int i; | 528 | int i; |
410 | 529 | ||
@@ -415,10 +534,13 @@ void walk_addr_ranges(void) | |||
415 | } | 534 | } |
416 | 535 | ||
417 | if (!nr_addr_ranges) | 536 | if (!nr_addr_ranges) |
418 | walk_pfn(0, ULONG_MAX); | 537 | add_addr_range(0, ULONG_MAX); |
419 | 538 | ||
420 | for (i = 0; i < nr_addr_ranges; i++) | 539 | for (i = 0; i < nr_addr_ranges; i++) |
421 | walk_pfn(opt_offset[i], opt_size[i]); | 540 | if (!opt_pid) |
541 | walk_pfn(opt_offset[i], opt_size[i]); | ||
542 | else | ||
543 | walk_task(opt_offset[i], opt_size[i]); | ||
422 | 544 | ||
423 | close(kpageflags_fd); | 545 | close(kpageflags_fd); |
424 | } | 546 | } |
@@ -428,7 +550,7 @@ void walk_addr_ranges(void) | |||
428 | * user interface | 550 | * user interface |
429 | */ | 551 | */ |
430 | 552 | ||
431 | const char *page_flag_type(uint64_t flag) | 553 | static const char *page_flag_type(uint64_t flag) |
432 | { | 554 | { |
433 | if (flag & KPF_HACKERS_BITS) | 555 | if (flag & KPF_HACKERS_BITS) |
434 | return "(r)"; | 556 | return "(r)"; |
@@ -437,7 +559,7 @@ const char *page_flag_type(uint64_t flag) | |||
437 | return " "; | 559 | return " "; |
438 | } | 560 | } |
439 | 561 | ||
440 | void usage(void) | 562 | static void usage(void) |
441 | { | 563 | { |
442 | int i, j; | 564 | int i, j; |
443 | 565 | ||
@@ -446,8 +568,8 @@ void usage(void) | |||
446 | " -r|--raw Raw mode, for kernel developers\n" | 568 | " -r|--raw Raw mode, for kernel developers\n" |
447 | " -a|--addr addr-spec Walk a range of pages\n" | 569 | " -a|--addr addr-spec Walk a range of pages\n" |
448 | " -b|--bits bits-spec Walk pages with specified bits\n" | 570 | " -b|--bits bits-spec Walk pages with specified bits\n" |
449 | #if 0 /* planned features */ | ||
450 | " -p|--pid pid Walk process address space\n" | 571 | " -p|--pid pid Walk process address space\n" |
572 | #if 0 /* planned features */ | ||
451 | " -f|--file filename Walk file address space\n" | 573 | " -f|--file filename Walk file address space\n" |
452 | #endif | 574 | #endif |
453 | " -l|--list Show page details in ranges\n" | 575 | " -l|--list Show page details in ranges\n" |
@@ -459,7 +581,7 @@ void usage(void) | |||
459 | " N+M pages range from N to N+M-1\n" | 581 | " N+M pages range from N to N+M-1\n" |
460 | " N,M pages range from N to M-1\n" | 582 | " N,M pages range from N to M-1\n" |
461 | " N, pages range from N to end\n" | 583 | " N, pages range from N to end\n" |
462 | " ,M pages range from 0 to M\n" | 584 | " ,M pages range from 0 to M-1\n" |
463 | "bits-spec:\n" | 585 | "bits-spec:\n" |
464 | " bit1,bit2 (flags & (bit1|bit2)) != 0\n" | 586 | " bit1,bit2 (flags & (bit1|bit2)) != 0\n" |
465 | " bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" | 587 | " bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" |
@@ -482,7 +604,7 @@ void usage(void) | |||
482 | "(r) raw mode bits (o) overloaded bits\n"); | 604 | "(r) raw mode bits (o) overloaded bits\n"); |
483 | } | 605 | } |
484 | 606 | ||
485 | unsigned long long parse_number(const char *str) | 607 | static unsigned long long parse_number(const char *str) |
486 | { | 608 | { |
487 | unsigned long long n; | 609 | unsigned long long n; |
488 | 610 | ||
@@ -494,26 +616,62 @@ unsigned long long parse_number(const char *str) | |||
494 | return n; | 616 | return n; |
495 | } | 617 | } |
496 | 618 | ||
497 | void parse_pid(const char *str) | 619 | static void parse_pid(const char *str) |
498 | { | 620 | { |
621 | FILE *file; | ||
622 | char buf[5000]; | ||
623 | |||
499 | opt_pid = parse_number(str); | 624 | opt_pid = parse_number(str); |
500 | } | ||
501 | 625 | ||
502 | void parse_file(const char *name) | 626 | sprintf(buf, "/proc/%d/pagemap", opt_pid); |
503 | { | 627 | pagemap_fd = open(buf, O_RDONLY); |
628 | if (pagemap_fd < 0) { | ||
629 | perror(buf); | ||
630 | exit(EXIT_FAILURE); | ||
631 | } | ||
632 | |||
633 | sprintf(buf, "/proc/%d/maps", opt_pid); | ||
634 | file = fopen(buf, "r"); | ||
635 | if (!file) { | ||
636 | perror(buf); | ||
637 | exit(EXIT_FAILURE); | ||
638 | } | ||
639 | |||
640 | while (fgets(buf, sizeof(buf), file) != NULL) { | ||
641 | unsigned long vm_start; | ||
642 | unsigned long vm_end; | ||
643 | unsigned long long pgoff; | ||
644 | int major, minor; | ||
645 | char r, w, x, s; | ||
646 | unsigned long ino; | ||
647 | int n; | ||
648 | |||
649 | n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu", | ||
650 | &vm_start, | ||
651 | &vm_end, | ||
652 | &r, &w, &x, &s, | ||
653 | &pgoff, | ||
654 | &major, &minor, | ||
655 | &ino); | ||
656 | if (n < 10) { | ||
657 | fprintf(stderr, "unexpected line: %s\n", buf); | ||
658 | continue; | ||
659 | } | ||
660 | pg_start[nr_vmas] = vm_start / page_size; | ||
661 | pg_end[nr_vmas] = vm_end / page_size; | ||
662 | if (++nr_vmas >= MAX_VMAS) { | ||
663 | fprintf(stderr, "too many VMAs\n"); | ||
664 | break; | ||
665 | } | ||
666 | } | ||
667 | fclose(file); | ||
504 | } | 668 | } |
505 | 669 | ||
506 | void add_addr_range(unsigned long offset, unsigned long size) | 670 | static void parse_file(const char *name) |
507 | { | 671 | { |
508 | if (nr_addr_ranges >= MAX_ADDR_RANGES) | ||
509 | fatal("too much addr ranges\n"); | ||
510 | |||
511 | opt_offset[nr_addr_ranges] = offset; | ||
512 | opt_size[nr_addr_ranges] = size; | ||
513 | nr_addr_ranges++; | ||
514 | } | 672 | } |
515 | 673 | ||
516 | void parse_addr_range(const char *optarg) | 674 | static void parse_addr_range(const char *optarg) |
517 | { | 675 | { |
518 | unsigned long offset; | 676 | unsigned long offset; |
519 | unsigned long size; | 677 | unsigned long size; |
@@ -547,7 +705,7 @@ void parse_addr_range(const char *optarg) | |||
547 | add_addr_range(offset, size); | 705 | add_addr_range(offset, size); |
548 | } | 706 | } |
549 | 707 | ||
550 | void add_bits_filter(uint64_t mask, uint64_t bits) | 708 | static void add_bits_filter(uint64_t mask, uint64_t bits) |
551 | { | 709 | { |
552 | if (nr_bit_filters >= MAX_BIT_FILTERS) | 710 | if (nr_bit_filters >= MAX_BIT_FILTERS) |
553 | fatal("too much bit filters\n"); | 711 | fatal("too much bit filters\n"); |
@@ -557,7 +715,7 @@ void add_bits_filter(uint64_t mask, uint64_t bits) | |||
557 | nr_bit_filters++; | 715 | nr_bit_filters++; |
558 | } | 716 | } |
559 | 717 | ||
560 | uint64_t parse_flag_name(const char *str, int len) | 718 | static uint64_t parse_flag_name(const char *str, int len) |
561 | { | 719 | { |
562 | int i; | 720 | int i; |
563 | 721 | ||
@@ -577,7 +735,7 @@ uint64_t parse_flag_name(const char *str, int len) | |||
577 | return parse_number(str); | 735 | return parse_number(str); |
578 | } | 736 | } |
579 | 737 | ||
580 | uint64_t parse_flag_names(const char *str, int all) | 738 | static uint64_t parse_flag_names(const char *str, int all) |
581 | { | 739 | { |
582 | const char *p = str; | 740 | const char *p = str; |
583 | uint64_t flags = 0; | 741 | uint64_t flags = 0; |
@@ -596,7 +754,7 @@ uint64_t parse_flag_names(const char *str, int all) | |||
596 | return flags; | 754 | return flags; |
597 | } | 755 | } |
598 | 756 | ||
599 | void parse_bits_mask(const char *optarg) | 757 | static void parse_bits_mask(const char *optarg) |
600 | { | 758 | { |
601 | uint64_t mask; | 759 | uint64_t mask; |
602 | uint64_t bits; | 760 | uint64_t bits; |
@@ -621,7 +779,7 @@ void parse_bits_mask(const char *optarg) | |||
621 | } | 779 | } |
622 | 780 | ||
623 | 781 | ||
624 | struct option opts[] = { | 782 | static struct option opts[] = { |
625 | { "raw" , 0, NULL, 'r' }, | 783 | { "raw" , 0, NULL, 'r' }, |
626 | { "pid" , 1, NULL, 'p' }, | 784 | { "pid" , 1, NULL, 'p' }, |
627 | { "file" , 1, NULL, 'f' }, | 785 | { "file" , 1, NULL, 'f' }, |
@@ -676,8 +834,10 @@ int main(int argc, char *argv[]) | |||
676 | } | 834 | } |
677 | } | 835 | } |
678 | 836 | ||
837 | if (opt_list && opt_pid) | ||
838 | printf("voffset\t"); | ||
679 | if (opt_list == 1) | 839 | if (opt_list == 1) |
680 | printf("offset\tcount\tflags\n"); | 840 | printf("offset\tlen\tflags\n"); |
681 | if (opt_list == 2) | 841 | if (opt_list == 2) |
682 | printf("offset\tflags\n"); | 842 | printf("offset\tflags\n"); |
683 | 843 | ||
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c index df3227605d59..92e729f4b676 100644 --- a/Documentation/vm/slabinfo.c +++ b/Documentation/vm/slabinfo.c | |||
@@ -87,7 +87,7 @@ int page_size; | |||
87 | 87 | ||
88 | regex_t pattern; | 88 | regex_t pattern; |
89 | 89 | ||
90 | void fatal(const char *x, ...) | 90 | static void fatal(const char *x, ...) |
91 | { | 91 | { |
92 | va_list ap; | 92 | va_list ap; |
93 | 93 | ||
@@ -97,7 +97,7 @@ void fatal(const char *x, ...) | |||
97 | exit(EXIT_FAILURE); | 97 | exit(EXIT_FAILURE); |
98 | } | 98 | } |
99 | 99 | ||
100 | void usage(void) | 100 | static void usage(void) |
101 | { | 101 | { |
102 | printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n" | 102 | printf("slabinfo 5/7/2007. (c) 2007 sgi.\n\n" |
103 | "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" | 103 | "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" |
@@ -131,7 +131,7 @@ void usage(void) | |||
131 | ); | 131 | ); |
132 | } | 132 | } |
133 | 133 | ||
134 | unsigned long read_obj(const char *name) | 134 | static unsigned long read_obj(const char *name) |
135 | { | 135 | { |
136 | FILE *f = fopen(name, "r"); | 136 | FILE *f = fopen(name, "r"); |
137 | 137 | ||
@@ -151,7 +151,7 @@ unsigned long read_obj(const char *name) | |||
151 | /* | 151 | /* |
152 | * Get the contents of an attribute | 152 | * Get the contents of an attribute |
153 | */ | 153 | */ |
154 | unsigned long get_obj(const char *name) | 154 | static unsigned long get_obj(const char *name) |
155 | { | 155 | { |
156 | if (!read_obj(name)) | 156 | if (!read_obj(name)) |
157 | return 0; | 157 | return 0; |
@@ -159,7 +159,7 @@ unsigned long get_obj(const char *name) | |||
159 | return atol(buffer); | 159 | return atol(buffer); |
160 | } | 160 | } |
161 | 161 | ||
162 | unsigned long get_obj_and_str(const char *name, char **x) | 162 | static unsigned long get_obj_and_str(const char *name, char **x) |
163 | { | 163 | { |
164 | unsigned long result = 0; | 164 | unsigned long result = 0; |
165 | char *p; | 165 | char *p; |
@@ -178,7 +178,7 @@ unsigned long get_obj_and_str(const char *name, char **x) | |||
178 | return result; | 178 | return result; |
179 | } | 179 | } |
180 | 180 | ||
181 | void set_obj(struct slabinfo *s, const char *name, int n) | 181 | static void set_obj(struct slabinfo *s, const char *name, int n) |
182 | { | 182 | { |
183 | char x[100]; | 183 | char x[100]; |
184 | FILE *f; | 184 | FILE *f; |
@@ -192,7 +192,7 @@ void set_obj(struct slabinfo *s, const char *name, int n) | |||
192 | fclose(f); | 192 | fclose(f); |
193 | } | 193 | } |
194 | 194 | ||
195 | unsigned long read_slab_obj(struct slabinfo *s, const char *name) | 195 | static unsigned long read_slab_obj(struct slabinfo *s, const char *name) |
196 | { | 196 | { |
197 | char x[100]; | 197 | char x[100]; |
198 | FILE *f; | 198 | FILE *f; |
@@ -215,7 +215,7 @@ unsigned long read_slab_obj(struct slabinfo *s, const char *name) | |||
215 | /* | 215 | /* |
216 | * Put a size string together | 216 | * Put a size string together |
217 | */ | 217 | */ |
218 | int store_size(char *buffer, unsigned long value) | 218 | static int store_size(char *buffer, unsigned long value) |
219 | { | 219 | { |
220 | unsigned long divisor = 1; | 220 | unsigned long divisor = 1; |
221 | char trailer = 0; | 221 | char trailer = 0; |
@@ -247,7 +247,7 @@ int store_size(char *buffer, unsigned long value) | |||
247 | return n; | 247 | return n; |
248 | } | 248 | } |
249 | 249 | ||
250 | void decode_numa_list(int *numa, char *t) | 250 | static void decode_numa_list(int *numa, char *t) |
251 | { | 251 | { |
252 | int node; | 252 | int node; |
253 | int nr; | 253 | int nr; |
@@ -272,7 +272,7 @@ void decode_numa_list(int *numa, char *t) | |||
272 | } | 272 | } |
273 | } | 273 | } |
274 | 274 | ||
275 | void slab_validate(struct slabinfo *s) | 275 | static void slab_validate(struct slabinfo *s) |
276 | { | 276 | { |
277 | if (strcmp(s->name, "*") == 0) | 277 | if (strcmp(s->name, "*") == 0) |
278 | return; | 278 | return; |
@@ -280,7 +280,7 @@ void slab_validate(struct slabinfo *s) | |||
280 | set_obj(s, "validate", 1); | 280 | set_obj(s, "validate", 1); |
281 | } | 281 | } |
282 | 282 | ||
283 | void slab_shrink(struct slabinfo *s) | 283 | static void slab_shrink(struct slabinfo *s) |
284 | { | 284 | { |
285 | if (strcmp(s->name, "*") == 0) | 285 | if (strcmp(s->name, "*") == 0) |
286 | return; | 286 | return; |
@@ -290,7 +290,7 @@ void slab_shrink(struct slabinfo *s) | |||
290 | 290 | ||
291 | int line = 0; | 291 | int line = 0; |
292 | 292 | ||
293 | void first_line(void) | 293 | static void first_line(void) |
294 | { | 294 | { |
295 | if (show_activity) | 295 | if (show_activity) |
296 | printf("Name Objects Alloc Free %%Fast Fallb O\n"); | 296 | printf("Name Objects Alloc Free %%Fast Fallb O\n"); |
@@ -302,7 +302,7 @@ void first_line(void) | |||
302 | /* | 302 | /* |
303 | * Find the shortest alias of a slab | 303 | * Find the shortest alias of a slab |
304 | */ | 304 | */ |
305 | struct aliasinfo *find_one_alias(struct slabinfo *find) | 305 | static struct aliasinfo *find_one_alias(struct slabinfo *find) |
306 | { | 306 | { |
307 | struct aliasinfo *a; | 307 | struct aliasinfo *a; |
308 | struct aliasinfo *best = NULL; | 308 | struct aliasinfo *best = NULL; |
@@ -318,18 +318,18 @@ struct aliasinfo *find_one_alias(struct slabinfo *find) | |||
318 | return best; | 318 | return best; |
319 | } | 319 | } |
320 | 320 | ||
321 | unsigned long slab_size(struct slabinfo *s) | 321 | static unsigned long slab_size(struct slabinfo *s) |
322 | { | 322 | { |
323 | return s->slabs * (page_size << s->order); | 323 | return s->slabs * (page_size << s->order); |
324 | } | 324 | } |
325 | 325 | ||
326 | unsigned long slab_activity(struct slabinfo *s) | 326 | static unsigned long slab_activity(struct slabinfo *s) |
327 | { | 327 | { |
328 | return s->alloc_fastpath + s->free_fastpath + | 328 | return s->alloc_fastpath + s->free_fastpath + |
329 | s->alloc_slowpath + s->free_slowpath; | 329 | s->alloc_slowpath + s->free_slowpath; |
330 | } | 330 | } |
331 | 331 | ||
332 | void slab_numa(struct slabinfo *s, int mode) | 332 | static void slab_numa(struct slabinfo *s, int mode) |
333 | { | 333 | { |
334 | int node; | 334 | int node; |
335 | 335 | ||
@@ -374,7 +374,7 @@ void slab_numa(struct slabinfo *s, int mode) | |||
374 | line++; | 374 | line++; |
375 | } | 375 | } |
376 | 376 | ||
377 | void show_tracking(struct slabinfo *s) | 377 | static void show_tracking(struct slabinfo *s) |
378 | { | 378 | { |
379 | printf("\n%s: Kernel object allocation\n", s->name); | 379 | printf("\n%s: Kernel object allocation\n", s->name); |
380 | printf("-----------------------------------------------------------------------\n"); | 380 | printf("-----------------------------------------------------------------------\n"); |
@@ -392,7 +392,7 @@ void show_tracking(struct slabinfo *s) | |||
392 | 392 | ||
393 | } | 393 | } |
394 | 394 | ||
395 | void ops(struct slabinfo *s) | 395 | static void ops(struct slabinfo *s) |
396 | { | 396 | { |
397 | if (strcmp(s->name, "*") == 0) | 397 | if (strcmp(s->name, "*") == 0) |
398 | return; | 398 | return; |
@@ -405,14 +405,14 @@ void ops(struct slabinfo *s) | |||
405 | printf("\n%s has no kmem_cache operations\n", s->name); | 405 | printf("\n%s has no kmem_cache operations\n", s->name); |
406 | } | 406 | } |
407 | 407 | ||
408 | const char *onoff(int x) | 408 | static const char *onoff(int x) |
409 | { | 409 | { |
410 | if (x) | 410 | if (x) |
411 | return "On "; | 411 | return "On "; |
412 | return "Off"; | 412 | return "Off"; |
413 | } | 413 | } |
414 | 414 | ||
415 | void slab_stats(struct slabinfo *s) | 415 | static void slab_stats(struct slabinfo *s) |
416 | { | 416 | { |
417 | unsigned long total_alloc; | 417 | unsigned long total_alloc; |
418 | unsigned long total_free; | 418 | unsigned long total_free; |
@@ -477,7 +477,7 @@ void slab_stats(struct slabinfo *s) | |||
477 | s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total); | 477 | s->deactivate_to_tail, (s->deactivate_to_tail * 100) / total); |
478 | } | 478 | } |
479 | 479 | ||
480 | void report(struct slabinfo *s) | 480 | static void report(struct slabinfo *s) |
481 | { | 481 | { |
482 | if (strcmp(s->name, "*") == 0) | 482 | if (strcmp(s->name, "*") == 0) |
483 | return; | 483 | return; |
@@ -518,7 +518,7 @@ void report(struct slabinfo *s) | |||
518 | slab_stats(s); | 518 | slab_stats(s); |
519 | } | 519 | } |
520 | 520 | ||
521 | void slabcache(struct slabinfo *s) | 521 | static void slabcache(struct slabinfo *s) |
522 | { | 522 | { |
523 | char size_str[20]; | 523 | char size_str[20]; |
524 | char dist_str[40]; | 524 | char dist_str[40]; |
@@ -593,7 +593,7 @@ void slabcache(struct slabinfo *s) | |||
593 | /* | 593 | /* |
594 | * Analyze debug options. Return false if something is amiss. | 594 | * Analyze debug options. Return false if something is amiss. |
595 | */ | 595 | */ |
596 | int debug_opt_scan(char *opt) | 596 | static int debug_opt_scan(char *opt) |
597 | { | 597 | { |
598 | if (!opt || !opt[0] || strcmp(opt, "-") == 0) | 598 | if (!opt || !opt[0] || strcmp(opt, "-") == 0) |
599 | return 1; | 599 | return 1; |
@@ -642,7 +642,7 @@ int debug_opt_scan(char *opt) | |||
642 | return 1; | 642 | return 1; |
643 | } | 643 | } |
644 | 644 | ||
645 | int slab_empty(struct slabinfo *s) | 645 | static int slab_empty(struct slabinfo *s) |
646 | { | 646 | { |
647 | if (s->objects > 0) | 647 | if (s->objects > 0) |
648 | return 0; | 648 | return 0; |
@@ -657,7 +657,7 @@ int slab_empty(struct slabinfo *s) | |||
657 | return 1; | 657 | return 1; |
658 | } | 658 | } |
659 | 659 | ||
660 | void slab_debug(struct slabinfo *s) | 660 | static void slab_debug(struct slabinfo *s) |
661 | { | 661 | { |
662 | if (strcmp(s->name, "*") == 0) | 662 | if (strcmp(s->name, "*") == 0) |
663 | return; | 663 | return; |
@@ -717,7 +717,7 @@ void slab_debug(struct slabinfo *s) | |||
717 | set_obj(s, "trace", 1); | 717 | set_obj(s, "trace", 1); |
718 | } | 718 | } |
719 | 719 | ||
720 | void totals(void) | 720 | static void totals(void) |
721 | { | 721 | { |
722 | struct slabinfo *s; | 722 | struct slabinfo *s; |
723 | 723 | ||
@@ -976,7 +976,7 @@ void totals(void) | |||
976 | b1, b2, b3); | 976 | b1, b2, b3); |
977 | } | 977 | } |
978 | 978 | ||
979 | void sort_slabs(void) | 979 | static void sort_slabs(void) |
980 | { | 980 | { |
981 | struct slabinfo *s1,*s2; | 981 | struct slabinfo *s1,*s2; |
982 | 982 | ||
@@ -1005,7 +1005,7 @@ void sort_slabs(void) | |||
1005 | } | 1005 | } |
1006 | } | 1006 | } |
1007 | 1007 | ||
1008 | void sort_aliases(void) | 1008 | static void sort_aliases(void) |
1009 | { | 1009 | { |
1010 | struct aliasinfo *a1,*a2; | 1010 | struct aliasinfo *a1,*a2; |
1011 | 1011 | ||
@@ -1030,7 +1030,7 @@ void sort_aliases(void) | |||
1030 | } | 1030 | } |
1031 | } | 1031 | } |
1032 | 1032 | ||
1033 | void link_slabs(void) | 1033 | static void link_slabs(void) |
1034 | { | 1034 | { |
1035 | struct aliasinfo *a; | 1035 | struct aliasinfo *a; |
1036 | struct slabinfo *s; | 1036 | struct slabinfo *s; |
@@ -1048,7 +1048,7 @@ void link_slabs(void) | |||
1048 | } | 1048 | } |
1049 | } | 1049 | } |
1050 | 1050 | ||
1051 | void alias(void) | 1051 | static void alias(void) |
1052 | { | 1052 | { |
1053 | struct aliasinfo *a; | 1053 | struct aliasinfo *a; |
1054 | char *active = NULL; | 1054 | char *active = NULL; |
@@ -1079,7 +1079,7 @@ void alias(void) | |||
1079 | } | 1079 | } |
1080 | 1080 | ||
1081 | 1081 | ||
1082 | void rename_slabs(void) | 1082 | static void rename_slabs(void) |
1083 | { | 1083 | { |
1084 | struct slabinfo *s; | 1084 | struct slabinfo *s; |
1085 | struct aliasinfo *a; | 1085 | struct aliasinfo *a; |
@@ -1102,12 +1102,12 @@ void rename_slabs(void) | |||
1102 | } | 1102 | } |
1103 | } | 1103 | } |
1104 | 1104 | ||
1105 | int slab_mismatch(char *slab) | 1105 | static int slab_mismatch(char *slab) |
1106 | { | 1106 | { |
1107 | return regexec(&pattern, slab, 0, NULL, 0); | 1107 | return regexec(&pattern, slab, 0, NULL, 0); |
1108 | } | 1108 | } |
1109 | 1109 | ||
1110 | void read_slab_dir(void) | 1110 | static void read_slab_dir(void) |
1111 | { | 1111 | { |
1112 | DIR *dir; | 1112 | DIR *dir; |
1113 | struct dirent *de; | 1113 | struct dirent *de; |
@@ -1209,7 +1209,7 @@ void read_slab_dir(void) | |||
1209 | fatal("Too many aliases\n"); | 1209 | fatal("Too many aliases\n"); |
1210 | } | 1210 | } |
1211 | 1211 | ||
1212 | void output_slabs(void) | 1212 | static void output_slabs(void) |
1213 | { | 1213 | { |
1214 | struct slabinfo *slab; | 1214 | struct slabinfo *slab; |
1215 | 1215 | ||