diff options
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/accounting/getdelays.c | 3 | ||||
-rw-r--r-- | Documentation/atomic_ops.txt | 4 | ||||
-rw-r--r-- | Documentation/fb/vesafb.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 15 | ||||
-rw-r--r-- | Documentation/kernel-parameters.txt | 4 | ||||
-rw-r--r-- | Documentation/sysctl/vm.txt | 23 | ||||
-rw-r--r-- | Documentation/vm/Makefile | 2 | ||||
-rw-r--r-- | Documentation/vm/balance | 18 | ||||
-rw-r--r-- | Documentation/vm/page-types.c | 698 | ||||
-rw-r--r-- | Documentation/vm/pagemap.txt | 68 |
10 files changed, 806 insertions, 31 deletions
diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index 7ea231172c85..aa73e72fd793 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c | |||
@@ -246,7 +246,8 @@ void print_ioacct(struct taskstats *t) | |||
246 | 246 | ||
247 | int main(int argc, char *argv[]) | 247 | int main(int argc, char *argv[]) |
248 | { | 248 | { |
249 | int c, rc, rep_len, aggr_len, len2, cmd_type; | 249 | int c, rc, rep_len, aggr_len, len2; |
250 | int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC; | ||
250 | __u16 id; | 251 | __u16 id; |
251 | __u32 mypid; | 252 | __u32 mypid; |
252 | 253 | ||
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt index 4ef245010457..396bec3b74ed 100644 --- a/Documentation/atomic_ops.txt +++ b/Documentation/atomic_ops.txt | |||
@@ -229,10 +229,10 @@ kernel. It is the use of atomic counters to implement reference | |||
229 | counting, and it works such that once the counter falls to zero it can | 229 | counting, and it works such that once the counter falls to zero it can |
230 | be guaranteed that no other entity can be accessing the object: | 230 | be guaranteed that no other entity can be accessing the object: |
231 | 231 | ||
232 | static void obj_list_add(struct obj *obj) | 232 | static void obj_list_add(struct obj *obj, struct list_head *head) |
233 | { | 233 | { |
234 | obj->active = 1; | 234 | obj->active = 1; |
235 | list_add(&obj->list); | 235 | list_add(&obj->list, head); |
236 | } | 236 | } |
237 | 237 | ||
238 | static void obj_list_del(struct obj *obj) | 238 | static void obj_list_del(struct obj *obj) |
diff --git a/Documentation/fb/vesafb.txt b/Documentation/fb/vesafb.txt index ee277dd204b0..950d5a658cb3 100644 --- a/Documentation/fb/vesafb.txt +++ b/Documentation/fb/vesafb.txt | |||
@@ -95,7 +95,7 @@ There is no way to change the vesafb video mode and/or timings after | |||
95 | booting linux. If you are not happy with the 60 Hz refresh rate, you | 95 | booting linux. If you are not happy with the 60 Hz refresh rate, you |
96 | have these options: | 96 | have these options: |
97 | 97 | ||
98 | * configure and load the DOS-Tools for your the graphics board (if | 98 | * configure and load the DOS-Tools for the graphics board (if |
99 | available) and boot linux with loadlin. | 99 | available) and boot linux with loadlin. |
100 | * use a native driver (matroxfb/atyfb) instead if vesafb. If none | 100 | * use a native driver (matroxfb/atyfb) instead if vesafb. If none |
101 | is available, write a new one! | 101 | is available, write a new one! |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cd8717a36271..ebff3c10a07f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS | |||
1003 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score | 1003 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score |
1004 | ------------------------------------------------------ | 1004 | ------------------------------------------------------ |
1005 | 1005 | ||
1006 | This file can be used to adjust the score used to select which processes | 1006 | This file can be used to adjust the score used to select which processes should |
1007 | should be killed in an out-of-memory situation. Giving it a high score will | 1007 | be killed in an out-of-memory situation. The oom_adj value is a characteristic |
1008 | increase the likelihood of this process being killed by the oom-killer. Valid | 1008 | of the task's mm, so all threads that share an mm with pid will have the same |
1009 | values are in the range -16 to +15, plus the special value -17, which disables | 1009 | oom_adj value. A high value will increase the likelihood of this process being |
1010 | oom-killing altogether for this process. | 1010 | killed by the oom-killer. Valid values are in the range -16 to +15 as |
1011 | explained below and a special value of -17, which disables oom-killing | ||
1012 | altogether for threads sharing pid's mm. | ||
1011 | 1013 | ||
1012 | The process to be killed in an out-of-memory situation is selected among all others | 1014 | The process to be killed in an out-of-memory situation is selected among all others |
1013 | based on its badness score. This value equals the original memory size of the process | 1015 | based on its badness score. This value equals the original memory size of the process |
@@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers | |||
1021 | are the prime candidates to be killed. Having only one 'hungry' child will make | 1023 | are the prime candidates to be killed. Having only one 'hungry' child will make |
1022 | parent less preferable than the child. | 1024 | parent less preferable than the child. |
1023 | 1025 | ||
1026 | /proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from | ||
1027 | oom-killing already. | ||
1028 | |||
1024 | /proc/<pid>/oom_score shows process' current badness score. | 1029 | /proc/<pid>/oom_score shows process' current badness score. |
1025 | 1030 | ||
1026 | The following heuristics are then applied: | 1031 | The following heuristics are then applied: |
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ad3800630772..5578248c18a4 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -546,6 +546,10 @@ and is between 256 and 4096 characters. It is defined in the file | |||
546 | console=brl,ttyS0 | 546 | console=brl,ttyS0 |
547 | For now, only VisioBraille is supported. | 547 | For now, only VisioBraille is supported. |
548 | 548 | ||
549 | consoleblank= [KNL] The console blank (screen saver) timeout in | ||
550 | seconds. Defaults to 10*60 = 10mins. A value of 0 | ||
551 | disables the blank timer. | ||
552 | |||
549 | coredump_filter= | 553 | coredump_filter= |
550 | [KNL] Change the default value for | 554 | [KNL] Change the default value for |
551 | /proc/<pid>/coredump_filter. | 555 | /proc/<pid>/coredump_filter. |
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6fab2dcbb4d3..c4de6359d440 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -233,8 +233,8 @@ These protections are added to score to judge whether this zone should be used | |||
233 | for page allocation or should be reclaimed. | 233 | for page allocation or should be reclaimed. |
234 | 234 | ||
235 | In this example, if normal pages (index=2) are required to this DMA zone and | 235 | In this example, if normal pages (index=2) are required to this DMA zone and |
236 | pages_high is used for watermark, the kernel judges this zone should not be | 236 | watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should |
237 | used because pages_free(1355) is smaller than watermark + protection[2] | 237 | not be used because pages_free(1355) is smaller than watermark + protection[2] |
238 | (4 + 2004 = 2008). If this protection value is 0, this zone would be used for | 238 | (4 + 2004 = 2008). If this protection value is 0, this zone would be used for |
239 | normal page requirement. If requirement is DMA zone(index=0), protection[0] | 239 | normal page requirement. If requirement is DMA zone(index=0), protection[0] |
240 | (=0) is used. | 240 | (=0) is used. |
@@ -280,9 +280,10 @@ The default value is 65536. | |||
280 | min_free_kbytes: | 280 | min_free_kbytes: |
281 | 281 | ||
282 | This is used to force the Linux VM to keep a minimum number | 282 | This is used to force the Linux VM to keep a minimum number |
283 | of kilobytes free. The VM uses this number to compute a pages_min | 283 | of kilobytes free. The VM uses this number to compute a |
284 | value for each lowmem zone in the system. Each lowmem zone gets | 284 | watermark[WMARK_MIN] value for each lowmem zone in the system. |
285 | a number of reserved free pages based proportionally on its size. | 285 | Each lowmem zone gets a number of reserved free pages based |
286 | proportionally on its size. | ||
286 | 287 | ||
287 | Some minimal amount of memory is needed to satisfy PF_MEMALLOC | 288 | Some minimal amount of memory is needed to satisfy PF_MEMALLOC |
288 | allocations; if you set this to lower than 1024KB, your system will | 289 | allocations; if you set this to lower than 1024KB, your system will |
@@ -314,10 +315,14 @@ min_unmapped_ratio: | |||
314 | 315 | ||
315 | This is available only on NUMA kernels. | 316 | This is available only on NUMA kernels. |
316 | 317 | ||
317 | A percentage of the total pages in each zone. Zone reclaim will only | 318 | This is a percentage of the total pages in each zone. Zone reclaim will |
318 | occur if more than this percentage of pages are file backed and unmapped. | 319 | only occur if more than this percentage of pages are in a state that |
319 | This is to insure that a minimal amount of local pages is still available for | 320 | zone_reclaim_mode allows to be reclaimed. |
320 | file I/O even if the node is overallocated. | 321 | |
322 | If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared | ||
323 | against all file-backed unmapped pages including swapcache pages and tmpfs | ||
324 | files. Otherwise, only unmapped pages backed by normal files but not tmpfs | ||
325 | files and similar are considered. | ||
321 | 326 | ||
322 | The default is 1 percent. | 327 | The default is 1 percent. |
323 | 328 | ||
diff --git a/Documentation/vm/Makefile b/Documentation/vm/Makefile index 6f562f778b28..27479d43a9b0 100644 --- a/Documentation/vm/Makefile +++ b/Documentation/vm/Makefile | |||
@@ -2,7 +2,7 @@ | |||
2 | obj- := dummy.o | 2 | obj- := dummy.o |
3 | 3 | ||
4 | # List of programs to build | 4 | # List of programs to build |
5 | hostprogs-y := slabinfo | 5 | hostprogs-y := slabinfo slqbinfo page-types |
6 | 6 | ||
7 | # Tell kbuild to always build the programs | 7 | # Tell kbuild to always build the programs |
8 | always := $(hostprogs-y) | 8 | always := $(hostprogs-y) |
diff --git a/Documentation/vm/balance b/Documentation/vm/balance index bd3d31bc4915..c46e68cf9344 100644 --- a/Documentation/vm/balance +++ b/Documentation/vm/balance | |||
@@ -75,15 +75,15 @@ Page stealing from process memory and shm is done if stealing the page would | |||
75 | alleviate memory pressure on any zone in the page's node that has fallen below | 75 | alleviate memory pressure on any zone in the page's node that has fallen below |
76 | its watermark. | 76 | its watermark. |
77 | 77 | ||
78 | pages_min/pages_low/pages_high/low_on_memory/zone_wake_kswapd: These are | 78 | watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These |
79 | per-zone fields, used to determine when a zone needs to be balanced. When | 79 | are per-zone fields, used to determine when a zone needs to be balanced. When |
80 | the number of pages falls below pages_min, the hysteric field low_on_memory | 80 | the number of pages falls below watermark[WMARK_MIN], the hysteric field |
81 | gets set. This stays set till the number of free pages becomes pages_high. | 81 | low_on_memory gets set. This stays set till the number of free pages becomes |
82 | When low_on_memory is set, page allocation requests will try to free some | 82 | watermark[WMARK_HIGH]. When low_on_memory is set, page allocation requests will |
83 | pages in the zone (providing GFP_WAIT is set in the request). Orthogonal | 83 | try to free some pages in the zone (providing GFP_WAIT is set in the request). |
84 | to this, is the decision to poke kswapd to free some zone pages. That | 84 | Orthogonal to this, is the decision to poke kswapd to free some zone pages. |
85 | decision is not hysteresis based, and is done when the number of free | 85 | That decision is not hysteresis based, and is done when the number of free |
86 | pages is below pages_low; in which case zone_wake_kswapd is also set. | 86 | pages is below watermark[WMARK_LOW]; in which case zone_wake_kswapd is also set. |
87 | 87 | ||
88 | 88 | ||
89 | (Good) Ideas that I have heard: | 89 | (Good) Ideas that I have heard: |
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c new file mode 100644 index 000000000000..0833f44ba16b --- /dev/null +++ b/Documentation/vm/page-types.c | |||
@@ -0,0 +1,698 @@ | |||
1 | /* | ||
2 | * page-types: Tool for querying page flags | ||
3 | * | ||
4 | * Copyright (C) 2009 Intel corporation | ||
5 | * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com> | ||
6 | */ | ||
7 | |||
8 | #include <stdio.h> | ||
9 | #include <stdlib.h> | ||
10 | #include <unistd.h> | ||
11 | #include <stdint.h> | ||
12 | #include <stdarg.h> | ||
13 | #include <string.h> | ||
14 | #include <getopt.h> | ||
15 | #include <limits.h> | ||
16 | #include <sys/types.h> | ||
17 | #include <sys/errno.h> | ||
18 | #include <sys/fcntl.h> | ||
19 | |||
20 | |||
21 | /* | ||
22 | * kernel page flags | ||
23 | */ | ||
24 | |||
25 | #define KPF_BYTES 8 | ||
26 | #define PROC_KPAGEFLAGS "/proc/kpageflags" | ||
27 | |||
28 | /* copied from kpageflags_read() */ | ||
29 | #define KPF_LOCKED 0 | ||
30 | #define KPF_ERROR 1 | ||
31 | #define KPF_REFERENCED 2 | ||
32 | #define KPF_UPTODATE 3 | ||
33 | #define KPF_DIRTY 4 | ||
34 | #define KPF_LRU 5 | ||
35 | #define KPF_ACTIVE 6 | ||
36 | #define KPF_SLAB 7 | ||
37 | #define KPF_WRITEBACK 8 | ||
38 | #define KPF_RECLAIM 9 | ||
39 | #define KPF_BUDDY 10 | ||
40 | |||
41 | /* [11-20] new additions in 2.6.31 */ | ||
42 | #define KPF_MMAP 11 | ||
43 | #define KPF_ANON 12 | ||
44 | #define KPF_SWAPCACHE 13 | ||
45 | #define KPF_SWAPBACKED 14 | ||
46 | #define KPF_COMPOUND_HEAD 15 | ||
47 | #define KPF_COMPOUND_TAIL 16 | ||
48 | #define KPF_HUGE 17 | ||
49 | #define KPF_UNEVICTABLE 18 | ||
50 | #define KPF_NOPAGE 20 | ||
51 | |||
52 | /* [32-] kernel hacking assistances */ | ||
53 | #define KPF_RESERVED 32 | ||
54 | #define KPF_MLOCKED 33 | ||
55 | #define KPF_MAPPEDTODISK 34 | ||
56 | #define KPF_PRIVATE 35 | ||
57 | #define KPF_PRIVATE_2 36 | ||
58 | #define KPF_OWNER_PRIVATE 37 | ||
59 | #define KPF_ARCH 38 | ||
60 | #define KPF_UNCACHED 39 | ||
61 | |||
62 | /* [48-] take some arbitrary free slots for expanding overloaded flags | ||
63 | * not part of kernel API | ||
64 | */ | ||
65 | #define KPF_READAHEAD 48 | ||
66 | #define KPF_SLOB_FREE 49 | ||
67 | #define KPF_SLUB_FROZEN 50 | ||
68 | #define KPF_SLUB_DEBUG 51 | ||
69 | |||
70 | #define KPF_ALL_BITS ((uint64_t)~0ULL) | ||
71 | #define KPF_HACKERS_BITS (0xffffULL << 32) | ||
72 | #define KPF_OVERLOADED_BITS (0xffffULL << 48) | ||
73 | #define BIT(name) (1ULL << KPF_##name) | ||
74 | #define BITS_COMPOUND (BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL)) | ||
75 | |||
76 | static char *page_flag_names[] = { | ||
77 | [KPF_LOCKED] = "L:locked", | ||
78 | [KPF_ERROR] = "E:error", | ||
79 | [KPF_REFERENCED] = "R:referenced", | ||
80 | [KPF_UPTODATE] = "U:uptodate", | ||
81 | [KPF_DIRTY] = "D:dirty", | ||
82 | [KPF_LRU] = "l:lru", | ||
83 | [KPF_ACTIVE] = "A:active", | ||
84 | [KPF_SLAB] = "S:slab", | ||
85 | [KPF_WRITEBACK] = "W:writeback", | ||
86 | [KPF_RECLAIM] = "I:reclaim", | ||
87 | [KPF_BUDDY] = "B:buddy", | ||
88 | |||
89 | [KPF_MMAP] = "M:mmap", | ||
90 | [KPF_ANON] = "a:anonymous", | ||
91 | [KPF_SWAPCACHE] = "s:swapcache", | ||
92 | [KPF_SWAPBACKED] = "b:swapbacked", | ||
93 | [KPF_COMPOUND_HEAD] = "H:compound_head", | ||
94 | [KPF_COMPOUND_TAIL] = "T:compound_tail", | ||
95 | [KPF_HUGE] = "G:huge", | ||
96 | [KPF_UNEVICTABLE] = "u:unevictable", | ||
97 | [KPF_NOPAGE] = "n:nopage", | ||
98 | |||
99 | [KPF_RESERVED] = "r:reserved", | ||
100 | [KPF_MLOCKED] = "m:mlocked", | ||
101 | [KPF_MAPPEDTODISK] = "d:mappedtodisk", | ||
102 | [KPF_PRIVATE] = "P:private", | ||
103 | [KPF_PRIVATE_2] = "p:private_2", | ||
104 | [KPF_OWNER_PRIVATE] = "O:owner_private", | ||
105 | [KPF_ARCH] = "h:arch", | ||
106 | [KPF_UNCACHED] = "c:uncached", | ||
107 | |||
108 | [KPF_READAHEAD] = "I:readahead", | ||
109 | [KPF_SLOB_FREE] = "P:slob_free", | ||
110 | [KPF_SLUB_FROZEN] = "A:slub_frozen", | ||
111 | [KPF_SLUB_DEBUG] = "E:slub_debug", | ||
112 | }; | ||
113 | |||
114 | |||
115 | /* | ||
116 | * data structures | ||
117 | */ | ||
118 | |||
119 | static int opt_raw; /* for kernel developers */ | ||
120 | static int opt_list; /* list pages (in ranges) */ | ||
121 | static int opt_no_summary; /* don't show summary */ | ||
122 | static pid_t opt_pid; /* process to walk */ | ||
123 | |||
124 | #define MAX_ADDR_RANGES 1024 | ||
125 | static int nr_addr_ranges; | ||
126 | static unsigned long opt_offset[MAX_ADDR_RANGES]; | ||
127 | static unsigned long opt_size[MAX_ADDR_RANGES]; | ||
128 | |||
129 | #define MAX_BIT_FILTERS 64 | ||
130 | static int nr_bit_filters; | ||
131 | static uint64_t opt_mask[MAX_BIT_FILTERS]; | ||
132 | static uint64_t opt_bits[MAX_BIT_FILTERS]; | ||
133 | |||
134 | static int page_size; | ||
135 | |||
136 | #define PAGES_BATCH (64 << 10) /* 64k pages */ | ||
137 | static int kpageflags_fd; | ||
138 | static uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH]; | ||
139 | |||
140 | #define HASH_SHIFT 13 | ||
141 | #define HASH_SIZE (1 << HASH_SHIFT) | ||
142 | #define HASH_MASK (HASH_SIZE - 1) | ||
143 | #define HASH_KEY(flags) (flags & HASH_MASK) | ||
144 | |||
145 | static unsigned long total_pages; | ||
146 | static unsigned long nr_pages[HASH_SIZE]; | ||
147 | static uint64_t page_flags[HASH_SIZE]; | ||
148 | |||
149 | |||
150 | /* | ||
151 | * helper functions | ||
152 | */ | ||
153 | |||
154 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | ||
155 | |||
156 | #define min_t(type, x, y) ({ \ | ||
157 | type __min1 = (x); \ | ||
158 | type __min2 = (y); \ | ||
159 | __min1 < __min2 ? __min1 : __min2; }) | ||
160 | |||
161 | unsigned long pages2mb(unsigned long pages) | ||
162 | { | ||
163 | return (pages * page_size) >> 20; | ||
164 | } | ||
165 | |||
166 | void fatal(const char *x, ...) | ||
167 | { | ||
168 | va_list ap; | ||
169 | |||
170 | va_start(ap, x); | ||
171 | vfprintf(stderr, x, ap); | ||
172 | va_end(ap); | ||
173 | exit(EXIT_FAILURE); | ||
174 | } | ||
175 | |||
176 | |||
177 | /* | ||
178 | * page flag names | ||
179 | */ | ||
180 | |||
181 | char *page_flag_name(uint64_t flags) | ||
182 | { | ||
183 | static char buf[65]; | ||
184 | int present; | ||
185 | int i, j; | ||
186 | |||
187 | for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { | ||
188 | present = (flags >> i) & 1; | ||
189 | if (!page_flag_names[i]) { | ||
190 | if (present) | ||
191 | fatal("unkown flag bit %d\n", i); | ||
192 | continue; | ||
193 | } | ||
194 | buf[j++] = present ? page_flag_names[i][0] : '_'; | ||
195 | } | ||
196 | |||
197 | return buf; | ||
198 | } | ||
199 | |||
200 | char *page_flag_longname(uint64_t flags) | ||
201 | { | ||
202 | static char buf[1024]; | ||
203 | int i, n; | ||
204 | |||
205 | for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) { | ||
206 | if (!page_flag_names[i]) | ||
207 | continue; | ||
208 | if ((flags >> i) & 1) | ||
209 | n += snprintf(buf + n, sizeof(buf) - n, "%s,", | ||
210 | page_flag_names[i] + 2); | ||
211 | } | ||
212 | if (n) | ||
213 | n--; | ||
214 | buf[n] = '\0'; | ||
215 | |||
216 | return buf; | ||
217 | } | ||
218 | |||
219 | |||
220 | /* | ||
221 | * page list and summary | ||
222 | */ | ||
223 | |||
224 | void show_page_range(unsigned long offset, uint64_t flags) | ||
225 | { | ||
226 | static uint64_t flags0; | ||
227 | static unsigned long index; | ||
228 | static unsigned long count; | ||
229 | |||
230 | if (flags == flags0 && offset == index + count) { | ||
231 | count++; | ||
232 | return; | ||
233 | } | ||
234 | |||
235 | if (count) | ||
236 | printf("%lu\t%lu\t%s\n", | ||
237 | index, count, page_flag_name(flags0)); | ||
238 | |||
239 | flags0 = flags; | ||
240 | index = offset; | ||
241 | count = 1; | ||
242 | } | ||
243 | |||
244 | void show_page(unsigned long offset, uint64_t flags) | ||
245 | { | ||
246 | printf("%lu\t%s\n", offset, page_flag_name(flags)); | ||
247 | } | ||
248 | |||
249 | void show_summary(void) | ||
250 | { | ||
251 | int i; | ||
252 | |||
253 | printf(" flags\tpage-count MB" | ||
254 | " symbolic-flags\t\t\tlong-symbolic-flags\n"); | ||
255 | |||
256 | for (i = 0; i < ARRAY_SIZE(nr_pages); i++) { | ||
257 | if (nr_pages[i]) | ||
258 | printf("0x%016llx\t%10lu %8lu %s\t%s\n", | ||
259 | (unsigned long long)page_flags[i], | ||
260 | nr_pages[i], | ||
261 | pages2mb(nr_pages[i]), | ||
262 | page_flag_name(page_flags[i]), | ||
263 | page_flag_longname(page_flags[i])); | ||
264 | } | ||
265 | |||
266 | printf(" total\t%10lu %8lu\n", | ||
267 | total_pages, pages2mb(total_pages)); | ||
268 | } | ||
269 | |||
270 | |||
271 | /* | ||
272 | * page flag filters | ||
273 | */ | ||
274 | |||
275 | int bit_mask_ok(uint64_t flags) | ||
276 | { | ||
277 | int i; | ||
278 | |||
279 | for (i = 0; i < nr_bit_filters; i++) { | ||
280 | if (opt_bits[i] == KPF_ALL_BITS) { | ||
281 | if ((flags & opt_mask[i]) == 0) | ||
282 | return 0; | ||
283 | } else { | ||
284 | if ((flags & opt_mask[i]) != opt_bits[i]) | ||
285 | return 0; | ||
286 | } | ||
287 | } | ||
288 | |||
289 | return 1; | ||
290 | } | ||
291 | |||
292 | uint64_t expand_overloaded_flags(uint64_t flags) | ||
293 | { | ||
294 | /* SLOB/SLUB overload several page flags */ | ||
295 | if (flags & BIT(SLAB)) { | ||
296 | if (flags & BIT(PRIVATE)) | ||
297 | flags ^= BIT(PRIVATE) | BIT(SLOB_FREE); | ||
298 | if (flags & BIT(ACTIVE)) | ||
299 | flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN); | ||
300 | if (flags & BIT(ERROR)) | ||
301 | flags ^= BIT(ERROR) | BIT(SLUB_DEBUG); | ||
302 | } | ||
303 | |||
304 | /* PG_reclaim is overloaded as PG_readahead in the read path */ | ||
305 | if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM)) | ||
306 | flags ^= BIT(RECLAIM) | BIT(READAHEAD); | ||
307 | |||
308 | return flags; | ||
309 | } | ||
310 | |||
311 | uint64_t well_known_flags(uint64_t flags) | ||
312 | { | ||
313 | /* hide flags intended only for kernel hacker */ | ||
314 | flags &= ~KPF_HACKERS_BITS; | ||
315 | |||
316 | /* hide non-hugeTLB compound pages */ | ||
317 | if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE))) | ||
318 | flags &= ~BITS_COMPOUND; | ||
319 | |||
320 | return flags; | ||
321 | } | ||
322 | |||
323 | |||
324 | /* | ||
325 | * page frame walker | ||
326 | */ | ||
327 | |||
328 | int hash_slot(uint64_t flags) | ||
329 | { | ||
330 | int k = HASH_KEY(flags); | ||
331 | int i; | ||
332 | |||
333 | /* Explicitly reserve slot 0 for flags 0: the following logic | ||
334 | * cannot distinguish an unoccupied slot from slot (flags==0). | ||
335 | */ | ||
336 | if (flags == 0) | ||
337 | return 0; | ||
338 | |||
339 | /* search through the remaining (HASH_SIZE-1) slots */ | ||
340 | for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) { | ||
341 | if (!k || k >= ARRAY_SIZE(page_flags)) | ||
342 | k = 1; | ||
343 | if (page_flags[k] == 0) { | ||
344 | page_flags[k] = flags; | ||
345 | return k; | ||
346 | } | ||
347 | if (page_flags[k] == flags) | ||
348 | return k; | ||
349 | } | ||
350 | |||
351 | fatal("hash table full: bump up HASH_SHIFT?\n"); | ||
352 | exit(EXIT_FAILURE); | ||
353 | } | ||
354 | |||
355 | void add_page(unsigned long offset, uint64_t flags) | ||
356 | { | ||
357 | flags = expand_overloaded_flags(flags); | ||
358 | |||
359 | if (!opt_raw) | ||
360 | flags = well_known_flags(flags); | ||
361 | |||
362 | if (!bit_mask_ok(flags)) | ||
363 | return; | ||
364 | |||
365 | if (opt_list == 1) | ||
366 | show_page_range(offset, flags); | ||
367 | else if (opt_list == 2) | ||
368 | show_page(offset, flags); | ||
369 | |||
370 | nr_pages[hash_slot(flags)]++; | ||
371 | total_pages++; | ||
372 | } | ||
373 | |||
374 | void walk_pfn(unsigned long index, unsigned long count) | ||
375 | { | ||
376 | unsigned long batch; | ||
377 | unsigned long n; | ||
378 | unsigned long i; | ||
379 | |||
380 | if (index > ULONG_MAX / KPF_BYTES) | ||
381 | fatal("index overflow: %lu\n", index); | ||
382 | |||
383 | lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET); | ||
384 | |||
385 | while (count) { | ||
386 | batch = min_t(unsigned long, count, PAGES_BATCH); | ||
387 | n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES); | ||
388 | if (n == 0) | ||
389 | break; | ||
390 | if (n < 0) { | ||
391 | perror(PROC_KPAGEFLAGS); | ||
392 | exit(EXIT_FAILURE); | ||
393 | } | ||
394 | |||
395 | if (n % KPF_BYTES != 0) | ||
396 | fatal("partial read: %lu bytes\n", n); | ||
397 | n = n / KPF_BYTES; | ||
398 | |||
399 | for (i = 0; i < n; i++) | ||
400 | add_page(index + i, kpageflags_buf[i]); | ||
401 | |||
402 | index += batch; | ||
403 | count -= batch; | ||
404 | } | ||
405 | } | ||
406 | |||
407 | void walk_addr_ranges(void) | ||
408 | { | ||
409 | int i; | ||
410 | |||
411 | kpageflags_fd = open(PROC_KPAGEFLAGS, O_RDONLY); | ||
412 | if (kpageflags_fd < 0) { | ||
413 | perror(PROC_KPAGEFLAGS); | ||
414 | exit(EXIT_FAILURE); | ||
415 | } | ||
416 | |||
417 | if (!nr_addr_ranges) | ||
418 | walk_pfn(0, ULONG_MAX); | ||
419 | |||
420 | for (i = 0; i < nr_addr_ranges; i++) | ||
421 | walk_pfn(opt_offset[i], opt_size[i]); | ||
422 | |||
423 | close(kpageflags_fd); | ||
424 | } | ||
425 | |||
426 | |||
427 | /* | ||
428 | * user interface | ||
429 | */ | ||
430 | |||
431 | const char *page_flag_type(uint64_t flag) | ||
432 | { | ||
433 | if (flag & KPF_HACKERS_BITS) | ||
434 | return "(r)"; | ||
435 | if (flag & KPF_OVERLOADED_BITS) | ||
436 | return "(o)"; | ||
437 | return " "; | ||
438 | } | ||
439 | |||
440 | void usage(void) | ||
441 | { | ||
442 | int i, j; | ||
443 | |||
444 | printf( | ||
445 | "page-types [options]\n" | ||
446 | " -r|--raw Raw mode, for kernel developers\n" | ||
447 | " -a|--addr addr-spec Walk a range of pages\n" | ||
448 | " -b|--bits bits-spec Walk pages with specified bits\n" | ||
449 | #if 0 /* planned features */ | ||
450 | " -p|--pid pid Walk process address space\n" | ||
451 | " -f|--file filename Walk file address space\n" | ||
452 | #endif | ||
453 | " -l|--list Show page details in ranges\n" | ||
454 | " -L|--list-each Show page details one by one\n" | ||
455 | " -N|--no-summary Don't show summay info\n" | ||
456 | " -h|--help Show this usage message\n" | ||
457 | "addr-spec:\n" | ||
458 | " N one page at offset N (unit: pages)\n" | ||
459 | " N+M pages range from N to N+M-1\n" | ||
460 | " N,M pages range from N to M-1\n" | ||
461 | " N, pages range from N to end\n" | ||
462 | " ,M pages range from 0 to M\n" | ||
463 | "bits-spec:\n" | ||
464 | " bit1,bit2 (flags & (bit1|bit2)) != 0\n" | ||
465 | " bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1\n" | ||
466 | " bit1,~bit2 (flags & (bit1|bit2)) == bit1\n" | ||
467 | " =bit1,bit2 flags == (bit1|bit2)\n" | ||
468 | "bit-names:\n" | ||
469 | ); | ||
470 | |||
471 | for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) { | ||
472 | if (!page_flag_names[i]) | ||
473 | continue; | ||
474 | printf("%16s%s", page_flag_names[i] + 2, | ||
475 | page_flag_type(1ULL << i)); | ||
476 | if (++j > 3) { | ||
477 | j = 0; | ||
478 | putchar('\n'); | ||
479 | } | ||
480 | } | ||
481 | printf("\n " | ||
482 | "(r) raw mode bits (o) overloaded bits\n"); | ||
483 | } | ||
484 | |||
485 | unsigned long long parse_number(const char *str) | ||
486 | { | ||
487 | unsigned long long n; | ||
488 | |||
489 | n = strtoll(str, NULL, 0); | ||
490 | |||
491 | if (n == 0 && str[0] != '0') | ||
492 | fatal("invalid name or number: %s\n", str); | ||
493 | |||
494 | return n; | ||
495 | } | ||
496 | |||
497 | void parse_pid(const char *str) | ||
498 | { | ||
499 | opt_pid = parse_number(str); | ||
500 | } | ||
501 | |||
502 | void parse_file(const char *name) | ||
503 | { | ||
504 | } | ||
505 | |||
506 | void add_addr_range(unsigned long offset, unsigned long size) | ||
507 | { | ||
508 | if (nr_addr_ranges >= MAX_ADDR_RANGES) | ||
509 | fatal("too much addr ranges\n"); | ||
510 | |||
511 | opt_offset[nr_addr_ranges] = offset; | ||
512 | opt_size[nr_addr_ranges] = size; | ||
513 | nr_addr_ranges++; | ||
514 | } | ||
515 | |||
516 | void parse_addr_range(const char *optarg) | ||
517 | { | ||
518 | unsigned long offset; | ||
519 | unsigned long size; | ||
520 | char *p; | ||
521 | |||
522 | p = strchr(optarg, ','); | ||
523 | if (!p) | ||
524 | p = strchr(optarg, '+'); | ||
525 | |||
526 | if (p == optarg) { | ||
527 | offset = 0; | ||
528 | size = parse_number(p + 1); | ||
529 | } else if (p) { | ||
530 | offset = parse_number(optarg); | ||
531 | if (p[1] == '\0') | ||
532 | size = ULONG_MAX; | ||
533 | else { | ||
534 | size = parse_number(p + 1); | ||
535 | if (*p == ',') { | ||
536 | if (size < offset) | ||
537 | fatal("invalid range: %lu,%lu\n", | ||
538 | offset, size); | ||
539 | size -= offset; | ||
540 | } | ||
541 | } | ||
542 | } else { | ||
543 | offset = parse_number(optarg); | ||
544 | size = 1; | ||
545 | } | ||
546 | |||
547 | add_addr_range(offset, size); | ||
548 | } | ||
549 | |||
550 | void add_bits_filter(uint64_t mask, uint64_t bits) | ||
551 | { | ||
552 | if (nr_bit_filters >= MAX_BIT_FILTERS) | ||
553 | fatal("too much bit filters\n"); | ||
554 | |||
555 | opt_mask[nr_bit_filters] = mask; | ||
556 | opt_bits[nr_bit_filters] = bits; | ||
557 | nr_bit_filters++; | ||
558 | } | ||
559 | |||
560 | uint64_t parse_flag_name(const char *str, int len) | ||
561 | { | ||
562 | int i; | ||
563 | |||
564 | if (!*str || !len) | ||
565 | return 0; | ||
566 | |||
567 | if (len <= 8 && !strncmp(str, "compound", len)) | ||
568 | return BITS_COMPOUND; | ||
569 | |||
570 | for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) { | ||
571 | if (!page_flag_names[i]) | ||
572 | continue; | ||
573 | if (!strncmp(str, page_flag_names[i] + 2, len)) | ||
574 | return 1ULL << i; | ||
575 | } | ||
576 | |||
577 | return parse_number(str); | ||
578 | } | ||
579 | |||
580 | uint64_t parse_flag_names(const char *str, int all) | ||
581 | { | ||
582 | const char *p = str; | ||
583 | uint64_t flags = 0; | ||
584 | |||
585 | while (1) { | ||
586 | if (*p == ',' || *p == '=' || *p == '\0') { | ||
587 | if ((*str != '~') || (*str == '~' && all && *++str)) | ||
588 | flags |= parse_flag_name(str, p - str); | ||
589 | if (*p != ',') | ||
590 | break; | ||
591 | str = p + 1; | ||
592 | } | ||
593 | p++; | ||
594 | } | ||
595 | |||
596 | return flags; | ||
597 | } | ||
598 | |||
599 | void parse_bits_mask(const char *optarg) | ||
600 | { | ||
601 | uint64_t mask; | ||
602 | uint64_t bits; | ||
603 | const char *p; | ||
604 | |||
605 | p = strchr(optarg, '='); | ||
606 | if (p == optarg) { | ||
607 | mask = KPF_ALL_BITS; | ||
608 | bits = parse_flag_names(p + 1, 0); | ||
609 | } else if (p) { | ||
610 | mask = parse_flag_names(optarg, 0); | ||
611 | bits = parse_flag_names(p + 1, 0); | ||
612 | } else if (strchr(optarg, '~')) { | ||
613 | mask = parse_flag_names(optarg, 1); | ||
614 | bits = parse_flag_names(optarg, 0); | ||
615 | } else { | ||
616 | mask = parse_flag_names(optarg, 0); | ||
617 | bits = KPF_ALL_BITS; | ||
618 | } | ||
619 | |||
620 | add_bits_filter(mask, bits); | ||
621 | } | ||
622 | |||
623 | |||
624 | struct option opts[] = { | ||
625 | { "raw" , 0, NULL, 'r' }, | ||
626 | { "pid" , 1, NULL, 'p' }, | ||
627 | { "file" , 1, NULL, 'f' }, | ||
628 | { "addr" , 1, NULL, 'a' }, | ||
629 | { "bits" , 1, NULL, 'b' }, | ||
630 | { "list" , 0, NULL, 'l' }, | ||
631 | { "list-each" , 0, NULL, 'L' }, | ||
632 | { "no-summary", 0, NULL, 'N' }, | ||
633 | { "help" , 0, NULL, 'h' }, | ||
634 | { NULL , 0, NULL, 0 } | ||
635 | }; | ||
636 | |||
637 | int main(int argc, char *argv[]) | ||
638 | { | ||
639 | int c; | ||
640 | |||
641 | page_size = getpagesize(); | ||
642 | |||
643 | while ((c = getopt_long(argc, argv, | ||
644 | "rp:f:a:b:lLNh", opts, NULL)) != -1) { | ||
645 | switch (c) { | ||
646 | case 'r': | ||
647 | opt_raw = 1; | ||
648 | break; | ||
649 | case 'p': | ||
650 | parse_pid(optarg); | ||
651 | break; | ||
652 | case 'f': | ||
653 | parse_file(optarg); | ||
654 | break; | ||
655 | case 'a': | ||
656 | parse_addr_range(optarg); | ||
657 | break; | ||
658 | case 'b': | ||
659 | parse_bits_mask(optarg); | ||
660 | break; | ||
661 | case 'l': | ||
662 | opt_list = 1; | ||
663 | break; | ||
664 | case 'L': | ||
665 | opt_list = 2; | ||
666 | break; | ||
667 | case 'N': | ||
668 | opt_no_summary = 1; | ||
669 | break; | ||
670 | case 'h': | ||
671 | usage(); | ||
672 | exit(0); | ||
673 | default: | ||
674 | usage(); | ||
675 | exit(1); | ||
676 | } | ||
677 | } | ||
678 | |||
679 | if (opt_list == 1) | ||
680 | printf("offset\tcount\tflags\n"); | ||
681 | if (opt_list == 2) | ||
682 | printf("offset\tflags\n"); | ||
683 | |||
684 | walk_addr_ranges(); | ||
685 | |||
686 | if (opt_list == 1) | ||
687 | show_page_range(0, 0); /* drain the buffer */ | ||
688 | |||
689 | if (opt_no_summary) | ||
690 | return 0; | ||
691 | |||
692 | if (opt_list) | ||
693 | printf("\n\n"); | ||
694 | |||
695 | show_summary(); | ||
696 | |||
697 | return 0; | ||
698 | } | ||
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index ce72c0fe6177..600a304a828c 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt | |||
@@ -12,9 +12,9 @@ There are three components to pagemap: | |||
12 | value for each virtual page, containing the following data (from | 12 | value for each virtual page, containing the following data (from |
13 | fs/proc/task_mmu.c, above pagemap_read): | 13 | fs/proc/task_mmu.c, above pagemap_read): |
14 | 14 | ||
15 | * Bits 0-55 page frame number (PFN) if present | 15 | * Bits 0-54 page frame number (PFN) if present |
16 | * Bits 0-4 swap type if swapped | 16 | * Bits 0-4 swap type if swapped |
17 | * Bits 5-55 swap offset if swapped | 17 | * Bits 5-54 swap offset if swapped |
18 | * Bits 55-60 page shift (page size = 1<<page shift) | 18 | * Bits 55-60 page shift (page size = 1<<page shift) |
19 | * Bit 61 reserved for future use | 19 | * Bit 61 reserved for future use |
20 | * Bit 62 page swapped | 20 | * Bit 62 page swapped |
@@ -36,7 +36,7 @@ There are three components to pagemap: | |||
36 | * /proc/kpageflags. This file contains a 64-bit set of flags for each | 36 | * /proc/kpageflags. This file contains a 64-bit set of flags for each |
37 | page, indexed by PFN. | 37 | page, indexed by PFN. |
38 | 38 | ||
39 | The flags are (from fs/proc/proc_misc, above kpageflags_read): | 39 | The flags are (from fs/proc/page.c, above kpageflags_read): |
40 | 40 | ||
41 | 0. LOCKED | 41 | 0. LOCKED |
42 | 1. ERROR | 42 | 1. ERROR |
@@ -49,6 +49,68 @@ There are three components to pagemap: | |||
49 | 8. WRITEBACK | 49 | 8. WRITEBACK |
50 | 9. RECLAIM | 50 | 9. RECLAIM |
51 | 10. BUDDY | 51 | 10. BUDDY |
52 | 11. MMAP | ||
53 | 12. ANON | ||
54 | 13. SWAPCACHE | ||
55 | 14. SWAPBACKED | ||
56 | 15. COMPOUND_HEAD | ||
57 | 16. COMPOUND_TAIL | ||
58 | 16. HUGE | ||
59 | 18. UNEVICTABLE | ||
60 | 20. NOPAGE | ||
61 | |||
62 | Short descriptions to the page flags: | ||
63 | |||
64 | 0. LOCKED | ||
65 | page is being locked for exclusive access, eg. by undergoing read/write IO | ||
66 | |||
67 | 7. SLAB | ||
68 | page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator | ||
69 | When compound page is used, SLUB/SLQB will only set this flag on the head | ||
70 | page; SLOB will not flag it at all. | ||
71 | |||
72 | 10. BUDDY | ||
73 | a free memory block managed by the buddy system allocator | ||
74 | The buddy system organizes free memory in blocks of various orders. | ||
75 | An order N block has 2^N physically contiguous pages, with the BUDDY flag | ||
76 | set for and _only_ for the first page. | ||
77 | |||
78 | 15. COMPOUND_HEAD | ||
79 | 16. COMPOUND_TAIL | ||
80 | A compound page with order N consists of 2^N physically contiguous pages. | ||
81 | A compound page with order 2 takes the form of "HTTT", where H donates its | ||
82 | head page and T donates its tail page(s). The major consumers of compound | ||
83 | pages are hugeTLB pages (Documentation/vm/hugetlbpage.txt), the SLUB etc. | ||
84 | memory allocators and various device drivers. However in this interface, | ||
85 | only huge/giga pages are made visible to end users. | ||
86 | 17. HUGE | ||
87 | this is an integral part of a HugeTLB page | ||
88 | |||
89 | 20. NOPAGE | ||
90 | no page frame exists at the requested address | ||
91 | |||
92 | [IO related page flags] | ||
93 | 1. ERROR IO error occurred | ||
94 | 3. UPTODATE page has up-to-date data | ||
95 | ie. for file backed page: (in-memory data revision >= on-disk one) | ||
96 | 4. DIRTY page has been written to, hence contains new data | ||
97 | ie. for file backed page: (in-memory data revision > on-disk one) | ||
98 | 8. WRITEBACK page is being synced to disk | ||
99 | |||
100 | [LRU related page flags] | ||
101 | 5. LRU page is in one of the LRU lists | ||
102 | 6. ACTIVE page is in the active LRU list | ||
103 | 18. UNEVICTABLE page is in the unevictable (non-)LRU list | ||
104 | It is somehow pinned and not a candidate for LRU page reclaims, | ||
105 | eg. ramfs pages, shmctl(SHM_LOCK) and mlock() memory segments | ||
106 | 2. REFERENCED page has been referenced since last LRU list enqueue/requeue | ||
107 | 9. RECLAIM page will be reclaimed soon after its pageout IO completed | ||
108 | 11. MMAP a memory mapped page | ||
109 | 12. ANON a memory mapped page that is not part of a file | ||
110 | 13. SWAPCACHE page is mapped to swap space, ie. has an associated swap entry | ||
111 | 14. SWAPBACKED page is backed by swap/RAM | ||
112 | |||
113 | The page-types tool in this directory can be used to query the above flags. | ||
52 | 114 | ||
53 | Using pagemap to do something useful: | 115 | Using pagemap to do something useful: |
54 | 116 | ||