diff options
author | Hugh Dickins <hugh@veritas.com> | 2005-10-29 21:16:18 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-10-30 00:40:39 -0400 |
commit | 365e9c87a982c03d0af3886e29d877f581b59611 (patch) | |
tree | d06c1918ca9fe6677d7e4e869555e095004274f7 /fs | |
parent | 861f2fb8e796022b4928cab9c74fca6681a1c557 (diff) |
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be found inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/compat.c | 1 | ||||
-rw-r--r-- | fs/exec.c | 1 | ||||
-rw-r--r-- | fs/proc/task_mmu.c | 23 |
3 files changed, 21 insertions, 4 deletions
diff --git a/fs/compat.c b/fs/compat.c index a719e158e002..8e71cdbecc7c 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename, | |||
1490 | /* execve success */ | 1490 | /* execve success */ |
1491 | security_bprm_free(bprm); | 1491 | security_bprm_free(bprm); |
1492 | acct_update_integrals(current); | 1492 | acct_update_integrals(current); |
1493 | update_mem_hiwater(current); | ||
1494 | kfree(bprm); | 1493 | kfree(bprm); |
1495 | return retval; | 1494 | return retval; |
1496 | } | 1495 | } |
@@ -1207,7 +1207,6 @@ int do_execve(char * filename, | |||
1207 | /* execve success */ | 1207 | /* execve success */ |
1208 | security_bprm_free(bprm); | 1208 | security_bprm_free(bprm); |
1209 | acct_update_integrals(current); | 1209 | acct_update_integrals(current); |
1210 | update_mem_hiwater(current); | ||
1211 | kfree(bprm); | 1210 | kfree(bprm); |
1212 | return retval; | 1211 | return retval; |
1213 | } | 1212 | } |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bccee7cf9ccd..7c89b4549049 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -14,22 +14,41 @@ | |||
14 | char *task_mem(struct mm_struct *mm, char *buffer) | 14 | char *task_mem(struct mm_struct *mm, char *buffer) |
15 | { | 15 | { |
16 | unsigned long data, text, lib; | 16 | unsigned long data, text, lib; |
17 | unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; | ||
18 | |||
19 | /* | ||
20 | * Note: to minimize their overhead, mm maintains hiwater_vm and | ||
21 | * hiwater_rss only when about to *lower* total_vm or rss. Any | ||
22 | * collector of these hiwater stats must therefore get total_vm | ||
23 | * and rss too, which will usually be the higher. Barriers? not | ||
24 | * worth the effort, such snapshots can always be inconsistent. | ||
25 | */ | ||
26 | hiwater_vm = total_vm = mm->total_vm; | ||
27 | if (hiwater_vm < mm->hiwater_vm) | ||
28 | hiwater_vm = mm->hiwater_vm; | ||
29 | hiwater_rss = total_rss = get_mm_rss(mm); | ||
30 | if (hiwater_rss < mm->hiwater_rss) | ||
31 | hiwater_rss = mm->hiwater_rss; | ||
17 | 32 | ||
18 | data = mm->total_vm - mm->shared_vm - mm->stack_vm; | 33 | data = mm->total_vm - mm->shared_vm - mm->stack_vm; |
19 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; | 34 | text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; |
20 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; | 35 | lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; |
21 | buffer += sprintf(buffer, | 36 | buffer += sprintf(buffer, |
37 | "VmPeak:\t%8lu kB\n" | ||
22 | "VmSize:\t%8lu kB\n" | 38 | "VmSize:\t%8lu kB\n" |
23 | "VmLck:\t%8lu kB\n" | 39 | "VmLck:\t%8lu kB\n" |
40 | "VmHWM:\t%8lu kB\n" | ||
24 | "VmRSS:\t%8lu kB\n" | 41 | "VmRSS:\t%8lu kB\n" |
25 | "VmData:\t%8lu kB\n" | 42 | "VmData:\t%8lu kB\n" |
26 | "VmStk:\t%8lu kB\n" | 43 | "VmStk:\t%8lu kB\n" |
27 | "VmExe:\t%8lu kB\n" | 44 | "VmExe:\t%8lu kB\n" |
28 | "VmLib:\t%8lu kB\n" | 45 | "VmLib:\t%8lu kB\n" |
29 | "VmPTE:\t%8lu kB\n", | 46 | "VmPTE:\t%8lu kB\n", |
30 | (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), | 47 | hiwater_vm << (PAGE_SHIFT-10), |
48 | (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), | ||
31 | mm->locked_vm << (PAGE_SHIFT-10), | 49 | mm->locked_vm << (PAGE_SHIFT-10), |
32 | get_mm_rss(mm) << (PAGE_SHIFT-10), | 50 | hiwater_rss << (PAGE_SHIFT-10), |
51 | total_rss << (PAGE_SHIFT-10), | ||
33 | data << (PAGE_SHIFT-10), | 52 | data << (PAGE_SHIFT-10), |
34 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, | 53 | mm->stack_vm << (PAGE_SHIFT-10), text, lib, |
35 | (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); | 54 | (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); |