aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHugh Dickins <hugh@veritas.com>2005-10-29 21:16:18 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-10-30 00:40:39 -0400
commit365e9c87a982c03d0af3886e29d877f581b59611 (patch)
treed06c1918ca9fe6677d7e4e869555e095004274f7
parent861f2fb8e796022b4928cab9c74fca6681a1c557 (diff)
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those concerned with mm scalability. Originally it was called whenever rss or total_vm got raised. Then many of those callsites were replaced by a timer tick call from account_system_time. Now Frank van Maarseveen reports that to be found inadequate. How about this? Works for Frank.

Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros update_hiwater_rss and update_hiwater_vm. Don't attempt to keep mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually by 1): those are hot paths. Do the opposite, update only when about to lower rss (usually by many), or just before final accounting in do_exit. Handle mm->hiwater_vm in the same way, though it's much less of an issue. Demand that whoever collects these hiwater statistics do the work of taking the maximum with rss or total_vm.

And there has been no collector of these hiwater statistics in the tree. The new convention needs an example, so match Frank's usage by adding a VmPeak line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS (High-Water-Mark or High-Water-Memory).

There was a particular anomaly during mremap move, that hiwater_vm might be captured too high. A fleeting such anomaly remains, but it's quickly corrected now, whereas before it would stick.

What locking? None: if the app is racy then these statistics will be racy, it's not worth any overhead to make them exact. But whenever it suits, hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under page_table_lock (for now) or with preemption disabled (later on): without going to any trouble, minimize the time between reading current values and updating, to minimize those occasions when a racing thread bumps a count up and back down in between.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--fs/compat.c1
-rw-r--r--fs/exec.c1
-rw-r--r--fs/proc/task_mmu.c23
-rw-r--r--include/linux/mm.h3
-rw-r--r--include/linux/sched.h10
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/sched.c2
-rw-r--r--mm/fremap.c4
-rw-r--r--mm/hugetlb.c3
-rw-r--r--mm/memory.c17
-rw-r--r--mm/mmap.c4
-rw-r--r--mm/mremap.c12
-rw-r--r--mm/nommu.c15
-rw-r--r--mm/rmap.c6
14 files changed, 64 insertions, 42 deletions
diff --git a/fs/compat.c b/fs/compat.c
index a719e158e002..8e71cdbecc7c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
1490 /* execve success */ 1490 /* execve success */
1491 security_bprm_free(bprm); 1491 security_bprm_free(bprm);
1492 acct_update_integrals(current); 1492 acct_update_integrals(current);
1493 update_mem_hiwater(current);
1494 kfree(bprm); 1493 kfree(bprm);
1495 return retval; 1494 return retval;
1496 } 1495 }
diff --git a/fs/exec.c b/fs/exec.c
index cefadf5ab83b..9bb55c8cf224 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1207,7 +1207,6 @@ int do_execve(char * filename,
1207 /* execve success */ 1207 /* execve success */
1208 security_bprm_free(bprm); 1208 security_bprm_free(bprm);
1209 acct_update_integrals(current); 1209 acct_update_integrals(current);
1210 update_mem_hiwater(current);
1211 kfree(bprm); 1210 kfree(bprm);
1212 return retval; 1211 return retval;
1213 } 1212 }
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index bccee7cf9ccd..7c89b4549049 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,22 +14,41 @@
14char *task_mem(struct mm_struct *mm, char *buffer) 14char *task_mem(struct mm_struct *mm, char *buffer)
15{ 15{
16 unsigned long data, text, lib; 16 unsigned long data, text, lib;
17 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
18
19 /*
20 * Note: to minimize their overhead, mm maintains hiwater_vm and
21 * hiwater_rss only when about to *lower* total_vm or rss. Any
22 * collector of these hiwater stats must therefore get total_vm
23 * and rss too, which will usually be the higher. Barriers? not
24 * worth the effort, such snapshots can always be inconsistent.
25 */
26 hiwater_vm = total_vm = mm->total_vm;
27 if (hiwater_vm < mm->hiwater_vm)
28 hiwater_vm = mm->hiwater_vm;
29 hiwater_rss = total_rss = get_mm_rss(mm);
30 if (hiwater_rss < mm->hiwater_rss)
31 hiwater_rss = mm->hiwater_rss;
17 32
18 data = mm->total_vm - mm->shared_vm - mm->stack_vm; 33 data = mm->total_vm - mm->shared_vm - mm->stack_vm;
19 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 34 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
20 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 35 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
21 buffer += sprintf(buffer, 36 buffer += sprintf(buffer,
37 "VmPeak:\t%8lu kB\n"
22 "VmSize:\t%8lu kB\n" 38 "VmSize:\t%8lu kB\n"
23 "VmLck:\t%8lu kB\n" 39 "VmLck:\t%8lu kB\n"
40 "VmHWM:\t%8lu kB\n"
24 "VmRSS:\t%8lu kB\n" 41 "VmRSS:\t%8lu kB\n"
25 "VmData:\t%8lu kB\n" 42 "VmData:\t%8lu kB\n"
26 "VmStk:\t%8lu kB\n" 43 "VmStk:\t%8lu kB\n"
27 "VmExe:\t%8lu kB\n" 44 "VmExe:\t%8lu kB\n"
28 "VmLib:\t%8lu kB\n" 45 "VmLib:\t%8lu kB\n"
29 "VmPTE:\t%8lu kB\n", 46 "VmPTE:\t%8lu kB\n",
30 (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), 47 hiwater_vm << (PAGE_SHIFT-10),
48 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
31 mm->locked_vm << (PAGE_SHIFT-10), 49 mm->locked_vm << (PAGE_SHIFT-10),
32 get_mm_rss(mm) << (PAGE_SHIFT-10), 50 hiwater_rss << (PAGE_SHIFT-10),
51 total_rss << (PAGE_SHIFT-10),
33 data << (PAGE_SHIFT-10), 52 data << (PAGE_SHIFT-10),
34 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 53 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
35 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); 54 (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index da42093250c3..7d4552fe0864 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm,
938} 938}
939#endif /* CONFIG_PROC_FS */ 939#endif /* CONFIG_PROC_FS */
940 940
941/* update per process rss and vm hiwater data */
942extern void update_mem_hiwater(struct task_struct *tsk);
943
944#ifndef CONFIG_DEBUG_PAGEALLOC 941#ifndef CONFIG_DEBUG_PAGEALLOC
945static inline void 942static inline void
946kernel_map_pages(struct page *page, int numpages, int enable) 943kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index afcaac66cbd5..a9c0b7d26303 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
256#define dec_mm_counter(mm, member) (mm)->_##member-- 256#define dec_mm_counter(mm, member) (mm)->_##member--
257#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) 257#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
258 258
259#define update_hiwater_rss(mm) do { \
260 unsigned long _rss = get_mm_rss(mm); \
261 if ((mm)->hiwater_rss < _rss) \
262 (mm)->hiwater_rss = _rss; \
263} while (0)
264#define update_hiwater_vm(mm) do { \
265 if ((mm)->hiwater_vm < (mm)->total_vm) \
266 (mm)->hiwater_vm = (mm)->total_vm; \
267} while (0)
268
259typedef unsigned long mm_counter_t; 269typedef unsigned long mm_counter_t;
260 270
261struct mm_struct { 271struct mm_struct {
diff --git a/kernel/exit.c b/kernel/exit.c
index 3b25b182d2be..79f52b85d6ed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code)
839 preempt_count()); 839 preempt_count());
840 840
841 acct_update_integrals(tsk); 841 acct_update_integrals(tsk);
842 update_mem_hiwater(tsk); 842 if (tsk->mm) {
843 update_hiwater_rss(tsk->mm);
844 update_hiwater_vm(tsk->mm);
845 }
843 group_dead = atomic_dec_and_test(&tsk->signal->live); 846 group_dead = atomic_dec_and_test(&tsk->signal->live);
844 if (group_dead) { 847 if (group_dead) {
845 del_timer_sync(&tsk->signal->real_timer); 848 del_timer_sync(&tsk->signal->real_timer);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1e5cafdf4e27..4f26c544d02c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
2511 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2511 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2512 /* Account for system time used */ 2512 /* Account for system time used */
2513 acct_update_integrals(p); 2513 acct_update_integrals(p);
2514 /* Update rss highwater mark */
2515 update_mem_hiwater(p);
2516} 2514}
2517 2515
2518/* 2516/*
diff --git a/mm/fremap.c b/mm/fremap.c
index 7f08d10ceaff..49719a35769a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
143 if (!pte) 143 if (!pte)
144 goto err_unlock; 144 goto err_unlock;
145 145
146 if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) 146 if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
147 update_hiwater_rss(mm);
147 dec_mm_counter(mm, file_rss); 148 dec_mm_counter(mm, file_rss);
149 }
148 150
149 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); 151 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
150 pte_val = *pte; 152 pte_val = *pte;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 094455bcbbf7..ac5f044bf514 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
310 BUG_ON(start & ~HPAGE_MASK); 310 BUG_ON(start & ~HPAGE_MASK);
311 BUG_ON(end & ~HPAGE_MASK); 311 BUG_ON(end & ~HPAGE_MASK);
312 312
313 /* Update high watermark before we lower rss */
314 update_hiwater_rss(mm);
315
313 for (address = start; address < end; address += HPAGE_SIZE) { 316 for (address = start; address < end; address += HPAGE_SIZE) {
314 ptep = huge_pte_offset(mm, address); 317 ptep = huge_pte_offset(mm, address);
315 if (! ptep) 318 if (! ptep)
diff --git a/mm/memory.c b/mm/memory.c
index a25ee1d3e20a..692ad810263d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
820 lru_add_drain(); 820 lru_add_drain();
821 spin_lock(&mm->page_table_lock); 821 spin_lock(&mm->page_table_lock);
822 tlb = tlb_gather_mmu(mm, 0); 822 tlb = tlb_gather_mmu(mm, 0);
823 update_hiwater_rss(mm);
823 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); 824 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
824 tlb_finish_mmu(tlb, address, end); 825 tlb_finish_mmu(tlb, address, end);
825 spin_unlock(&mm->page_table_lock); 826 spin_unlock(&mm->page_table_lock);
@@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2225 2226
2226EXPORT_SYMBOL(vmalloc_to_pfn); 2227EXPORT_SYMBOL(vmalloc_to_pfn);
2227 2228
2228/*
2229 * update_mem_hiwater
2230 * - update per process rss and vm high water data
2231 */
2232void update_mem_hiwater(struct task_struct *tsk)
2233{
2234 if (tsk->mm) {
2235 unsigned long rss = get_mm_rss(tsk->mm);
2236
2237 if (tsk->mm->hiwater_rss < rss)
2238 tsk->mm->hiwater_rss = rss;
2239 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
2240 tsk->mm->hiwater_vm = tsk->mm->total_vm;
2241 }
2242}
2243
2244#if !defined(__HAVE_ARCH_GATE_AREA) 2229#if !defined(__HAVE_ARCH_GATE_AREA)
2245 2230
2246#if defined(AT_SYSINFO_EHDR) 2231#if defined(AT_SYSINFO_EHDR)
diff --git a/mm/mmap.c b/mm/mmap.c
index 8a111792b8db..c43b28457007 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1640 */ 1640 */
1641static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 1641static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1642{ 1642{
1643 /* Update high watermark before we lower total_vm */
1644 update_hiwater_vm(mm);
1643 do { 1645 do {
1644 long nrpages = vma_pages(vma); 1646 long nrpages = vma_pages(vma);
1645 1647
@@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm,
1668 lru_add_drain(); 1670 lru_add_drain();
1669 spin_lock(&mm->page_table_lock); 1671 spin_lock(&mm->page_table_lock);
1670 tlb = tlb_gather_mmu(mm, 0); 1672 tlb = tlb_gather_mmu(mm, 0);
1673 update_hiwater_rss(mm);
1671 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); 1674 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1672 vm_unacct_memory(nr_accounted); 1675 vm_unacct_memory(nr_accounted);
1673 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1676 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
@@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm)
1953 1956
1954 flush_cache_mm(mm); 1957 flush_cache_mm(mm);
1955 tlb = tlb_gather_mmu(mm, 1); 1958 tlb = tlb_gather_mmu(mm, 1);
1959 /* Don't update_hiwater_rss(mm) here, do_exit already did */
1956 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 1960 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1957 end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); 1961 end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1958 vm_unacct_memory(nr_accounted); 1962 vm_unacct_memory(nr_accounted);
diff --git a/mm/mremap.c b/mm/mremap.c
index 318eea5467a0..ccf456477020 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
167 unsigned long new_pgoff; 167 unsigned long new_pgoff;
168 unsigned long moved_len; 168 unsigned long moved_len;
169 unsigned long excess = 0; 169 unsigned long excess = 0;
170 unsigned long hiwater_vm;
170 int split = 0; 171 int split = 0;
171 172
172 /* 173 /*
@@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma,
205 } 206 }
206 207
207 /* 208 /*
208 * if we failed to move page tables we still do total_vm increment 209 * If we failed to move page tables we still do total_vm increment
209 * since do_munmap() will decrement it by old_len == new_len 210 * since do_munmap() will decrement it by old_len == new_len.
211 *
212 * Since total_vm is about to be raised artificially high for a
213 * moment, we need to restore high watermark afterwards: if stats
214 * are taken meanwhile, total_vm and hiwater_vm appear too high.
215 * If this were a serious issue, we'd add a flag to do_munmap().
210 */ 216 */
217 hiwater_vm = mm->hiwater_vm;
211 mm->total_vm += new_len >> PAGE_SHIFT; 218 mm->total_vm += new_len >> PAGE_SHIFT;
212 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 219 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
213 220
@@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
216 vm_unacct_memory(excess >> PAGE_SHIFT); 223 vm_unacct_memory(excess >> PAGE_SHIFT);
217 excess = 0; 224 excess = 0;
218 } 225 }
226 mm->hiwater_vm = hiwater_vm;
219 227
220 /* Restore VM_ACCOUNT if one or two pieces of vma left */ 228 /* Restore VM_ACCOUNT if one or two pieces of vma left */
221 if (excess) { 229 if (excess) {
diff --git a/mm/nommu.c b/mm/nommu.c
index 599924886eb5..dfb124ffb9be 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
931 realalloc -= kobjsize(vml); 931 realalloc -= kobjsize(vml);
932 askedalloc -= sizeof(*vml); 932 askedalloc -= sizeof(*vml);
933 kfree(vml); 933 kfree(vml);
934
935 update_hiwater_vm(mm);
934 mm->total_vm -= len >> PAGE_SHIFT; 936 mm->total_vm -= len >> PAGE_SHIFT;
935 937
936#ifdef DEBUG 938#ifdef DEBUG
@@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1078{ 1080{
1079} 1081}
1080 1082
1081void update_mem_hiwater(struct task_struct *tsk)
1082{
1083 unsigned long rss;
1084
1085 if (likely(tsk->mm)) {
1086 rss = get_mm_rss(tsk->mm);
1087 if (tsk->mm->hiwater_rss < rss)
1088 tsk->mm->hiwater_rss = rss;
1089 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
1090 tsk->mm->hiwater_vm = tsk->mm->total_vm;
1091 }
1092}
1093
1094void unmap_mapping_range(struct address_space *mapping, 1083void unmap_mapping_range(struct address_space *mapping,
1095 loff_t const holebegin, loff_t const holelen, 1084 loff_t const holebegin, loff_t const holelen,
1096 int even_cows) 1085 int even_cows)
diff --git a/mm/rmap.c b/mm/rmap.c
index f69d5342ce7f..4c52c56c9905 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
538 if (pte_dirty(pteval)) 538 if (pte_dirty(pteval))
539 set_page_dirty(page); 539 set_page_dirty(page);
540 540
541 /* Update high watermark before we lower rss */
542 update_hiwater_rss(mm);
543
541 if (PageAnon(page)) { 544 if (PageAnon(page)) {
542 swp_entry_t entry = { .val = page->private }; 545 swp_entry_t entry = { .val = page->private };
543 /* 546 /*
@@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor,
628 if (!pmd_present(*pmd)) 631 if (!pmd_present(*pmd))
629 goto out_unlock; 632 goto out_unlock;
630 633
634 /* Update high watermark before we lower rss */
635 update_hiwater_rss(mm);
636
631 for (original_pte = pte = pte_offset_map(pmd, address); 637 for (original_pte = pte = pte_offset_map(pmd, address);
632 address < end; pte++, address += PAGE_SIZE) { 638 address < end; pte++, address += PAGE_SIZE) {
633 639