diff options
author | Hugh Dickins <hugh@veritas.com> | 2005-10-29 21:16:18 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-10-30 00:40:39 -0400 |
commit | 365e9c87a982c03d0af3886e29d877f581b59611 (patch) | |
tree | d06c1918ca9fe6677d7e4e869555e095004274f7 /mm | |
parent | 861f2fb8e796022b4928cab9c74fca6681a1c557 (diff) |
[PATCH] mm: update_hiwaters just in time
update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability. Originally it was called whenever rss or
total_vm got raised. Then many of those callsites were replaced by a timer
tick call from account_system_time. Now Frank van Maarseveen reports that to
be inadequate. How about this? Works for Frank.
Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm. Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths. Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit. Handle
mm->hiwater_vm in the same way, though it's much less of an issue. Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.
And there has been no collector of these hiwater statistics in the tree. The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).
There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high. A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.
What locking? None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact. But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/fremap.c | 4 | ||||
-rw-r--r-- | mm/hugetlb.c | 3 | ||||
-rw-r--r-- | mm/memory.c | 17 | ||||
-rw-r--r-- | mm/mmap.c | 4 | ||||
-rw-r--r-- | mm/mremap.c | 12 | ||||
-rw-r--r-- | mm/nommu.c | 15 | ||||
-rw-r--r-- | mm/rmap.c | 6 |
7 files changed, 29 insertions, 32 deletions
diff --git a/mm/fremap.c b/mm/fremap.c index 7f08d10ceaff..49719a35769a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
143 | if (!pte) | 143 | if (!pte) |
144 | goto err_unlock; | 144 | goto err_unlock; |
145 | 145 | ||
146 | if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) | 146 | if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { |
147 | update_hiwater_rss(mm); | ||
147 | dec_mm_counter(mm, file_rss); | 148 | dec_mm_counter(mm, file_rss); |
149 | } | ||
148 | 150 | ||
149 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | 151 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
150 | pte_val = *pte; | 152 | pte_val = *pte; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 094455bcbbf7..ac5f044bf514 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
310 | BUG_ON(start & ~HPAGE_MASK); | 310 | BUG_ON(start & ~HPAGE_MASK); |
311 | BUG_ON(end & ~HPAGE_MASK); | 311 | BUG_ON(end & ~HPAGE_MASK); |
312 | 312 | ||
313 | /* Update high watermark before we lower rss */ | ||
314 | update_hiwater_rss(mm); | ||
315 | |||
313 | for (address = start; address < end; address += HPAGE_SIZE) { | 316 | for (address = start; address < end; address += HPAGE_SIZE) { |
314 | ptep = huge_pte_offset(mm, address); | 317 | ptep = huge_pte_offset(mm, address); |
315 | if (! ptep) | 318 | if (! ptep) |
diff --git a/mm/memory.c b/mm/memory.c index a25ee1d3e20a..692ad810263d 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
820 | lru_add_drain(); | 820 | lru_add_drain(); |
821 | spin_lock(&mm->page_table_lock); | 821 | spin_lock(&mm->page_table_lock); |
822 | tlb = tlb_gather_mmu(mm, 0); | 822 | tlb = tlb_gather_mmu(mm, 0); |
823 | update_hiwater_rss(mm); | ||
823 | end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); | 824 | end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); |
824 | tlb_finish_mmu(tlb, address, end); | 825 | tlb_finish_mmu(tlb, address, end); |
825 | spin_unlock(&mm->page_table_lock); | 826 | spin_unlock(&mm->page_table_lock); |
@@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr) | |||
2225 | 2226 | ||
2226 | EXPORT_SYMBOL(vmalloc_to_pfn); | 2227 | EXPORT_SYMBOL(vmalloc_to_pfn); |
2227 | 2228 | ||
2228 | /* | ||
2229 | * update_mem_hiwater | ||
2230 | * - update per process rss and vm high water data | ||
2231 | */ | ||
2232 | void update_mem_hiwater(struct task_struct *tsk) | ||
2233 | { | ||
2234 | if (tsk->mm) { | ||
2235 | unsigned long rss = get_mm_rss(tsk->mm); | ||
2236 | |||
2237 | if (tsk->mm->hiwater_rss < rss) | ||
2238 | tsk->mm->hiwater_rss = rss; | ||
2239 | if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | ||
2240 | tsk->mm->hiwater_vm = tsk->mm->total_vm; | ||
2241 | } | ||
2242 | } | ||
2243 | |||
2244 | #if !defined(__HAVE_ARCH_GATE_AREA) | 2229 | #if !defined(__HAVE_ARCH_GATE_AREA) |
2245 | 2230 | ||
2246 | #if defined(AT_SYSINFO_EHDR) | 2231 | #if defined(AT_SYSINFO_EHDR) |
@@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
1640 | */ | 1640 | */ |
1641 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | 1641 | static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) |
1642 | { | 1642 | { |
1643 | /* Update high watermark before we lower total_vm */ | ||
1644 | update_hiwater_vm(mm); | ||
1643 | do { | 1645 | do { |
1644 | long nrpages = vma_pages(vma); | 1646 | long nrpages = vma_pages(vma); |
1645 | 1647 | ||
@@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm, | |||
1668 | lru_add_drain(); | 1670 | lru_add_drain(); |
1669 | spin_lock(&mm->page_table_lock); | 1671 | spin_lock(&mm->page_table_lock); |
1670 | tlb = tlb_gather_mmu(mm, 0); | 1672 | tlb = tlb_gather_mmu(mm, 0); |
1673 | update_hiwater_rss(mm); | ||
1671 | unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); | 1674 | unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); |
1672 | vm_unacct_memory(nr_accounted); | 1675 | vm_unacct_memory(nr_accounted); |
1673 | free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, | 1676 | free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, |
@@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm) | |||
1953 | 1956 | ||
1954 | flush_cache_mm(mm); | 1957 | flush_cache_mm(mm); |
1955 | tlb = tlb_gather_mmu(mm, 1); | 1958 | tlb = tlb_gather_mmu(mm, 1); |
1959 | /* Don't update_hiwater_rss(mm) here, do_exit already did */ | ||
1956 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 1960 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
1957 | end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); | 1961 | end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); |
1958 | vm_unacct_memory(nr_accounted); | 1962 | vm_unacct_memory(nr_accounted); |
diff --git a/mm/mremap.c b/mm/mremap.c index 318eea5467a0..ccf456477020 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
167 | unsigned long new_pgoff; | 167 | unsigned long new_pgoff; |
168 | unsigned long moved_len; | 168 | unsigned long moved_len; |
169 | unsigned long excess = 0; | 169 | unsigned long excess = 0; |
170 | unsigned long hiwater_vm; | ||
170 | int split = 0; | 171 | int split = 0; |
171 | 172 | ||
172 | /* | 173 | /* |
@@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
205 | } | 206 | } |
206 | 207 | ||
207 | /* | 208 | /* |
208 | * if we failed to move page tables we still do total_vm increment | 209 | * If we failed to move page tables we still do total_vm increment |
209 | * since do_munmap() will decrement it by old_len == new_len | 210 | * since do_munmap() will decrement it by old_len == new_len. |
211 | * | ||
212 | * Since total_vm is about to be raised artificially high for a | ||
213 | * moment, we need to restore high watermark afterwards: if stats | ||
214 | * are taken meanwhile, total_vm and hiwater_vm appear too high. | ||
215 | * If this were a serious issue, we'd add a flag to do_munmap(). | ||
210 | */ | 216 | */ |
217 | hiwater_vm = mm->hiwater_vm; | ||
211 | mm->total_vm += new_len >> PAGE_SHIFT; | 218 | mm->total_vm += new_len >> PAGE_SHIFT; |
212 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); | 219 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); |
213 | 220 | ||
@@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
216 | vm_unacct_memory(excess >> PAGE_SHIFT); | 223 | vm_unacct_memory(excess >> PAGE_SHIFT); |
217 | excess = 0; | 224 | excess = 0; |
218 | } | 225 | } |
226 | mm->hiwater_vm = hiwater_vm; | ||
219 | 227 | ||
220 | /* Restore VM_ACCOUNT if one or two pieces of vma left */ | 228 | /* Restore VM_ACCOUNT if one or two pieces of vma left */ |
221 | if (excess) { | 229 | if (excess) { |
diff --git a/mm/nommu.c b/mm/nommu.c index 599924886eb5..dfb124ffb9be 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) | |||
931 | realalloc -= kobjsize(vml); | 931 | realalloc -= kobjsize(vml); |
932 | askedalloc -= sizeof(*vml); | 932 | askedalloc -= sizeof(*vml); |
933 | kfree(vml); | 933 | kfree(vml); |
934 | |||
935 | update_hiwater_vm(mm); | ||
934 | mm->total_vm -= len >> PAGE_SHIFT; | 936 | mm->total_vm -= len >> PAGE_SHIFT; |
935 | 937 | ||
936 | #ifdef DEBUG | 938 | #ifdef DEBUG |
@@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | |||
1078 | { | 1080 | { |
1079 | } | 1081 | } |
1080 | 1082 | ||
1081 | void update_mem_hiwater(struct task_struct *tsk) | ||
1082 | { | ||
1083 | unsigned long rss; | ||
1084 | |||
1085 | if (likely(tsk->mm)) { | ||
1086 | rss = get_mm_rss(tsk->mm); | ||
1087 | if (tsk->mm->hiwater_rss < rss) | ||
1088 | tsk->mm->hiwater_rss = rss; | ||
1089 | if (tsk->mm->hiwater_vm < tsk->mm->total_vm) | ||
1090 | tsk->mm->hiwater_vm = tsk->mm->total_vm; | ||
1091 | } | ||
1092 | } | ||
1093 | |||
1094 | void unmap_mapping_range(struct address_space *mapping, | 1083 | void unmap_mapping_range(struct address_space *mapping, |
1095 | loff_t const holebegin, loff_t const holelen, | 1084 | loff_t const holebegin, loff_t const holelen, |
1096 | int even_cows) | 1085 | int even_cows) |
@@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
538 | if (pte_dirty(pteval)) | 538 | if (pte_dirty(pteval)) |
539 | set_page_dirty(page); | 539 | set_page_dirty(page); |
540 | 540 | ||
541 | /* Update high watermark before we lower rss */ | ||
542 | update_hiwater_rss(mm); | ||
543 | |||
541 | if (PageAnon(page)) { | 544 | if (PageAnon(page)) { |
542 | swp_entry_t entry = { .val = page->private }; | 545 | swp_entry_t entry = { .val = page->private }; |
543 | /* | 546 | /* |
@@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
628 | if (!pmd_present(*pmd)) | 631 | if (!pmd_present(*pmd)) |
629 | goto out_unlock; | 632 | goto out_unlock; |
630 | 633 | ||
634 | /* Update high watermark before we lower rss */ | ||
635 | update_hiwater_rss(mm); | ||
636 | |||
631 | for (original_pte = pte = pte_offset_map(pmd, address); | 637 | for (original_pte = pte = pte_offset_map(pmd, address); |
632 | address < end; pte++, address += PAGE_SIZE) { | 638 | address < end; pte++, address += PAGE_SIZE) { |
633 | 639 | ||