From 39c715b71740c4a78ba4769fb54826929bac03cb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 21 Jun 2005 17:14:34 -0700 Subject: [PATCH] smp_processor_id() cleanup This patch implements a number of smp_processor_id() cleanup ideas that Arjan van de Ven and I came up with. The previous __smp_processor_id/_smp_processor_id/smp_processor_id API spaghetti was hard to follow both on the implementational and on the usage side. Some of the complexity arose from picking wrong names, some of the complexity comes from the fact that not all architectures defined __smp_processor_id. In the new code, there are two externally visible symbols: - smp_processor_id(): debug variant. - raw_smp_processor_id(): nondebug variant. Replaces all existing uses of _smp_processor_id() and __smp_processor_id(). Defined by every SMP architecture in include/asm-*/smp.h. There is one new internal symbol, dependent on DEBUG_PREEMPT: - debug_smp_processor_id(): internal debug variant, mapped to smp_processor_id(). Also, i moved debug_smp_processor_id() from lib/kernel_lock.c into a new lib/smp_processor_id.c file. All related comments got updated and/or clarified. I have build/boot tested the following 8 .config combinations on x86: {SMP,UP} x {PREEMPT,!PREEMPT} x {DEBUG_PREEMPT,!DEBUG_PREEMPT} I have also build/boot tested x64 on UP/PREEMPT/DEBUG_PREEMPT. (Other architectures are untested, but should work just fine.) Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/lib/delay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/sh') diff --git a/arch/sh/lib/delay.c b/arch/sh/lib/delay.c index 50b36037d8..351714694d 100644 --- a/arch/sh/lib/delay.c +++ b/arch/sh/lib/delay.c @@ -24,7 +24,7 @@ inline void __const_udelay(unsigned long xloops) __asm__("dmulu.l %0, %2\n\t" "sts mach, %0" : "=r" (xloops) - : "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy) + : "0" (xloops), "r" (cpu_data[raw_smp_processor_id()].loops_per_jiffy) : "macl", "mach"); __delay(xloops * HZ); } -- cgit v1.2.2 From 63551ae0feaaa23807ebea60de1901564bbef32e Mon Sep 17 00:00:00 2001 From: David Gibson Date: Tue, 21 Jun 2005 17:14:44 -0700 Subject: [PATCH] Hugepage consolidation A lot of the code in arch/*/mm/hugetlbpage.c is quite similar. This patch attempts to consolidate a lot of the code across the arch's, putting the combined version in mm/hugetlb.c. There are a couple of uglyish hacks in order to covert all the hugepage archs, but the result is a very large reduction in the total amount of code. It also means things like hugepage lazy allocation could be implemented in one place, instead of six. Tested, at least a little, on ppc64, i386 and x86_64. Notes: - this patch changes the meaning of set_huge_pte() to be more analagous to set_pte() - does SH4 need s special huge_ptep_get_and_clear()?? Acked-by: William Lee Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/mm/hugetlbpage.c | 196 ++++++----------------------------------------- 1 file changed, 24 insertions(+), 172 deletions(-) (limited to 'arch/sh') diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 1f897bab23..95bb1a6c60 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -24,7 +24,7 @@ #include #include -static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -39,7 +39,7 @@ static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) return pte; } -static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pmd_t *pmd; @@ -56,28 +56,34 @@ static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, pte_t * page_table, int write_access) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) { - unsigned long i; - pte_t entry; + int i; - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + set_pte_at(mm, addr, ptep, entry); + ptep++; + addr += PAGE_SIZE; + pte_val(entry) += PAGE_SIZE; + } +} - if (write_access) - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot))); - else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t entry; + int i; - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte(page_table, entry); - page_table++; + entry = *ptep; - pte_val(entry) += PAGE_SIZE; + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { + pte_clear(mm, addr, ptep); + addr += PAGE_SIZE; + ptep++; } + + return entry; } /* @@ -92,79 +98,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - int i; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - BUG_ON(!src_pte || pte_none(*src_pte)); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte(dst_pte, entry); - pte_val(entry) += PAGE_SIZE; - dst_pte++; - } - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - addr += HPAGE_SIZE; - } - return 0; - -nomem: - return -ENOMEM; -} - -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - while (vaddr < vma->vm_end && remainder) { - if (pages) { - pte_t *pte; - struct page *page; - - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - BUG_ON(!pte || pte_none(*pte)); - - page = pte_page(*pte); - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -181,84 +114,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - -void unmap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t *pte; - struct page *page; - int i; - - BUG_ON(start & (HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - pte = huge_pte_offset(mm, address); - BUG_ON(!pte); - if (pte_none(*pte)) - continue; - page = pte_page(*pte); - put_page(page); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - pte_clear(mm, address+(i*PAGE_SIZE), pte); - pte++; - } - } - add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; -} -- cgit v1.2.2 From 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Mon Sep 17 00:00:00 2001 From: Wolfgang Wander Date: Tue, 21 Jun 2005 17:14:49 -0700 Subject: [PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander Credit-to: "Richard Purdie" Signed-off-by: Ken Chen Acked-by: Ingo Molnar (partly) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/kernel/sys_sh.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/sh') diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c index df5ac294c3..917b2f32f2 100644 --- a/arch/sh/kernel/sys_sh.c +++ b/arch/sh/kernel/sys_sh.c @@ -79,6 +79,10 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = TASK_UNMAPPED_BASE; + } if (flags & MAP_PRIVATE) addr = PAGE_ALIGN(mm->free_area_cache); else @@ -95,6 +99,7 @@ full_search: */ if (start_addr != TASK_UNMAPPED_BASE) { start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; goto full_search; } return -ENOMEM; @@ -106,6 +111,9 @@ full_search: mm->free_area_cache = addr + len; return addr; } + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + addr = vma->vm_end; if (!(flags & MAP_PRIVATE)) addr = COLOUR_ALIGN(addr); -- cgit v1.2.2