author     Hugh Dickins <hugh@veritas.com>          2005-10-29 21:16:40 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>    2005-10-30 00:40:42 -0400
commit     4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 (patch)
tree       1f76d33bb1d76221c6424bc5fed080a4f91349a6 /mm/memory.c
parent     b38c6845b695141259019e2b7c0fe6c32a6e720d (diff)
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
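As a rough sketch of what a pte walker does under the split lock (illustration only: pte_lockptr is the helper used in the diff below, pte_offset_map/pte_unmap are the existing mapping helpers, and mm, pmd, address stand for whatever the caller already has in hand):

    pte_t *pte = pte_offset_map(pmd, address);
    spinlock_t *ptl = pte_lockptr(mm, pmd);  /* lock of this page table page,
                                                or mm->page_table_lock when unsplit */
    spin_lock(ptl);
    /* examine or modify PTEs of this page table page under ptl */
    spin_unlock(ptl);
    pte_unmap(pte);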
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
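The struct page side of that lives outside this mm/memory.c diff; purely as a sketch of the idea (the union placement, the "u" name and where the check runs are illustrative assumptions, not the real include/linux/mm.h hunks):

    struct page {
            unsigned long flags;
            /* ... */
            union {
                    unsigned long private; /* unused while the page holds a page table */
                    spinlock_t ptl;        /* split page table lock for that page table */
            } u;
            /* ... */
    };

    /* somewhere on an init path: refuse to build if spinlock debugging bloats
     * spinlock_t beyond the space borrowed above (as on 32-bit PA-RISC) */
    BUILD_BUG_ON(sizeof(spinlock_t) > sizeof(unsigned long));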
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
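A sketch of that preprocessor comparison (CONFIG_SPLIT_PTLOCK_CPUS is the assumed name of the Kconfig threshold, and __pte_lockptr is a hypothetical helper returning the address of the lock in struct page; only pte_lockptr itself appears in the diff below):

    #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
    /* split: the lock lives in the struct page of the page table page */
    #define pte_lockptr(mm, pmd) ({ (void)(mm); __pte_lockptr(pmd_page(*(pmd))); })
    #else
    /* unsplit: fall back to the single mm-wide page_table_lock */
    #define pte_lockptr(mm, pmd) ({ (void)(pmd); &(mm)->page_table_lock; })
    #endif

Either way, callers simply do spin_lock(pte_lockptr(mm, pmd)), which is exactly the substitution the hunks below make.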
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c | 24
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 8461e2dd91d7..e9ef599498b5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
         struct page *page = pmd_page(*pmd);
         pmd_clear(pmd);
+        pte_lock_deinit(page);
         pte_free_tlb(tlb, page);
         dec_page_state(nr_page_table_pages);
         tlb->mm->nr_ptes--;
@@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
         if (!new)
                 return -ENOMEM;
 
+        pte_lock_init(new);
         spin_lock(&mm->page_table_lock);
-        if (pmd_present(*pmd))          /* Another has populated it */
+        if (pmd_present(*pmd)) {        /* Another has populated it */
+                pte_lock_deinit(new);
                 pte_free(new);
-        else {
+        } else {
                 mm->nr_ptes++;
                 inc_page_state(nr_page_table_pages);
                 pmd_populate(mm, pmd, new);
@@ -432,7 +435,7 @@ again:
         if (!dst_pte)
                 return -ENOMEM;
         src_pte = pte_offset_map_nested(src_pmd, addr);
-        src_ptl = &src_mm->page_table_lock;
+        src_ptl = pte_lockptr(src_mm, src_pmd);
         spin_lock(src_ptl);
 
         do {
@@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range);
  * (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page and do_no_page can safely check later on).
  */
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
                                 pte_t *page_table, pte_t orig_pte)
 {
         int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
         if (sizeof(pte_t) > sizeof(unsigned long)) {
-                spin_lock(&mm->page_table_lock);
+                spinlock_t *ptl = pte_lockptr(mm, pmd);
+                spin_lock(ptl);
                 same = pte_same(*page_table, orig_pte);
-                spin_unlock(&mm->page_table_lock);
+                spin_unlock(ptl);
         }
 #endif
         pte_unmap(page_table);
@@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         pte_t pte;
         int ret = VM_FAULT_MINOR;
 
-        if (!pte_unmap_same(mm, page_table, orig_pte))
+        if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                 goto out;
 
         entry = pte_to_swp_entry(orig_pte);
@@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 page_cache_get(page);
                 entry = mk_pte(page, vma->vm_page_prot);
 
-                ptl = &mm->page_table_lock;
+                ptl = pte_lockptr(mm, pmd);
                 spin_lock(ptl);
                 if (!pte_none(*page_table))
                         goto release;
@@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
         pgoff_t pgoff;
         int err;
 
-        if (!pte_unmap_same(mm, page_table, orig_pte))
+        if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                 return VM_FAULT_MINOR;
 
         if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
@@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
                                         pte, pmd, write_access, entry);
         }
 
-        ptl = &mm->page_table_lock;
+        ptl = pte_lockptr(mm, pmd);
         spin_lock(ptl);
         if (unlikely(!pte_same(*pte, entry)))
                 goto unlock;