| field | value | date |
|---|---|---|
| author | Hugh Dickins <hugh@veritas.com> | 2005-10-29 21:16:40 -0400 |
| committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-10-30 00:40:42 -0400 |
| commit | 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 (patch) | |
| tree | 1f76d33bb1d76221c6424bc5fed080a4f91349a6 /include/linux | |
| parent | b38c6845b695141259019e2b7c0fe6c32a6e720d (diff) | |
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
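As a concrete illustration (not part of the patch itself), consider a pte walker written against the pte_offset_map_lock helper: it picks up whichever lock that macro resolves to, and the mm.h hunk below redirects the macro from mm->page_table_lock to the per-page lock without the caller changing at all. The function below is a hypothetical sketch; pte_unmap_unlock is assumed to be the companion unlock macro defined next to pte_offset_map_lock in mm.h.

```c
#include <linux/errno.h>
#include <linux/mm.h>

/* Hypothetical caller, for illustration only: install one pte while holding
 * whatever pte_offset_map_lock hands back -- page->u.ptl when split ptlock
 * is configured, mm->page_table_lock otherwise. */
static int set_one_pte(struct mm_struct *mm, pmd_t *pmd,
		       unsigned long addr, pte_t entry)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_none(*pte)) {
		pte_unmap_unlock(pte, ptl);	/* someone else got here first */
		return -EEXIST;
	}
	set_pte_at(mm, addr, pte, entry);
	pte_unmap_unlock(pte, ptl);
	return 0;
}
```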
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
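The diffstat below is limited to include/linux, so the BUILD_BUG_ON itself is not shown here. The following is only a rough sketch of the kind of compile-time assertion described, with a hypothetical function name and an assumed condition; the exact form and placement in the patch may differ.

```c
#include <linux/kernel.h>
#include <linux/spinlock.h>

/* Sketch only: refuse to build if the spinlock no longer fits in the word
 * it overlays inside struct page, as would happen on 32-bit PA-RISC with
 * spinlock debugging enabled. */
static inline void split_ptlock_size_check(void)
{
	BUILD_BUG_ON(sizeof(spinlock_t) > sizeof(unsigned long));
}
```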
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
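Concretely, the guard is the preprocessor test added in the mm.h hunk below: Kconfig only supplies the number (CONFIG_SPLIT_PTLOCK_CPUS, e.g. 4), and the inequality against NR_CPUS is evaluated at compile time.

```c
/* NR_CPUS comes from <linux/threads.h>, CONFIG_SPLIT_PTLOCK_CPUS from the
 * generated kernel config; Kconfig itself cannot express ">=". */
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	/* split case: one spinlock per page table page, kept in page->u.ptl */
#else
	/* unsplit case: everything stays under mm->page_table_lock */
#endif
```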
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'include/linux')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | include/linux/buffer_head.h | 6 |
| -rw-r--r-- | include/linux/mm.h | 46 |
2 files changed, 41 insertions, 11 deletions
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 88af42f5e0..c937d6e655 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)					\
 	({							\
 		BUG_ON(!PagePrivate(page));			\
-		((struct buffer_head *)(page)->private);	\
+		((struct buffer_head *)page_private(page));	\
 	})
 #define page_has_buffers(page)	PagePrivate(page)
 
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
 {
 	page_cache_get(page);
 	SetPagePrivate(page);
-	page->private = (unsigned long)head;
+	set_page_private(page, (unsigned long)head);
 }
 
 static inline void get_bh(struct buffer_head *bh)
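The conversion above is mechanical: once `private` moves inside a union (see the mm.h hunk below), direct accesses to page->private have to go through the new accessors. A hypothetical helper, not from the patch, showing the same pattern from the caller's side as the inverse of attach_page_buffers():

```c
#include <linux/buffer_head.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical inverse of attach_page_buffers(), illustration only. */
static struct buffer_head *detach_page_buffers(struct page *page)
{
	struct buffer_head *head = NULL;

	if (PagePrivate(page)) {
		head = (struct buffer_head *)page_private(page);
		set_page_private(page, 0);	/* was: page->private = 0 */
		ClearPagePrivate(page);
		page_cache_release(page);	/* drop attach_page_buffers' reference */
	}
	return head;
}
```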
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8d1424153..8a514eca40 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -226,13 +226,18 @@ struct page {
 					 * to show when page is mapped
 					 * & limit reverse map searches.
 					 */
-	unsigned long private;		/* Mapping-private opaque data:
+	union {
+		unsigned long private;	/* Mapping-private opaque data:
 					 * usually used for buffer_heads
 					 * if PagePrivate set; used for
 					 * swp_entry_t if PageSwapCache
 					 * When page is free, this indicates
 					 * order in the buddy system.
 					 */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spinlock_t ptl;
+#endif
+	} u;
 	struct address_space *mapping;	/* If low bit clear, points to
 					 * inode address_space, or NULL.
 					 * If page mapped as anonymous
@@ -260,6 +265,9 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+#define page_private(page)		((page)->u.private)
+#define set_page_private(page, v)	((page)->u.private = (v))
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-static inline int page_count(struct page *p)
+static inline int page_count(struct page *page)
 {
-	if (PageCompound(p))
-		p = (struct page *)p->private;
-	return atomic_read(&(p)->_count) + 1;
+	if (PageCompound(page))
+		page = (struct page *)page_private(page);
+	return atomic_read(&page->_count) + 1;
 }
 
 static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-		page = (struct page *)page->private;
+		page = (struct page *)page_private(page);
 	atomic_inc(&page->_count);
 }
 
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page)
 static inline pgoff_t page_index(struct page *page)
 {
 	if (unlikely(PageSwapCache(page)))
-		return page->private;
+		return page_private(page);
 	return page->index;
 }
 
@@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page)	&((page)->u.ptl)
+#define pte_lock_init(_page)	do {					\
+	spin_lock_init(__pte_lockptr(_page));				\
+} while (0)
+#define pte_lock_deinit(page)	((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page)	do {} while (0)
+#define pte_lock_deinit(page)	do {} while (0)
+#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
 #define pte_offset_map_lock(mm, pmd, address, ptlp)	\
 ({							\
-	spinlock_t *__ptl = &(mm)->page_table_lock;	\
+	spinlock_t *__ptl = pte_lockptr(mm, pmd);	\
 	pte_t *__pte = pte_offset_map(pmd, address);	\
 	*(ptlp) = __ptl;				\
 	spin_lock(__ptl);				\
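Finally, the pte_lock_init/pte_lock_deinit pair added above is meant to bracket the lifetime of a page table page; the real call sites live in the architecture page table allocation code, outside the include/linux slice shown here. A hypothetical pairing, for illustration only:

```c
#include <linux/gfp.h>
#include <linux/mm.h>

/* Hypothetical helpers; real callers are in the arch pgtable code. */
static struct page *pte_page_alloc(void)
{
	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (page)
		pte_lock_init(page);	/* spin_lock_init(&page->u.ptl) when split */
	return page;
}

static void pte_page_free(struct page *page)
{
	pte_lock_deinit(page);	/* reset page->mapping so free_pages_check is happy */
	__free_page(page);
}
```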
