diff options
author | Hugh Dickins <hugh@veritas.com> | 2005-10-29 21:16:40 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-10-30 00:40:42 -0400 |
commit | 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 (patch) | |
tree | 1f76d33bb1d76221c6424bc5fed080a4f91349a6 /include/linux/mm.h | |
parent | b38c6845b695141259019e2b7c0fe6c32a6e720d (diff) |
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now, enable it by config at some number of CPUs: since the Kconfig
language doesn't support inequalities, let the preprocessor compare that number
(CONFIG_SPLIT_PTLOCK_CPUS) with NR_CPUS. But I don't think it's worth making it
user-configurable: for good testing of both split and unsplit configs, split
now at 4 CPUs, and perhaps change that to 8 later.
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'include/linux/mm.h')
-rw-r--r-- | include/linux/mm.h | 46 |
1 files changed, 38 insertions, 8 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index e8d1424153bb..8a514eca40d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -226,13 +226,18 @@ struct page { | |||
226 | * to show when page is mapped | 226 | * to show when page is mapped |
227 | * & limit reverse map searches. | 227 | * & limit reverse map searches. |
228 | */ | 228 | */ |
229 | unsigned long private; /* Mapping-private opaque data: | 229 | union { |
230 | unsigned long private; /* Mapping-private opaque data: | ||
230 | * usually used for buffer_heads | 231 | * usually used for buffer_heads |
231 | * if PagePrivate set; used for | 232 | * if PagePrivate set; used for |
232 | * swp_entry_t if PageSwapCache | 233 | * swp_entry_t if PageSwapCache |
233 | * When page is free, this indicates | 234 | * When page is free, this indicates |
234 | * order in the buddy system. | 235 | * order in the buddy system. |
235 | */ | 236 | */ |
237 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | ||
238 | spinlock_t ptl; | ||
239 | #endif | ||
240 | } u; | ||
236 | struct address_space *mapping; /* If low bit clear, points to | 241 | struct address_space *mapping; /* If low bit clear, points to |
237 | * inode address_space, or NULL. | 242 | * inode address_space, or NULL. |
238 | * If page mapped as anonymous | 243 | * If page mapped as anonymous |
@@ -260,6 +265,9 @@ struct page { | |||
260 | #endif /* WANT_PAGE_VIRTUAL */ | 265 | #endif /* WANT_PAGE_VIRTUAL */ |
261 | }; | 266 | }; |
262 | 267 | ||
268 | #define page_private(page) ((page)->u.private) | ||
269 | #define set_page_private(page, v) ((page)->u.private = (v)) | ||
270 | |||
263 | /* | 271 | /* |
264 | * FIXME: take this include out, include page-flags.h in | 272 | * FIXME: take this include out, include page-flags.h in |
265 | * files which need it (119 of them) | 273 | * files which need it (119 of them) |
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *)); | |||
311 | 319 | ||
312 | #ifdef CONFIG_HUGETLB_PAGE | 320 | #ifdef CONFIG_HUGETLB_PAGE |
313 | 321 | ||
314 | static inline int page_count(struct page *p) | 322 | static inline int page_count(struct page *page) |
315 | { | 323 | { |
316 | if (PageCompound(p)) | 324 | if (PageCompound(page)) |
317 | p = (struct page *)p->private; | 325 | page = (struct page *)page_private(page); |
318 | return atomic_read(&(p)->_count) + 1; | 326 | return atomic_read(&page->_count) + 1; |
319 | } | 327 | } |
320 | 328 | ||
321 | static inline void get_page(struct page *page) | 329 | static inline void get_page(struct page *page) |
322 | { | 330 | { |
323 | if (unlikely(PageCompound(page))) | 331 | if (unlikely(PageCompound(page))) |
324 | page = (struct page *)page->private; | 332 | page = (struct page *)page_private(page); |
325 | atomic_inc(&page->_count); | 333 | atomic_inc(&page->_count); |
326 | } | 334 | } |
327 | 335 | ||
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page) | |||
587 | static inline pgoff_t page_index(struct page *page) | 595 | static inline pgoff_t page_index(struct page *page) |
588 | { | 596 | { |
589 | if (unlikely(PageSwapCache(page))) | 597 | if (unlikely(PageSwapCache(page))) |
590 | return page->private; | 598 | return page_private(page); |
591 | return page->index; | 599 | return page->index; |
592 | } | 600 | } |
593 | 601 | ||
@@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a | |||
779 | } | 787 | } |
780 | #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ | 788 | #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ |
781 | 789 | ||
790 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | ||
791 | /* | ||
792 | * We tuck a spinlock to guard each pagetable page into its struct page, | ||
793 | * at page->u.ptl (unioned with private), with BUILD_BUG_ON to make sure that this will not | ||
794 | * overflow into the next struct page (as it might with DEBUG_SPINLOCK). | ||
795 | * When freeing, reset page->mapping so free_pages_check won't complain. | ||
796 | */ | ||
797 | #define __pte_lockptr(page) &((page)->u.ptl) | ||
798 | #define pte_lock_init(_page) do { \ | ||
799 | spin_lock_init(__pte_lockptr(_page)); \ | ||
800 | } while (0) | ||
801 | #define pte_lock_deinit(page) ((page)->mapping = NULL) | ||
802 | #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) | ||
803 | #else | ||
804 | /* | ||
805 | * We use mm->page_table_lock to guard all pagetable pages of the mm. | ||
806 | */ | ||
807 | #define pte_lock_init(page) do {} while (0) | ||
808 | #define pte_lock_deinit(page) do {} while (0) | ||
809 | #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) | ||
810 | #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ | ||
811 | |||
782 | #define pte_offset_map_lock(mm, pmd, address, ptlp) \ | 812 | #define pte_offset_map_lock(mm, pmd, address, ptlp) \ |
783 | ({ \ | 813 | ({ \ |
784 | spinlock_t *__ptl = &(mm)->page_table_lock; \ | 814 | spinlock_t *__ptl = pte_lockptr(mm, pmd); \ |
785 | pte_t *__pte = pte_offset_map(pmd, address); \ | 815 | pte_t *__pte = pte_offset_map(pmd, address); \ |
786 | *(ptlp) = __ptl; \ | 816 | *(ptlp) = __ptl; \ |
787 | spin_lock(__ptl); \ | 817 | spin_lock(__ptl); \ |