author		Hugh Dickins <hugh@veritas.com>		2005-10-29 21:16:40 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-10-30 00:40:42 -0400
commit		4c21e2f2441dc5fbb957b030333f5a3f2d02dea7
tree		1f76d33bb1d76221c6424bc5fed080a4f91349a6 /mm/page_alloc.c
parent		b38c6845b695141259019e2b7c0fe6c32a6e720d
[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.
This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock. (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
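Roughly, the layout and the overflow check take the following shape. This is a sketch only: the field name ptl matches later kernels, but the exact union layout and where the BUILD_BUG_ON lands are assumptions.

struct page {
        unsigned long flags;
        atomic_t _count;
        union {
                unsigned long private; /* unused while the page holds ptes */
                spinlock_t ptl;        /* split page table lock (assumed layout) */
        };
        /* ... */
};

static inline void pte_lock_size_check(void) /* hypothetical placement */
{
        /* Fails the build where a debug-bloated spinlock_t outgrows its
         * slot, e.g. 32-bit PA-RISC with spinlock debugging enabled. */
        BUILD_BUG_ON(sizeof(spinlock_t) > sizeof(unsigned long));
}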
Splitting the lock is not quite for free: another cacheline access. Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS. But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
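Concretely, the option is a plain integer and the comparison happens in the preprocessor; CONFIG_SPLIT_PTLOCK_CPUS is the real option name, while the pte_lock_init switch below is a sketch:

/* mm/Kconfig supplies only a number (default "4"), since the Kconfig
 * language cannot express "NR_CPUS >= 4"; cpp does the comparison: */
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
#define pte_lock_init(page)	spin_lock_init(&(page)->ptl)
#else
#define pte_lock_init(page)	do {} while (0)
#endif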
There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0541288ebf4b..a2995a5d012c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -154,7 +154,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
                 struct page *p = page + i;
 
                 SetPageCompound(p);
-                p->private = (unsigned long)page;
+                set_page_private(p, (unsigned long)page);
         }
 }
 
@@ -174,7 +174,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 
                 if (!PageCompound(p))
                         bad_page(__FUNCTION__, page);
-                if (p->private != (unsigned long)page)
+                if (page_private(p) != (unsigned long)page)
                         bad_page(__FUNCTION__, page);
                 ClearPageCompound(p);
         }
@@ -187,18 +187,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
  * So, we don't need atomic page->flags operations here.
  */
 static inline unsigned long page_order(struct page *page) {
-        return page->private;
+        return page_private(page);
 }
 
 static inline void set_page_order(struct page *page, int order) {
-        page->private = order;
+        set_page_private(page, order);
         __SetPagePrivate(page);
 }
 
 static inline void rmv_page_order(struct page *page)
 {
         __ClearPagePrivate(page);
-        page->private = 0;
+        set_page_private(page, 0);
 }
 
 /*
@@ -238,7 +238,7 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (a) the buddy is free &&
  * (b) the buddy is on the buddy system &&
  * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
@@ -264,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order)
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
  * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other. That is, if we allocate a small block, and both were
  * free, the remainder of the region must be split into blocks.
@@ -463,7 +463,7 @@ static void prep_new_page(struct page *page, int order)
         page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                         1 << PG_referenced | 1 << PG_arch_1 |
                         1 << PG_checked | 1 << PG_mappedtodisk);
-        page->private = 0;
+        set_page_private(page, 0);
         set_page_refs(page, order);
         kernel_map_pages(page, 1 << order, 1);
 }
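For reference, the page_private()/set_page_private() accessors this diff switches to simply wrap the same struct page field, so its storage can later share space with the new ptl without touching every user; a minimal sketch of their shape, assuming the definition used by later kernels:

/* Sketch (assumed shape, not quoted from the patch): */
#define page_private(page)		((page)->private)
#define set_page_private(page, v)	((page)->private = (v))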