author		Nick Piggin <npiggin@suse.de>		2007-10-16 04:24:40 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-16 12:42:53 -0400
commit		557ed1fa2620dc119adb86b34c614e152a629a80 (patch)
tree		d00b31a7f197583c2bd8fffa1fd135fbbb5d6abc /mm/memory.c
parent		aadb4bc4a1f9108c1d0fbd121827c936c2ed4217 (diff)
remove ZERO_PAGE
The commit b5810039a54e5babf428e9a1e89fc1940fabff11 contains the note

  A last caveat: the ZERO_PAGE is now refcounted and managed with rmap
  (and thus mapcounted and count towards shared rss).  These writes to
  the struct page could cause excessive cacheline bouncing on big
  systems.  There are a number of ways this could be addressed if it is
  an issue.

And indeed this cacheline bouncing has shown up on large SGI systems.
There was a situation where an Altix system was essentially livelocked
tearing down ZERO_PAGE pagetables when an HPC app aborted during startup.

This situation can be avoided in userspace, but it does highlight the
potential scalability problem with refcounting ZERO_PAGE, and corner
cases where it can really hurt (we don't want the system to livelock!).

There are several broad ways to fix this problem:

1. add back some special casing to avoid refcounting ZERO_PAGE
2. per-node or per-cpu ZERO_PAGES
3. remove the ZERO_PAGE completely

I will argue for 3.  The others should also fix the problem, but they
result in more complex code than does 3, with little or no real benefit
that I can see.

Why?  Inserting a ZERO_PAGE for anonymous read faults appears to be a
false optimisation: if an application is performance critical, it would
not be doing many read faults of new memory, or at least it could be
expected to write to that memory soon afterwards.  If cache or memory use
is critical, it should not be working with a significant number of
ZERO_PAGEs anyway (a more compact representation of zeroes should be
used).

As a sanity check -- measuring on my desktop system, there are never many
mappings to the ZERO_PAGE (e.g. 2 or 3), thus memory usage here should not
increase much without it.

When running a make -j4 kernel compile on my dual core system, there are
about 1,000 mappings to the ZERO_PAGE created per second, but about 1,000
ZERO_PAGE COW faults per second (less than 1 ZERO_PAGE mapping per second
is torn down without being COWed).  So removing ZERO_PAGE will save 1,000
page faults per second when running kbuild, while keeping it only saves
less than 1 page clearing operation per second.  One page clear is
presumably cheaper than a thousand faults, so there isn't an obvious loss.

Neither the logical argument nor these basic tests give a guarantee of no
regressions.  However, this is a reasonable opportunity to try to remove
the ZERO_PAGE from the pagefault path.  If it is found to cause
regressions, we can reintroduce it and just avoid refcounting it.

The /dev/zero ZERO_PAGE usage and TLB tricks also get nuked.  I don't see
much use to them except on benchmarks.  All other users of ZERO_PAGE are
converted just to use ZERO_PAGE(0) for simplicity.  We can look at
replacing them all and maybe ripping out ZERO_PAGE completely when we are
more satisfied with this solution.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus "snif" Torvalds <torvalds@linux-foundation.org>
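[Editor's illustration, not part of the commit] The access pattern the argument above turns on -- an anonymous read fault followed shortly by a write -- is easy to reproduce from userspace.  A minimal sketch, assuming 4 KiB pages: on a kernel that still maps ZERO_PAGE for read faults, each page below costs a read fault plus a later COW fault; with this patch the read fault allocates the private zeroed page directly and the write then completes without another fault.

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	#define PAGE_SZ 4096	/* assumed page size for the illustration */

	int main(void)
	{
		size_t len = 1024 * PAGE_SZ;
		volatile char *p;
		size_t off;

		p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		for (off = 0; off < len; off += PAGE_SZ) {
			char c = p[off];	/* read fault on an untouched page  */
			p[off] = c + 1;		/* ...written to shortly afterwards */
		}

		munmap((void *)p, len);
		return 0;
	}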
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	151
1 files changed, 21 insertions(+), 130 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index f82b359b2745..2a8430844b6d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -966,7 +966,7 @@ no_page_table:
 	 * has touched so far, we don't want to allocate page tables.
 	 */
 	if (flags & FOLL_ANON) {
-		page = ZERO_PAGE(address);
+		page = ZERO_PAGE(0);
 		if (flags & FOLL_GET)
 			get_page(page);
 		BUG_ON(flags & FOLL_WRITE);
@@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 }
 EXPORT_SYMBOL(get_user_pages);
 
-static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
-			unsigned long addr, unsigned long end, pgprot_t prot)
-{
-	pte_t *pte;
-	spinlock_t *ptl;
-	int err = 0;
-
-	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
-	if (!pte)
-		return -EAGAIN;
-	arch_enter_lazy_mmu_mode();
-	do {
-		struct page *page = ZERO_PAGE(addr);
-		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
-
-		if (unlikely(!pte_none(*pte))) {
-			err = -EEXIST;
-			pte++;
-			break;
-		}
-		page_cache_get(page);
-		page_add_file_rmap(page);
-		inc_mm_counter(mm, file_rss);
-		set_pte_at(mm, addr, pte, zero_pte);
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(pte - 1, ptl);
-	return err;
-}
-
-static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
-			unsigned long addr, unsigned long end, pgprot_t prot)
-{
-	pmd_t *pmd;
-	unsigned long next;
-	int err;
-
-	pmd = pmd_alloc(mm, pud, addr);
-	if (!pmd)
-		return -EAGAIN;
-	do {
-		next = pmd_addr_end(addr, end);
-		err = zeromap_pte_range(mm, pmd, addr, next, prot);
-		if (err)
-			break;
-	} while (pmd++, addr = next, addr != end);
-	return err;
-}
-
-static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
-			unsigned long addr, unsigned long end, pgprot_t prot)
-{
-	pud_t *pud;
-	unsigned long next;
-	int err;
-
-	pud = pud_alloc(mm, pgd, addr);
-	if (!pud)
-		return -EAGAIN;
-	do {
-		next = pud_addr_end(addr, end);
-		err = zeromap_pmd_range(mm, pud, addr, next, prot);
-		if (err)
-			break;
-	} while (pud++, addr = next, addr != end);
-	return err;
-}
-
-int zeromap_page_range(struct vm_area_struct *vma,
-			unsigned long addr, unsigned long size, pgprot_t prot)
-{
-	pgd_t *pgd;
-	unsigned long next;
-	unsigned long end = addr + size;
-	struct mm_struct *mm = vma->vm_mm;
-	int err;
-
-	BUG_ON(addr >= end);
-	pgd = pgd_offset(mm, addr);
-	flush_cache_range(vma, addr, end);
-	do {
-		next = pgd_addr_end(addr, end);
-		err = zeromap_pud_range(mm, pgd, addr, next, prot);
-		if (err)
-			break;
-	} while (pgd++, addr = next, addr != end);
-	return err;
-}
-
 pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
 {
 	pgd_t * pgd = pgd_offset(mm, addr);
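[Editor's note] With zeromap_page_range() and its helpers gone, private mmaps of /dev/zero become ordinary anonymous mappings populated lazily by do_anonymous_page().  The matching change lives in drivers/char/mem.c and is not part of this file's diff; reconstructed from memory (treat as a sketch, not the actual hunk), its mmap handler reduces to roughly:

	/*
	 * Rough reconstruction of mmap_zero() after this series: shared
	 * mappings are still backed by shmem, private mappings need no
	 * setup at all and are faulted in as anonymous memory.
	 */
	static int mmap_zero(struct file *file, struct vm_area_struct *vma)
	{
		if (vma->vm_flags & VM_SHARED)
			return shmem_zero_setup(vma);
		return 0;
	}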
@@ -1717,16 +1628,11 @@ gotten:
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	if (old_page == ZERO_PAGE(address)) {
-		new_page = alloc_zeroed_user_highpage_movable(vma, address);
-		if (!new_page)
-			goto oom;
-	} else {
-		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-		if (!new_page)
-			goto oom;
-		cow_user_page(new_page, old_page, address, vma);
-	}
+	VM_BUG_ON(old_page == ZERO_PAGE(0));
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (!new_page)
+		goto oom;
+	cow_user_page(new_page, old_page, address, vma);
 
 	/*
 	 * Re-check the pte - we dropped the lock
@@ -2252,39 +2158,24 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	if (write_access) {
-		/* Allocate our own private page. */
-		pte_unmap(page_table);
-
-		if (unlikely(anon_vma_prepare(vma)))
-			goto oom;
-		page = alloc_zeroed_user_highpage_movable(vma, address);
-		if (!page)
-			goto oom;
-
-		entry = mk_pte(page, vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-
-		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-		if (!pte_none(*page_table))
-			goto release;
-		inc_mm_counter(mm, anon_rss);
-		lru_cache_add_active(page);
-		page_add_new_anon_rmap(page, vma, address);
-	} else {
-		/* Map the ZERO_PAGE - vm_page_prot is readonly */
-		page = ZERO_PAGE(address);
-		page_cache_get(page);
-		entry = mk_pte(page, vma->vm_page_prot);
-
-		ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		if (!pte_none(*page_table))
-			goto release;
-		inc_mm_counter(mm, file_rss);
-		page_add_file_rmap(page);
-	}
-
+	/* Allocate our own private page. */
+	pte_unmap(page_table);
+
+	if (unlikely(anon_vma_prepare(vma)))
+		goto oom;
+	page = alloc_zeroed_user_highpage_movable(vma, address);
+	if (!page)
+		goto oom;
+
+	entry = mk_pte(page, vma->vm_page_prot);
+	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!pte_none(*page_table))
+		goto release;
+	inc_mm_counter(mm, anon_rss);
+	lru_cache_add_active(page);
+	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
 	/* No need to invalidate - it was non-present before */
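[Editor's note] One consequence of the unified do_anonymous_page() path above is visible in accounting: pages instantiated by read faults are now charged to anon_rss and get an anonymous rmap, where before they were ZERO_PAGE mappings charged to file_rss.  A rough userspace probe of that difference, hypothetical and not part of the patch, assuming 4 KiB pages and that /proc/self/statm reports resident and shared pages in its second and third fields: with this patch the growth from read-only touches should show up as resident anonymous pages, while a ZERO_PAGE kernel would report it in the shared (file-backed) column instead.

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	#define PAGE_SZ 4096	/* assumed page size for the illustration */

	/* Print the resident and shared page counts from /proc/self/statm. */
	static void print_statm(const char *when)
	{
		unsigned long size, resident, shared;
		FILE *f = fopen("/proc/self/statm", "r");

		if (!f)
			return;
		if (fscanf(f, "%lu %lu %lu", &size, &resident, &shared) == 3)
			printf("%s: resident=%lu shared=%lu (pages)\n",
			       when, resident, shared);
		fclose(f);
	}

	int main(void)
	{
		size_t len = 1024 * PAGE_SZ;
		size_t off;
		volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;

		print_statm("before read faults");
		for (off = 0; off < len; off += PAGE_SZ)
			(void)p[off];		/* read-only touch of each page */
		print_statm("after read faults");

		munmap((void *)p, len);
		return 0;
	}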