From 6aab341e0a28aff100a09831c5300a2994b8b986 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Mon, 28 Nov 2005 14:34:23 -0800
Subject: mm: re-architect the VM_UNPAGED logic

This replaces the (in my opinion horrible) VM_UNMAPPED logic with very
explicit support for a "remapped page range" aka VM_PFNMAP.  It allows a
VM area to contain an arbitrary range of page table entries that the VM
never touches, and never considers to be normal pages.

Any user of "remap_pfn_range()" automatically gets this new
functionality, and doesn't even have to mark the pages reserved or
indeed mark them any other way.  It just works.  As a side effect, doing
mmap() on /dev/mem works for arbitrary ranges.

Sparc update from David in the next commit.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 189 ++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 100 insertions(+), 89 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index d1f46f4e4c8a..b57fbc636058 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 }
 
 /*
- * This function is called to print an error when a pte in a
- * !VM_UNPAGED region is found pointing to an invalid pfn (which
- * is an error.
+ * This function is called to print an error when a bad pte
+ * is found. For example, we might have a PFN-mapped pte in
+ * a region that doesn't allow it.
  *
  * The calling function must still handle the error.
  */
@@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
 }
 
 /*
- * page_is_anon applies strict checks for an anonymous page belonging to
- * this vma at this address.  It is used on VM_UNPAGED vmas, which are
- * usually populated with shared originals (which must not be counted),
- * but occasionally contain private COWed copies (when !VM_SHARED, or
- * perhaps via ptrace when VM_SHARED).  An mmap of /dev/mem might window
- * free pages, pages from other processes, or from other parts of this:
- * it's tricky, but try not to be deceived by foreign anonymous pages.
+ * This function gets the "struct page" associated with a pte.
+ *
+ * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
+ * will have each page table entry just pointing to a raw page frame
+ * number, and as far as the VM layer is concerned, those do not have
+ * pages associated with them - even if the PFN might point to memory
+ * that otherwise is perfectly fine and has a "struct page".
+ *
+ * The way we recognize those mappings is through the rules set up
+ * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
+ * and the vm_pgoff will point to the first PFN mapped: thus every
+ * page that is a raw mapping will always honor the rule
+ *
+ *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
+ *
+ * and if that isn't true, the page has been COW'ed (in which case it
+ * _does_ have a "struct page" associated with it even if it is in a
+ * VM_PFNMAP range).
  */
-static inline int page_is_anon(struct page *page,
-			struct vm_area_struct *vma, unsigned long addr)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
-	return page && PageAnon(page) && page_mapped(page) &&
-		page_address_in_vma(page, vma) == addr;
+	unsigned long pfn = pte_pfn(pte);
+
+	if (vma->vm_flags & VM_PFNMAP) {
+		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
+		if (pfn == vma->vm_pgoff + off)
+			return NULL;
+	}
+
+	/*
+	 * Add some anal sanity checks for now. Eventually,
+	 * we should just do "return pfn_to_page(pfn)", but
+	 * in the meantime we check that we get a valid pfn,
+	 * and that the resulting page looks ok.
+	 *
+	 * Remove this test eventually!
+	 */
+	if (unlikely(!pfn_valid(pfn))) {
+		print_bad_pte(vma, pte, addr);
+		return NULL;
+	}
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page 
+	 * tables. 
+	 *
+	 * The PAGE_ZERO() pages and various VDSO mappings can
+	 * cause them to exist.
+	 */
+	return pfn_to_page(pfn);
 }
 
 /*
@@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
-	unsigned long pfn;
 
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
@@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}
 
-	pfn = pte_pfn(pte);
-	page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
-
-	if (unlikely(vm_flags & VM_UNPAGED))
-		if (!page_is_anon(page, vma, addr))
-			goto out_set_pte;
-
-	/*
-	 * If the pte points outside of valid memory but
-	 * the region is not VM_UNPAGED, we have a problem.
-	 */
-	if (unlikely(!page)) {
-		print_bad_pte(vma, pte, addr);
-		goto out_set_pte; /* try to do something sane */
-	}
-
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (vm_flags & VM_SHARED)
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
-	get_page(page);
-	page_dup_rmap(page);
-	rss[!!PageAnon(page)]++;
+
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		get_page(page);
+		page_dup_rmap(page);
+		rss[!!PageAnon(page)]++;
+	}
 
 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
@@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) {
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		}
 		if (pte_present(ptent)) {
 			struct page *page;
-			unsigned long pfn;
 
 			(*zap_work) -= PAGE_SIZE;
 
-			pfn = pte_pfn(ptent);
-			page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
-
-			if (unlikely(vma->vm_flags & VM_UNPAGED)) {
-				if (!page_is_anon(page, vma, addr))
-					page = NULL;
-			} else if (unlikely(!page))
-				print_bad_pte(vma, ptent, addr);
-
+			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
@@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 /*
  * Do a quick page-table lookup for a single page.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int flags)
 {
 	pgd_t *pgd;
@@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 	pmd_t *pmd;
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
-	unsigned long pfn;
 	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
 
 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 	if (!IS_ERR(page)) {
@@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 		goto unlock;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
-	pfn = pte_pfn(pte);
-	if (!pfn_valid(pfn))
+	page = vm_normal_page(vma, address, pte);
+	if (unlikely(!page))
 		goto unlock;
 
-	page = pfn_to_page(pfn);
 	if (flags & FOLL_GET)
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
@@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				return i ? : -EFAULT;
 			}
 			if (pages) {
-				pages[i] = pte_page(*pte);
-				get_page(pages[i]);
+				struct page *page = vm_normal_page(vma, start, *pte);
+				pages[i] = page;
+				if (page)
+					get_page(page);
 			}
 			pte_unmap(pte);
 			if (vmas)
@@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				foll_flags |= FOLL_WRITE;
 
 			cond_resched();
-			while (!(page = follow_page(mm, start, foll_flags))) {
+			while (!(page = follow_page(vma, start, foll_flags))) {
 				int ret;
 				ret = __handle_mm_fault(mm, vma, start,
 						foll_flags & FOLL_WRITE);
@@ -1214,11 +1230,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 *	in 2.6 the LRU scan won't even find its pages, so this
 	 *	flag means no more than count its pages in reserved_vm,
 	 * 	and omit it from core dump, even when VM_IO turned off.
-	 *   VM_UNPAGED tells the core MM not to "manage" these pages
-         *	(e.g. refcount, mapcount, try to swap them out): in
-	 *	particular, zap_pte_range does not try to free them.
+	 *   VM_PFNMAP tells the core MM that the base pages are just
+	 *	raw PFN mappings, and do not have a "struct page" associated
+	 *	with them.
 	 */
-	vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+	vma->vm_pgoff = pfn;
 
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
@@ -1273,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+	/*
+	 * If the source page was a PFN mapping, we don't have
+	 * a "struct page" for it. We do a best-effort copy by
+	 * just copying from the original user address. If that
+	 * fails, we just zero-fill it. Live with it.
+	 */
+	if (unlikely(!src)) {
+		void *kaddr = kmap_atomic(dst, KM_USER0);
+		unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE);
+		if (left)
+			memset(kaddr, 0, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+		
+	}
+	copy_user_highpage(dst, src, va);
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1296,28 +1333,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		spinlock_t *ptl, pte_t orig_pte)
 {
 	struct page *old_page, *src_page, *new_page;
-	unsigned long pfn = pte_pfn(orig_pte);
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;
 
-	if (unlikely(!pfn_valid(pfn))) {
-		/*
-		 * Page table corrupted: show pte and kill process.
-		 * Or it's an attempt to COW an out-of-map VM_UNPAGED
-		 * entry, which copy_user_highpage does not support.
-		 */
-		print_bad_pte(vma, orig_pte, address);
-		ret = VM_FAULT_OOM;
-		goto unlock;
-	}
-	old_page = pfn_to_page(pfn);
+	old_page = vm_normal_page(vma, address, orig_pte);
 	src_page = old_page;
-
-	if (unlikely(vma->vm_flags & VM_UNPAGED))
-		if (!page_is_anon(old_page, vma, address)) {
-			old_page = NULL;
-			goto gotten;
-		}
+	if (!old_page)
+		goto gotten;
 
 	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
@@ -1351,7 +1373,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		copy_user_highpage(new_page, src_page, address);
+		cow_user_page(new_page, src_page, address);
 	}
 
 	/*
@@ -1812,16 +1834,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	/*
-	 * A VM_UNPAGED vma will normally be filled with present ptes
-	 * by remap_pfn_range, and never arrive here; but it might have
-	 * holes, or if !VM_DONTEXPAND, mremap might have expanded it.
-	 * It's weird enough handling anon pages in unpaged vmas, we do
-	 * not want to worry about ZERO_PAGEs too (it may or may not
-	 * matter if their counts wrap): just give them anon pages.
-	 */
-
-	if (write_access || (vma->vm_flags & VM_UNPAGED)) {
+	if (write_access) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
 
@@ -1896,8 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int anon = 0;
 
 	pte_unmap(page_table);
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 		sequence = mapping->truncate_count;
@@ -1930,7 +1941,7 @@ retry:
 		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!page)
 			goto oom;
-		copy_user_highpage(page, new_page, address);
+		cow_user_page(page, new_page, address);
 		page_cache_release(new_page);
 		new_page = page;
 		anon = 1;
-- 
cgit v1.2.2


From e0f39591cc178026607fcbbe9a53be435fe8285d Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 28 Nov 2005 13:43:44 -0800
Subject: [PATCH] Workaround for gcc 2.96 (undefined references)

  LD      .tmp_vmlinux1
mm/built-in.o(.text+0x100d6): In function `copy_page_range':
: undefined reference to `__pud_alloc'
mm/built-in.o(.text+0x1010b): In function `copy_page_range':
: undefined reference to `__pmd_alloc'
mm/built-in.o(.text+0x11ef4): In function `__handle_mm_fault':
: undefined reference to `__pud_alloc'
fs/built-in.o(.text+0xc930): In function `install_arg_page':
: undefined reference to `__pud_alloc'
make: *** [.tmp_vmlinux1] Error 1

Those missing references in mm/memory.c arise from this code in
include/linux/mm.h, combined with the fact that __PGTABLE_PMD_FOLDED and
__PGTABLE_PUD_FOLDED are both set and __ARCH_HAS_4LEVEL_HACK is not:

/*
 * The following ifdef needed to get the 4level-fixup.h header to work.
 * Remove it when 4level-fixup.h has been removed.
 */
#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
        return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
                NULL: pud_offset(pgd, address);
}

static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
                NULL: pmd_offset(pud, address);
}
#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */

With my configuration the pgd_none and pud_none routines are inlines
returning a constant 0.  Apparently the old compiler avoids generating
calls to __pud_alloc and __pmd_alloc but still lists them as undefined
references in the module's symbol table.

I don't know which change caused this problem.  I think it was added
somewhere between 2.6.14 and 2.6.15-rc1, because I remember building
several 2.6.14-rc kernels without difficulty.  However I can't point to an
individual culprit.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index b57fbc636058..9ab206b829a2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2160,6 +2160,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
+#else
+/* Workaround for gcc 2.96 */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+	return 0;
+}
 #endif /* __PAGETABLE_PUD_FOLDED */
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -2188,6 +2194,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
+#else
+/* Workaround for gcc 2.96 */
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+{
+	return 0;
+}
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 int make_pages_present(unsigned long addr, unsigned long end)
-- 
cgit v1.2.2


From fa2a455b028f3b6ca4dae129c6337d7edf21f12c Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Tue, 29 Nov 2005 18:43:17 +1100
Subject: [PATCH] Fix vma argument in get_usr_pages() for gate areas

The system call gate area handling called vm_normal_page() with the
wrong vma (which was always NULL, and caused an oops).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 9ab206b829a2..6c1eac92a316 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -988,7 +988,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				return i ? : -EFAULT;
 			}
 			if (pages) {
-				struct page *page = vm_normal_page(vma, start, *pte);
+				struct page *page = vm_normal_page(gate_vma, start, *pte);
 				pages[i] = page;
 				if (page)
 					get_page(page);
-- 
cgit v1.2.2


From eca351336acb2fa943611e0846562ce3997ef53b Mon Sep 17 00:00:00 2001
From: Ben Collins <bcollins@debian.org>
Date: Tue, 29 Nov 2005 11:45:26 -0800
Subject: [PATCH] Fix missing pfn variables caused by vm changes

I image this showed up because of "unused var..." when the changes
occured, because flush_cache_page() is a noop in most places.  This
showed up for me on parisc however, where flush_cache_page() is a real
function.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 6c1eac92a316..74839b3a3999 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1345,7 +1345,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		int reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
 		if (reuse) {
-			flush_cache_page(vma, address, pfn);
+			flush_cache_page(vma, address, pte_pfn(orig_pte));
 			entry = pte_mkyoung(orig_pte);
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 			ptep_set_access_flags(vma, address, page_table, entry, 1);
@@ -1389,7 +1389,7 @@ gotten:
 			}
 		} else
 			inc_mm_counter(mm, anon_rss);
-		flush_cache_page(vma, address, pfn);
+		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		ptep_establish(vma, address, page_table, entry);
-- 
cgit v1.2.2


From 238f58d898df941aa9d1cb390fb27ff4febe8965 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Tue, 29 Nov 2005 13:01:56 -0800
Subject: Support strange discontiguous PFN remappings

These get created by some drivers that don't generally even want a pfn
remapping at all, but would really mostly prefer to just map pages
they've allocated individually instead.

For now, create a helper function that turns such an incomplete PFN
remapping call into a loop that does that explicit mapping.  In the long
run we almost certainly want to export a totally different interface for
that, though.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 74839b3a3999..990e7dc666f8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1146,6 +1146,95 @@ int zeromap_page_range(struct vm_area_struct *vma,
 	return err;
 }
 
+/*
+ * This is the old fallback for page remapping.
+ *
+ * For historical reasons, it only allows reserved pages. Only
+ * old drivers should use this, and they needed to mark their
+ * pages reserved for the old functions anyway.
+ */
+static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+{
+	int retval;
+	pgd_t * pgd;
+	pud_t * pud;
+	pmd_t * pmd;  
+	pte_t * pte;
+	spinlock_t *ptl;  
+
+	retval = -EINVAL;
+	if (PageAnon(page) || !PageReserved(page))
+		goto out;
+	retval = -ENOMEM;
+	flush_dcache_page(page);
+	pgd = pgd_offset(mm, addr);
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		goto out;
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		goto out;
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	if (!pte)
+		goto out;
+	retval = -EBUSY;
+	if (!pte_none(*pte))
+		goto out_unlock;
+
+	/* Ok, finally just insert the thing.. */
+	get_page(page);
+	inc_mm_counter(mm, file_rss);
+	page_add_file_rmap(page);
+	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+
+	retval = 0;
+out_unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return retval;
+}
+
+/*
+ * Somebody does a pfn remapping that doesn't actually work as a vma.
+ *
+ * Do it as individual pages instead, and warn about it. It's bad form,
+ * and very inefficient.
+ */
+static int incomplete_pfn_remap(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		unsigned long pfn, pgprot_t prot)
+{
+	static int warn = 10;
+	struct page *page;
+	int retval;
+
+	if (!(vma->vm_flags & VM_INCOMPLETE)) {
+		if (warn) {
+			warn--;
+			printk("%s does an incomplete pfn remapping", current->comm);
+			dump_stack();
+		}
+	}
+	vma->vm_flags |= VM_INCOMPLETE | VM_IO | VM_RESERVED;
+
+	if (start < vma->vm_start || end > vma->vm_end)
+		return -EINVAL;
+
+	if (!pfn_valid(pfn))
+		return -EINVAL;
+
+	retval = 0;
+	page = pfn_to_page(pfn);
+	while (start < end) {
+		retval = insert_page(vma->vm_mm, start, page, prot);
+		if (retval < 0)
+			break;
+		start += PAGE_SIZE;
+		page++;
+	}
+	return retval;
+}
+
 /*
  * maps a range of physical memory into the requested pages. the old
  * mappings are removed. any references to nonexistent pages results
@@ -1220,6 +1309,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	struct mm_struct *mm = vma->vm_mm;
 	int err;
 
+	if (addr != vma->vm_start || end != vma->vm_end)
+		return incomplete_pfn_remap(vma, addr, end, pfn, prot);
+
 	/*
 	 * Physically remapped pages are special. Tell the
 	 * rest of the world about it:
-- 
cgit v1.2.2


From c9cfcddfd65735437a4cb8563d6b66a6da8a5ed6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Tue, 29 Nov 2005 14:03:14 -0800
Subject: VM: add common helper function to create the page tables

This logic was duplicated four times, for no good reason.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 990e7dc666f8..74f95ae0510b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1146,6 +1146,18 @@ int zeromap_page_range(struct vm_area_struct *vma,
 	return err;
 }
 
+pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
+{
+	pgd_t * pgd = pgd_offset(mm, addr);
+	pud_t * pud = pud_alloc(mm, pgd, addr);
+	if (pud) {
+		pmd_t * pmd = pmd_alloc(mm, pgd, addr);
+		if (pmd)
+			return pte_alloc_map_lock(mm, pmd, addr, ptl);
+	}
+	return NULL;
+}
+
 /*
  * This is the old fallback for page remapping.
  *
@@ -1156,10 +1168,7 @@ int zeromap_page_range(struct vm_area_struct *vma,
 static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
 {
 	int retval;
-	pgd_t * pgd;
-	pud_t * pud;
-	pmd_t * pmd;  
-	pte_t * pte;
+	pte_t *pte;
 	spinlock_t *ptl;  
 
 	retval = -EINVAL;
@@ -1167,14 +1176,7 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
 		goto out;
 	retval = -ENOMEM;
 	flush_dcache_page(page);
-	pgd = pgd_offset(mm, addr);
-	pud = pud_alloc(mm, pgd, addr);
-	if (!pud)
-		goto out;
-	pmd = pmd_alloc(mm, pud, addr);
-	if (!pmd)
-		goto out;
-	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
 		goto out;
 	retval = -EBUSY;
-- 
cgit v1.2.2


From 5d2a2dbbc1025dbf7998b9289574d9592b8f21cc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Tue, 29 Nov 2005 14:07:55 -0800
Subject: cow_user_page: fix page alignment

High Dickins points out that the user virtual address passed to the page
fault handler isn't necessarily page-aligned.

Also, add a comment on why the copy could fail for the user address case.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 74f95ae0510b..745b3482e6c2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1394,8 +1394,15 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 	 */
 	if (unlikely(!src)) {
 		void *kaddr = kmap_atomic(dst, KM_USER0);
-		unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE);
-		if (left)
+		void __user *uaddr = (void __user *)(va & PAGE_MASK);
+
+		/*
+		 * This really shouldn't fail, because the page is there
+		 * in the page tables. But it might just be unreadable,
+		 * in which case we just give up and fill the result with
+		 * zeroes.
+		 */
+		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
 			memset(kaddr, 0, PAGE_SIZE);
 		kunmap_atomic(kaddr, KM_USER0);
 		return;
-- 
cgit v1.2.2


From e5bbe4dfc8dbfc50ef89f8641e020616d4d1e69e Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 29 Nov 2005 16:54:51 +0000
Subject: [PATCH] pfnmap: remove src_page from do_wp_page

Clean away do_wp_page's "src_page": cow_user_page makes it unnecessary.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 745b3482e6c2..ae259b6e5a21 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1433,12 +1433,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		spinlock_t *ptl, pte_t orig_pte)
 {
-	struct page *old_page, *src_page, *new_page;
+	struct page *old_page, *new_page;
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
-	src_page = old_page;
 	if (!old_page)
 		goto gotten;
 
@@ -1466,7 +1465,7 @@ gotten:
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	if (src_page == ZERO_PAGE(address)) {
+	if (old_page == ZERO_PAGE(address)) {
 		new_page = alloc_zeroed_user_highpage(vma, address);
 		if (!new_page)
 			goto oom;
@@ -1474,7 +1473,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		cow_user_page(new_page, src_page, address);
+		cow_user_page(new_page, old_page, address);
 	}
 
 	/*
-- 
cgit v1.2.2


From 325f04dbca60a4cfe4ac25e7cf246edd07eb4c5f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 29 Nov 2005 16:55:48 +0000
Subject: [PATCH] pfnmap: do_no_page BUG_ON again

Use copy_user_highpage directly instead of cow_user_page in do_no_page:
in the immediately following page_cache_release, and elsewhere, it is
assuming that new_page is normal.  If any VM_PFNMAP driver can get to
do_no_page, it's just a BUG (but not in the case of do_anonymous_page).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index ae259b6e5a21..5bfa52a98630 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2009,6 +2009,8 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int anon = 0;
 
 	pte_unmap(page_table);
+	BUG_ON(vma->vm_flags & VM_PFNMAP);
+
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 		sequence = mapping->truncate_count;
@@ -2041,7 +2043,7 @@ retry:
 		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!page)
 			goto oom;
-		cow_user_page(page, new_page, address);
+		copy_user_highpage(page, new_page, address);
 		page_cache_release(new_page);
 		new_page = page;
 		anon = 1;
-- 
cgit v1.2.2


From 49c91fb01ff3948285608c65754b3ffbf57d50f2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@fys.uio.no>
Date: Tue, 29 Nov 2005 19:27:22 -0500
Subject: [PATCH] VM: Fix typos in get_locked_pte

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 5bfa52a98630..8d10b5540c73 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1146,12 +1146,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
 	return err;
 }
 
-pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
+pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
 {
 	pgd_t * pgd = pgd_offset(mm, addr);
 	pud_t * pud = pud_alloc(mm, pgd, addr);
 	if (pud) {
-		pmd_t * pmd = pmd_alloc(mm, pgd, addr);
+		pmd_t * pmd = pmd_alloc(mm, pud, addr);
 		if (pmd)
 			return pte_alloc_map_lock(mm, pmd, addr, ptl);
 	}
-- 
cgit v1.2.2


From a145dd411eb28c83ee4bb68b66f62c326c0f764e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Wed, 30 Nov 2005 09:35:19 -0800
Subject: VM: add "vm_insert_page()" function

This is what a lot of drivers will actually want to use to insert
individual pages into a user VMA.  It doesn't have the old PageReserved
restrictions of remap_pfn_range(), and it doesn't complain about partial
remappings.

The page you insert needs to be a nice clean kernel allocation, so you
can't insert arbitrary page mappings with this, but that's not what
people want.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 8d10b5540c73..4b4fc3a7ea48 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1172,7 +1172,7 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
 	spinlock_t *ptl;  
 
 	retval = -EINVAL;
-	if (PageAnon(page) || !PageReserved(page))
+	if (PageAnon(page))
 		goto out;
 	retval = -ENOMEM;
 	flush_dcache_page(page);
@@ -1196,6 +1196,35 @@ out:
 	return retval;
 }
 
+/*
+ * This allows drivers to insert individual pages they've allocated
+ * into a user vma.
+ *
+ * The page has to be a nice clean _individual_ kernel allocation.
+ * If you allocate a compound page, you need to have marked it as
+ * such (__GFP_COMP), or manually just split the page up yourself
+ * (which is mainly an issue of doing "set_page_count(page, 1)" for
+ * each sub-page, and then freeing them one by one when you free
+ * them rather than freeing it as a compound page).
+ *
+ * NOTE! Traditionally this was done with "remap_pfn_range()" which
+ * took an arbitrary page protection parameter. This doesn't allow
+ * that. Your vma protection will have to be set up correctly, which
+ * means that if you want a shared writable mapping, you'd better
+ * ask for a shared writable mapping!
+ *
+ * The page does not need to be reserved.
+ */
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+{
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return -EFAULT;
+	if (!page_count(page))
+		return -EINVAL;
+	return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+}
+EXPORT_SYMBOL_GPL(vm_insert_page);
+
 /*
  * Somebody does a pfn remapping that doesn't actually work as a vma.
  *
@@ -1225,8 +1254,11 @@ static int incomplete_pfn_remap(struct vm_area_struct *vma,
 	if (!pfn_valid(pfn))
 		return -EINVAL;
 
-	retval = 0;
 	page = pfn_to_page(pfn);
+	if (!PageReserved(page))
+		return -EINVAL;
+
+	retval = 0;
 	while (start < end) {
 		retval = insert_page(vma->vm_mm, start, page, prot);
 		if (retval < 0)
-- 
cgit v1.2.2


From e3c3374fbf7efe9487edc53cd10436ed641983aa Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sat, 3 Dec 2005 20:48:11 -0800
Subject: Make vm_insert_page() available to NVidia module

It used to use remap_pfn_range(), which wasn't GPL-only either, and the
new interface is actually simpler and does more checking, so we
shouldn't unnecessarily discourage people from switching over.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 4b4fc3a7ea48..aa8af0e20269 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1223,7 +1223,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *
 		return -EINVAL;
 	return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
 }
-EXPORT_SYMBOL_GPL(vm_insert_page);
+EXPORT_SYMBOL(vm_insert_page);
 
 /*
  * Somebody does a pfn remapping that doesn't actually work as a vma.
-- 
cgit v1.2.2