From 0d71d10a4252a3938e6b70189bc776171c02e076 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 23 Jul 2008 21:27:05 -0700 Subject: mm: remove nopfn There are no users of nopfn in the tree. Remove it. [hugh@veritas.com: fix build error] Signed-off-by: Nick Piggin Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 67 +++++++------------------------------------------------------ 1 file changed, 7 insertions(+), 60 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 2302d228fe04..46dbed4b7446 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1058,11 +1058,9 @@ static inline int use_zero_page(struct vm_area_struct *vma) if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) return 0; /* - * And if we have a fault or a nopfn routine, it's not an - * anonymous region. + * And if we have a fault routine, it's not an anonymous region. */ - return !vma->vm_ops || - (!vma->vm_ops->fault && !vma->vm_ops->nopfn); + return !vma->vm_ops || !vma->vm_ops->fault; } int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, @@ -1338,6 +1336,11 @@ out: * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return NULL. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. */ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) @@ -2501,59 +2504,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } - -/* - * do_no_pfn() tries to create a new page mapping for a page without - * a struct_page backing it - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - * - * It is expected that the ->nopfn handler always returns the same pfn - * for a given virtual mapping. - * - * Mark this `noinline' to prevent it from bloating the main pagefault code. - */ -static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) -{ - spinlock_t *ptl; - pte_t entry; - unsigned long pfn; - - pte_unmap(page_table); - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); - BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - - pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); - - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); - - if (unlikely(pfn == NOPFN_OOM)) - return VM_FAULT_OOM; - else if (unlikely(pfn == NOPFN_SIGBUS)) - return VM_FAULT_SIGBUS; - else if (unlikely(pfn == NOPFN_REFAULT)) - return 0; - - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - entry = pfn_pte(pfn, vma->vm_page_prot); - if (write_access) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - set_pte_at(mm, address, page_table, entry); - } - pte_unmap_unlock(page_table, ptl); - return 0; -} - /* * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. 
This enables swappable @@ -2614,9 +2564,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, pte, pmd, write_access, entry); - if (unlikely(vma->vm_ops->nopfn)) - return do_no_pfn(mm, vma, address, pte, - pmd, write_access); } return do_anonymous_page(mm, vma, address, pte, pmd, write_access); -- cgit v1.2.2 From 28b2ee20c7cba812b6f2ccf6d722cf86d00a84dc Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 23 Jul 2008 21:27:05 -0700 Subject: access_process_vm device memory infrastructure In order to be able to debug things like the X server and programs using the PPC Cell SPUs, the debugger needs to be able to access device memory through ptrace and /proc/pid/mem. This patch: Add the generic_access_phys access function and put the hooks in place to allow access_process_vm to access device or PPC Cell SPU memory. [riel@redhat.com: Add documentation for the vm_ops->access function] Signed-off-by: Rik van Riel Signed-off-by: Benjamin Herrensmidt Cc: Dave Airlie Cc: Hugh Dickins Cc: Paul Mackerras Cc: Arnd Bergmann Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 113 insertions(+), 18 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 46dbed4b7446..87350321e66f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2751,6 +2751,86 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ +#ifdef CONFIG_HAVE_IOREMAP_PROT +static resource_size_t follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + resource_size_t phys_addr = 0; + struct mm_struct *mm = vma->vm_mm; + + VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP))); + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto no_page_table; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. */ + if (pmd_huge(*pmd)) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + goto out; + + pte = *ptep; + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + phys_addr = pte_pfn(pte); + phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ + + *prot = pgprot_val(pte_pgprot(pte)); + +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return phys_addr; +no_page_table: + return 0; +} + +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + resource_size_t phys_addr; + unsigned long prot = 0; + void *maddr; + int offset = addr & (PAGE_SIZE-1); + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + phys_addr = follow_phys(vma, addr, write, &prot); + + if (!phys_addr) + return -EINVAL; + + maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); + if (write) + memcpy_toio(maddr + offset, buf, len); + else + memcpy_fromio(buf, maddr + offset, len); + iounmap(maddr); + + return len; +} +#endif + /* * Access another process' address space. 
* Source/target buffer must be kernel space, @@ -2760,7 +2840,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in { struct mm_struct *mm; struct vm_area_struct *vma; - struct page *page; void *old_buf = buf; mm = get_task_mm(tsk); @@ -2772,28 +2851,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in while (len) { int bytes, ret, offset; void *maddr; + struct page *page = NULL; ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; - - maddr = kmap(page); - if (write) { - copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); - set_page_dirty_lock(page); + if (ret <= 0) { + /* + * Check if this is a VM_IO | VM_PFNMAP VMA, which + * we can access using slightly different code. + */ +#ifdef CONFIG_HAVE_IOREMAP_PROT + vma = find_vma(mm, addr); + if (!vma) + break; + if (vma->vm_ops && vma->vm_ops->access) + ret = vma->vm_ops->access(vma, addr, buf, + len, write); + if (ret <= 0) +#endif + break; + bytes = ret; } else { - copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); } - kunmap(page); - page_cache_release(page); len -= bytes; buf += bytes; addr += bytes; -- cgit v1.2.2 From 42b7772812d15b86543a23b82bd6070eef9a08b1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 23 Jul 2008 21:27:10 -0700 Subject: mm: remove double indirection on tlb parameter to free_pgd_range() & Co The double indirection here is not needed anywhere and hence (at least) confusing. Signed-off-by: Jan Beulich Cc: Hugh Dickins Cc: Nick Piggin Cc: Christoph Lameter Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "Luck, Tony" Cc: Paul Mundt Cc: "David S. Miller" Acked-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 87350321e66f..82f3f1c5cf17 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -61,6 +61,8 @@ #include #include +#include "internal.h" + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, * * Must be called with pagetable lock held. 
*/ -void free_pgd_range(struct mmu_gather **tlb, +void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather *tlb, return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { -- cgit v1.2.2 From 04f2cbe35699d22dbf428373682ead85ca1240f5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:25 -0700 Subject: hugetlb: guarantee that COW faults for a process that called mmap(MAP_PRIVATE) on hugetlbfs will succeed After patch 2 in this series, a process that successfully calls mmap() for a MAP_PRIVATE mapping will be guaranteed to successfully fault until a process calls fork(). At that point, the next write fault from the parent could fail due to COW if the child still has a reference. We only reserve pages for the parent but a copy must be made to avoid leaking data from the parent to the child after fork(). Reserves could be taken for both parent and child at fork time to guarantee faults but if the mapping is large it is highly likely we will not have sufficient pages for the reservation, and it is common to fork only to exec() immediately after. A failure here would be very undesirable. Note that the current behaviour of mainline with MAP_PRIVATE pages is pretty bad. The following situation is allowed to occur today. 1. Process calls mmap(MAP_PRIVATE) 2. Process calls mlock() to fault all pages and makes sure it succeeds 3. Process forks() 4. Process writes to MAP_PRIVATE mapping while child still exists 5. If the COW fails at this point, the process gets SIGKILLed even though it had taken care to ensure the pages existed This patch improves the situation by guaranteeing the reliability of the process that successfully calls mmap(). When the parent performs COW, it will try to satisfy the allocation without using reserves. If that fails the parent will steal the page leaving any children without a page. Faults from the child after that point will result in failure. If the child COW happens first, an attempt will be made to allocate the page without reserves and the child will get SIGKILLed on failure. To summarise the new behaviour: 1. If the original mapper performs COW on a private mapping with multiple references, it will attempt to allocate a hugepage from the pool or the buddy allocator without using the existing reserves. On fail, VMAs mapping the same area are traversed and the page being COW'd is unmapped where found. It will then steal the original page as the last mapper in the normal way. 2. The VMAs the pages were unmapped from are flagged to note that pages with data no longer exist. Future no-page faults on those VMAs will terminate the process as otherwise it would appear that data was corrupted. A warning is printed to the console that this situation occurred. 3.
If the child performs COW first, it will attempt to satisfy the COW from the pool if there are enough pages or via the buddy allocator if overcommit is allowed and the buddy allocator can satisfy the request. If it fails, the child will be killed. If the pool is large enough, existing applications will not notice that the reserves were a factor. Existing applications that depend on no reserves being taken are unlikely to exist, as for much of the history of hugetlbfs, pages were prefaulted at mmap(), allocating the pages at that point or failing the mmap(). [npiggin@suse.de: fix CONFIG_HUGETLB=n build] Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 82f3f1c5cf17..72932489a082 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -901,7 +901,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } if (unlikely(is_vm_hugetlb_page(vma))) { - unmap_hugepage_range(vma, start, end); + unmap_hugepage_range(vma, start, end, NULL); zap_work -= (end - start) / (HPAGE_SIZE / PAGE_SIZE); start = end; -- cgit v1.2.2 From a5516438959d90b071ff0a484ce4f3f523dc3152 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:41 -0700 Subject: hugetlb: modular state for hugetlb page size The goal of this patchset is to support multiple hugetlb page sizes. This is achieved by introducing a new struct hstate structure, which encapsulates the important hugetlb state and constants (e.g. huge page size, number of huge pages currently allocated, etc). The hstate structure is then passed around the code which requires these fields; they will do the right thing regardless of the exact hstate they are operating on. This patch adds the hstate structure, with a single global instance of it (default_hstate), and does the basic work of converting hugetlb to use the hstate. Future patches will add more hstate structures to allow for different hugetlbfs mounts to have different page sizes. [akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 72932489a082..c1c1d6d8c22b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -903,7 +903,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, if (unlikely(is_vm_hugetlb_page(vma))) { unmap_hugepage_range(vma, start, end, NULL); zap_work -= (end - start) / - (HPAGE_SIZE / PAGE_SIZE); + pages_per_huge_page(hstate_vma(vma)); start = end; } else start = unmap_page_range(*tlbp, vma, -- cgit v1.2.2 From a137e1cc6d6e7d315fef03962a2a5a113348b13b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:43 -0700 Subject: hugetlbfs: per mount huge page sizes Add the ability to configure the hugetlb hstate used on a per mount basis. - Add a new pagesize= option to the hugetlbfs mount that allows setting the page size - This option causes the mount code to find the hstate corresponding to the specified size, and sets up a pointer to the hstate in the mount's superblock.
- Change the hstate accessors to use this information rather than the global_hstate they were using (requires a slight change in mm/memory.c so we don't NULL deref in the error-unmap path -- see comments). [np: take hstate out of hugetlbfs inode and vma->vm_private_data] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index c1c1d6d8c22b..02fc6b1047b0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -901,9 +901,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } if (unlikely(is_vm_hugetlb_page(vma))) { - unmap_hugepage_range(vma, start, end, NULL); - zap_work -= (end - start) / + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of do_mmap_pgoff. When + * hugetlbfs ->mmap method fails, + * do_mmap_pgoff() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ + if (vma->vm_file) { + unmap_hugepage_range(vma, start, end, NULL); + zap_work -= (end - start) / pages_per_huge_page(hstate_vma(vma)); + } + start = end; } else start = unmap_page_range(*tlbp, vma, -- cgit v1.2.2 From ceb868796181dc95ea01a110e123afd391639873 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:50 -0700 Subject: hugetlb: introduce pud_huge Straightforward extensions for huge pages located in the PUD instead of PMDs. Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 02fc6b1047b0..262e3eb6601a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -998,19 +998,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page_table; pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) + if (pud_none(*pud)) + goto no_page_table; + if (pud_huge(*pud)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pud_bad(*pud))) goto no_page_table; - + pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } - if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -1567,6 +1572,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long next; int err; + BUG_ON(pud_huge(*pud)); + pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; -- cgit v1.2.2
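
The listing above is limited to mm/memory.c, so the driver-facing half of the access_process_vm device memory patch is not shown here. Below is a minimal, hypothetical sketch of how a driver that maps device memory might opt in to the new ->access hook; it is not code from these patches. The mydev_* names and the MYDEV_MMIO_PHYS base address are invented, and the sketch assumes an architecture that selects CONFIG_HAVE_IOREMAP_PROT plus the include/linux/mm.h side of the patch, which declares generic_access_phys() and adds the ->access member to struct vm_operations_struct.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>

#define MYDEV_MMIO_PHYS	0xfe000000UL	/* invented physical base address, for illustration only */

static struct vm_operations_struct mydev_vm_ops = {
#ifdef CONFIG_HAVE_IOREMAP_PROT
	.access	= generic_access_phys,	/* consulted by access_process_vm() when get_user_pages() fails */
#endif
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	vma->vm_ops = &mydev_vm_ops;
	/*
	 * generic_access_phys() refuses VMAs without VM_IO | VM_PFNMAP;
	 * remap_pfn_range() sets both, the explicit OR is only for clarity.
	 */
	vma->vm_flags |= VM_IO | VM_PFNMAP;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	return remap_pfn_range(vma, vma->vm_start,
			       MYDEV_MMIO_PHYS >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}

static const struct file_operations mydev_fops = {
	.owner	= THIS_MODULE,
	.mmap	= mydev_mmap,
};

Note that access_process_vm() only falls back to vma->vm_ops->access() when get_user_pages() fails on the address (the ret <= 0 branch in the diff above), so ordinary struct-page-backed mappings keep using the existing kmap()/copy_to_user_page() path and are unaffected by a driver installing this hook.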