author     Hugh Dickins <hugh@veritas.com>        2005-10-29 21:16:33 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>  2005-10-30 00:40:41 -0400
commit     deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70
tree       2a722f50e8edef8609a49f65bfcb222e499c44cc
parent     c34d1b4d165c67b966bca4aba026443d7ff161eb
[PATCH] mm: follow_page with inner ptlock
Final step in pushing down common core's page_table_lock.  follow_page
no longer wants caller to hold page_table_lock, uses pte_offset_map_lock
itself; and so no page_table_lock is taken in get_user_pages itself.

But get_user_pages (and get_futex_key) do then need follow_page to pin
the page for them: take Daniel's suggestion of bitflags to follow_page.

Need one for WRITE, another for TOUCH (it was the accessed flag before:
vanished along with check_user_page_readable, but surely get_numa_maps
is wrong to mark every page it finds as accessed), another for GET.

And another, ANON to dispose of untouched_anonymous_page: it seems silly
for that to descend a second time, let follow_page observe if there was
no page table and return ZERO_PAGE if so.  Fix minor bug in that: check
VM_LOCKED - make_pages_present ought to make readonly anonymous present.

Give get_numa_maps a cond_resched while we're there.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--	fs/proc/task_mmu.c	3
-rw-r--r--	include/linux/mm.h	20
-rw-r--r--	kernel/futex.c	6
-rw-r--r--	mm/memory.c	152
-rw-r--r--	mm/nommu.c	3
5 files changed, 88 insertions, 96 deletions
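Illustrative note (not part of the patch): a minimal sketch of how a caller uses the flag-based follow_page() introduced below, modelled on the get_futex_key() hunk in kernel/futex.c. The helper name lookup_user_pgoff and its use are hypothetical, and it is assumed the caller already holds the mm's mmap_sem for read, as get_user_pages() and get_futex_key() do.

#include <linux/errno.h>
#include <linux/mm.h>

/*
 * Sketch only: look up one user page and read its pagecache index.
 * No page_table_lock is taken here; follow_page() now takes the pte
 * lock internally via pte_offset_map_lock().
 */
static int lookup_user_pgoff(struct mm_struct *mm, unsigned long addr,
			     unsigned long *pgoff)
{
	struct page *page;

	/* FOLL_TOUCH marks the page accessed, FOLL_GET pins it for us. */
	page = follow_page(mm, addr, FOLL_TOUCH | FOLL_GET);
	if (!page)
		return -EFAULT;

	*pgoff = page->index;	/* stable while we hold the reference */
	put_page(page);		/* drop the reference taken by FOLL_GET */
	return 0;
}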
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7e5e7ec2e36..d2fa42006d8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 	for_each_node(i)
 		md->node[i] = 0;

-	spin_lock(&mm->page_table_lock);
 	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
 		page = follow_page(mm, vaddr, 0);
 		if (page) {
@@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 				md->anon++;
 			md->node[page_to_nid(page)]++;
 		}
+		cond_resched();
 	}
-	spin_unlock(&mm->page_table_lock);
 	return md;
 }

diff --git a/include/linux/mm.h b/include/linux/mm.h
index aa8de20e2e8..e8d1424153b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }

-extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
-
-extern struct page * vmalloc_to_page(void *addr);
-extern unsigned long vmalloc_to_pfn(void *addr);
-extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
-			int write);
-int remap_pfn_range(struct vm_area_struct *, unsigned long,
-		unsigned long, unsigned long, pgprot_t);
+struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+struct page *vmalloc_to_page(void *addr);
+unsigned long vmalloc_to_pfn(void *addr);
+int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+			unsigned long pfn, unsigned long size, pgprot_t);
+
+struct page *follow_page(struct mm_struct *, unsigned long address,
+			unsigned int foll_flags);
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_TOUCH	0x02	/* mark page accessed */
+#define FOLL_GET	0x04	/* do get_page on page */
+#define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */

 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
diff --git a/kernel/futex.c b/kernel/futex.c
index ca05fe6a70b..3b4d5ad44cc 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	/*
 	 * Do a quick atomic lookup first - this is the fastpath.
 	 */
-	spin_lock(&current->mm->page_table_lock);
-	page = follow_page(mm, uaddr, 0);
+	page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
 	if (likely(page != NULL)) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		spin_unlock(&current->mm->page_table_lock);
+		put_page(page);
 		return 0;
 	}
-	spin_unlock(&current->mm->page_table_lock);

 	/*
 	 * Do it the general way.
diff --git a/mm/memory.c b/mm/memory.c
index 51f7c0a220d..8461e2dd91d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,

 /*
  * Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int flags)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep, pte;
+	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;

-	page = follow_huge_addr(mm, address, write);
-	if (! IS_ERR(page))
-		return page;
+	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+	if (!IS_ERR(page)) {
+		BUG_ON(flags & FOLL_GET);
+		goto out;
+	}

+	page = NULL;
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto out;
+		goto no_page_table;

 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto out;
+		goto no_page_table;

 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page_table;
+
+	if (pmd_huge(*pmd)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
-	if (pmd_huge(*pmd))
-		return follow_huge_pmd(mm, address, pmd, write);
+	}

-	ptep = pte_offset_map(pmd, address);
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!ptep)
 		goto out;

 	pte = *ptep;
-	pte_unmap(ptep);
-	if (pte_present(pte)) {
-		if (write && !pte_write(pte))
-			goto out;
-		pfn = pte_pfn(pte);
-		if (pfn_valid(pfn)) {
-			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) && !PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
-			return page;
-		}
-	}
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	pfn = pte_pfn(pte);
+	if (!pfn_valid(pfn))
+		goto unlock;

+	page = pfn_to_page(pfn);
+	if (flags & FOLL_GET)
+		get_page(page);
+	if (flags & FOLL_TOUCH) {
+		if ((flags & FOLL_WRITE) &&
+		    !pte_dirty(pte) && !PageDirty(page))
+			set_page_dirty(page);
+		mark_page_accessed(page);
+	}
+unlock:
+	pte_unmap_unlock(ptep, ptl);
 out:
-	return NULL;
-}
-
-static inline int
-untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
-			 unsigned long address)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	/* Check if the vma is for an anonymous mapping. */
-	if (vma->vm_ops && vma->vm_ops->nopage)
-		return 0;
-
-	/* Check if page directory entry exists. */
-	pgd = pgd_offset(mm, address);
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		return 1;
-
-	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		return 1;
-
-	/* Check if page middle directory entry exists. */
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		return 1;
+	return page;

-	/* There is a pte slot for 'address' in 'mm'. */
-	return 0;
+no_page_table:
+	/*
+	 * When core dumping an enormous anonymous area that nobody
+	 * has touched so far, we don't want to allocate page tables.
+	 */
+	if (flags & FOLL_ANON) {
+		page = ZERO_PAGE(address);
+		if (flags & FOLL_GET)
+			get_page(page);
+		BUG_ON(flags & FOLL_WRITE);
+	}
+	return page;
 }

 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int flags;
+	unsigned int vm_flags;

 	/*
 	 * Require read or write permissions.
 	 * If 'force' is set, we only require the "MAY" flags.
 	 */
-	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;

 	do {
-		struct vm_area_struct * vma;
+		struct vm_area_struct *vma;
+		unsigned int foll_flags;

 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}

 		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
-				|| !(flags & vma->vm_flags))
+				|| !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;

 		if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 						&start, &len, i);
 			continue;
 		}
-		spin_lock(&mm->page_table_lock);
+
+		foll_flags = FOLL_TOUCH;
+		if (pages)
+			foll_flags |= FOLL_GET;
+		if (!write && !(vma->vm_flags & VM_LOCKED) &&
+		    (!vma->vm_ops || !vma->vm_ops->nopage))
+			foll_flags |= FOLL_ANON;
+
 		do {
-			int write_access = write;
 			struct page *page;

-			cond_resched_lock(&mm->page_table_lock);
-			while (!(page = follow_page(mm, start, write_access))) {
-				int ret;
-
-				/*
-				 * Shortcut for anonymous pages. We don't want
-				 * to force the creation of pages tables for
-				 * insanely big anonymously mapped areas that
-				 * nobody touched so far. This is important
-				 * for doing a core dump for these mappings.
-				 */
-				if (!write && untouched_anonymous_page(mm,vma,start)) {
-					page = ZERO_PAGE(start);
-					break;
-				}
-				spin_unlock(&mm->page_table_lock);
-				ret = __handle_mm_fault(mm, vma, start, write_access);
+			if (write)
+				foll_flags |= FOLL_WRITE;

+			cond_resched();
+			while (!(page = follow_page(mm, start, foll_flags))) {
+				int ret;
+				ret = __handle_mm_fault(mm, vma, start,
+						foll_flags & FOLL_WRITE);
 				/*
 				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
 				 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * subsequent page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
-					write_access = 0;
+					foll_flags &= ~FOLL_WRITE;

 				switch (ret & ~VM_FAULT_WRITE) {
 				case VM_FAULT_MINOR:
@@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				default:
 					BUG();
 				}
-				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			start += PAGE_SIZE;
 			len--;
 		} while (len && start < vma->vm_end);
-		spin_unlock(&mm->page_table_lock);
 	} while (len);
 	return i;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index dfb124ffb9b..d1e076a487c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)

 EXPORT_SYMBOL(find_vma);

-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int foll_flags)
 {
 	return NULL;
 }