author     Hugh Dickins <hugh@veritas.com>            2005-10-29 21:16:33 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>      2005-10-30 00:40:41 -0400
commit     deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 (patch)
tree       2a722f50e8edef8609a49f65bfcb222e499c44cc
parent     c34d1b4d165c67b966bca4aba026443d7ff161eb (diff)
[PATCH] mm: follow_page with inner ptlock
Final step in pushing down the common core's page_table_lock: follow_page no
longer wants its caller to hold page_table_lock, and uses pte_offset_map_lock
itself; so no page_table_lock is taken in get_user_pages itself.

But get_user_pages (and get_futex_key) then need follow_page to pin the page
for them: take Daniel's suggestion of passing bitflags to follow_page.

We need one flag for WRITE, another for TOUCH (it was the accessed flag before:
that vanished along with check_user_page_readable, but surely get_numa_maps is
wrong to mark every page it finds as accessed), and another for GET.

And one more, ANON, to dispose of untouched_anonymous_page: it seems silly for
that to walk the page tables a second time; let follow_page observe that there
is no page table and return ZERO_PAGE if so. Fix a minor bug while doing so:
check VM_LOCKED, since make_pages_present ought to make readonly anonymous
mappings present.

Give get_numa_maps a cond_resched while we're there.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
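
The caller-side pattern is easiest to see in the kernel/futex.c hunk below: instead of taking mm->page_table_lock around follow_page, a caller that needs the page held passes FOLL_GET (usually together with FOLL_TOUCH) and later drops the reference with put_page. A minimal sketch of that pattern follows; the helper name is illustrative only, and the caller is assumed to hold mm->mmap_sem, as get_futex_key does.

```c
/*
 * Sketch only (not part of the patch): pin the page backing a user
 * address the way get_futex_key does after this change.  follow_page
 * takes the pte lock internally; the caller holds mm->mmap_sem for
 * read but no spinlocks.
 */
static struct page *grab_user_page(struct mm_struct *mm, unsigned long uaddr)
{
	struct page *page;

	page = follow_page(mm, uaddr, FOLL_TOUCH | FOLL_GET);
	if (!page)
		return NULL;		/* caller falls back to a slow path */

	/* ... inspect page->index, page_to_nid(page), and so on ... */
	return page;			/* caller must put_page() when done */
}
```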
-rw-r--r--  fs/proc/task_mmu.c |   3
-rw-r--r--  include/linux/mm.h |  20
-rw-r--r--  kernel/futex.c     |   6
-rw-r--r--  mm/memory.c        | 152
-rw-r--r--  mm/nommu.c         |   3
5 files changed, 88 insertions, 96 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7e5e7ec2e36d..d2fa42006d8f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 	for_each_node(i)
 		md->node[i] =0;
 
-	spin_lock(&mm->page_table_lock);
 	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
 		page = follow_page(mm, vaddr, 0);
 		if (page) {
@@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 				md->anon++;
 			md->node[page_to_nid(page)]++;
 		}
+		cond_resched();
 	}
-	spin_unlock(&mm->page_table_lock);
 	return md;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index aa8de20e2e80..e8d1424153bb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
-extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
-
-extern struct page * vmalloc_to_page(void *addr);
-extern unsigned long vmalloc_to_pfn(void *addr);
-extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
-			int write);
-int remap_pfn_range(struct vm_area_struct *, unsigned long,
-		unsigned long, unsigned long, pgprot_t);
+struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+struct page *vmalloc_to_page(void *addr);
+unsigned long vmalloc_to_pfn(void *addr);
+int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+			unsigned long pfn, unsigned long size, pgprot_t);
+
+struct page *follow_page(struct mm_struct *, unsigned long address,
+			unsigned int foll_flags);
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_TOUCH	0x02	/* mark page accessed */
+#define FOLL_GET	0x04	/* do get_page on page */
+#define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
 
 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
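
Taken together, the new flags replace the old `int write` argument: FOLL_WRITE asks for a writable pte, FOLL_TOUCH marks the page accessed (and dirty for writes), FOLL_GET takes a reference, and FOLL_ANON lets an untouched anonymous area be satisfied by ZERO_PAGE without allocating page tables. A rough sketch of how a get_user_pages-style caller composes them, mirroring the mm/memory.c change further down (the helper name is hypothetical):

```c
/*
 * Illustrative sketch only: build foll_flags the way get_user_pages
 * does after this patch, then look up one page.  Assumes the caller
 * holds mm->mmap_sem for read.
 */
static struct page *lookup_one_page(struct vm_area_struct *vma,
				    unsigned long addr, int write, int pin)
{
	unsigned int foll_flags = FOLL_TOUCH;

	if (pin)			/* caller keeps the struct page */
		foll_flags |= FOLL_GET;
	if (write)			/* require a writable pte */
		foll_flags |= FOLL_WRITE;
	if (!write && !(vma->vm_flags & VM_LOCKED) &&
	    (!vma->vm_ops || !vma->vm_ops->nopage))
		foll_flags |= FOLL_ANON;	/* ZERO_PAGE if no page table */

	return follow_page(vma->vm_mm, addr, foll_flags);
}
```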
diff --git a/kernel/futex.c b/kernel/futex.c
index ca05fe6a70b2..3b4d5ad44cc6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	/*
 	 * Do a quick atomic lookup first - this is the fastpath.
 	 */
-	spin_lock(&current->mm->page_table_lock);
-	page = follow_page(mm, uaddr, 0);
+	page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
 	if (likely(page != NULL)) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		spin_unlock(&current->mm->page_table_lock);
+		put_page(page);
 		return 0;
 	}
-	spin_unlock(&current->mm->page_table_lock);
 
 	/*
 	 * Do it the general way.
diff --git a/mm/memory.c b/mm/memory.c
index 51f7c0a220d4..8461e2dd91d7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 
 /*
  * Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int flags)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep, pte;
+	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;
 
-	page = follow_huge_addr(mm, address, write);
-	if (! IS_ERR(page))
-		return page;
+	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+	if (!IS_ERR(page)) {
+		BUG_ON(flags & FOLL_GET);
+		goto out;
+	}
 
+	page = NULL;
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto out;
+		goto no_page_table;
 
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto out;
+		goto no_page_table;
 
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page_table;
+
+	if (pmd_huge(*pmd)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
-	if (pmd_huge(*pmd))
-		return follow_huge_pmd(mm, address, pmd, write);
+	}
 
-	ptep = pte_offset_map(pmd, address);
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!ptep)
 		goto out;
 
 	pte = *ptep;
-	pte_unmap(ptep);
-	if (pte_present(pte)) {
-		if (write && !pte_write(pte))
-			goto out;
-		pfn = pte_pfn(pte);
-		if (pfn_valid(pfn)) {
-			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) &&!PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
-			return page;
-		}
-	}
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	pfn = pte_pfn(pte);
+	if (!pfn_valid(pfn))
+		goto unlock;
 
+	page = pfn_to_page(pfn);
+	if (flags & FOLL_GET)
+		get_page(page);
+	if (flags & FOLL_TOUCH) {
+		if ((flags & FOLL_WRITE) &&
+		    !pte_dirty(pte) && !PageDirty(page))
+			set_page_dirty(page);
+		mark_page_accessed(page);
+	}
+unlock:
+	pte_unmap_unlock(ptep, ptl);
 out:
-	return NULL;
-}
-
-static inline int
-untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
-			 unsigned long address)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	/* Check if the vma is for an anonymous mapping. */
-	if (vma->vm_ops && vma->vm_ops->nopage)
-		return 0;
-
-	/* Check if page directory entry exists. */
-	pgd = pgd_offset(mm, address);
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		return 1;
-
-	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		return 1;
-
-	/* Check if page middle directory entry exists. */
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		return 1;
+	return page;
 
-	/* There is a pte slot for 'address' in 'mm'. */
-	return 0;
+no_page_table:
+	/*
+	 * When core dumping an enormous anonymous area that nobody
+	 * has touched so far, we don't want to allocate page tables.
+	 */
+	if (flags & FOLL_ANON) {
+		page = ZERO_PAGE(address);
+		if (flags & FOLL_GET)
+			get_page(page);
+		BUG_ON(flags & FOLL_WRITE);
+	}
+	return page;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int flags;
+	unsigned int vm_flags;
 
 	/*
 	 * Require read or write permissions.
 	 * If 'force' is set, we only require the "MAY" flags.
 	 */
-	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;
 
 	do {
-		struct vm_area_struct * vma;
+		struct vm_area_struct *vma;
+		unsigned int foll_flags;
 
 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}
 
 		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
-				|| !(flags & vma->vm_flags))
+				|| !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
 		if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 					&start, &len, i);
 			continue;
 		}
-		spin_lock(&mm->page_table_lock);
+
+		foll_flags = FOLL_TOUCH;
+		if (pages)
+			foll_flags |= FOLL_GET;
+		if (!write && !(vma->vm_flags & VM_LOCKED) &&
+		    (!vma->vm_ops || !vma->vm_ops->nopage))
+			foll_flags |= FOLL_ANON;
+
 		do {
-			int write_access = write;
 			struct page *page;
 
-			cond_resched_lock(&mm->page_table_lock);
-			while (!(page = follow_page(mm, start, write_access))) {
-				int ret;
-
-				/*
-				 * Shortcut for anonymous pages. We don't want
-				 * to force the creation of pages tables for
-				 * insanely big anonymously mapped areas that
-				 * nobody touched so far. This is important
-				 * for doing a core dump for these mappings.
-				 */
-				if (!write && untouched_anonymous_page(mm,vma,start)) {
-					page = ZERO_PAGE(start);
-					break;
-				}
-				spin_unlock(&mm->page_table_lock);
-				ret = __handle_mm_fault(mm, vma, start, write_access);
+			if (write)
+				foll_flags |= FOLL_WRITE;
 
+			cond_resched();
+			while (!(page = follow_page(mm, start, foll_flags))) {
+				int ret;
+				ret = __handle_mm_fault(mm, vma, start,
+						foll_flags & FOLL_WRITE);
 				/*
 				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
 				 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * subsequent page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
-					write_access = 0;
+					foll_flags &= ~FOLL_WRITE;
 
 				switch (ret & ~VM_FAULT_WRITE) {
 				case VM_FAULT_MINOR:
@@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				default:
 					BUG();
 				}
-				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			start += PAGE_SIZE;
 			len--;
 		} while (len && start < vma->vm_end);
-		spin_unlock(&mm->page_table_lock);
 	} while (len);
 	return i;
 }
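
Internally, the rewritten follow_page leans on the pte_offset_map_lock/pte_unmap_unlock pair introduced earlier in this series: map the pte page and take the lock covering that page table in one step, then drop both together. A stripped-down sketch of the idiom, with an illustrative function name and mm->mmap_sem assumed held:

```c
/*
 * Sketch of the pte_offset_map_lock idiom the new follow_page uses:
 * no mm->page_table_lock, only the lock protecting this page table.
 */
static int pte_maps_something(struct mm_struct *mm, pmd_t *pmd,
			      unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int present;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = *ptep;
	present = pte_present(pte);
	pte_unmap_unlock(ptep, ptl);
	return present;
}
```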
diff --git a/mm/nommu.c b/mm/nommu.c
index dfb124ffb9be..d1e076a487cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 
 EXPORT_SYMBOL(find_vma);
 
-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int foll_flags)
 {
 	return NULL;
 }