author     Hugh Dickins <hugh@veritas.com>          2005-10-29 21:16:33 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>    2005-10-30 00:40:41 -0400
commit     deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 (patch)
tree       2a722f50e8edef8609a49f65bfcb222e499c44cc /mm/memory.c
parent     c34d1b4d165c67b966bca4aba026443d7ff161eb (diff)
[PATCH] mm: follow_page with inner ptlock
Final step in pushing down the common core's page_table_lock. follow_page no
longer wants its caller to hold page_table_lock; it uses pte_offset_map_lock
itself, and so no page_table_lock is taken in get_user_pages at all.

But get_user_pages (and get_futex_key) then need follow_page to pin the page
for them: so take Daniel's suggestion of passing bitflags to follow_page.

We need one flag for WRITE, another for TOUCH (it replaces the old
accessed-flag argument, which vanished along with check_user_page_readable;
but surely get_numa_maps is wrong to mark every page it finds as accessed),
and another for GET.

And another, ANON, to dispose of untouched_anonymous_page: it seems silly for
that helper to descend the page tables a second time; let follow_page itself
observe whether there is no page table and return ZERO_PAGE if so. Fix a
minor bug in that path: also check VM_LOCKED, since make_pages_present ought
to make readonly anonymous pages present.
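
The header side of the flags is not shown here (this diffstat is limited to
mm/memory.c), but the new bits presumably end up in include/linux/mm.h along
the following lines; only the names come from this patch, the values and
comments below are an illustrative sketch:

	#define FOLL_WRITE	0x01	/* check that the pte is writable */
	#define FOLL_TOUCH	0x02	/* mark the page accessed */
	#define FOLL_GET	0x04	/* take a get_page reference on the page */
	#define FOLL_ANON	0x08	/* return ZERO_PAGE if no page table */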
Give get_numa_maps a cond_resched while we're there.
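
For illustration only, a hypothetical caller that wants a single page looked
up and pinned, now that follow_page takes its own pte lock, might look
roughly like this; the helper name and error handling are invented for the
sketch and are not part of the patch:

	#include <linux/mm.h>

	/*
	 * Sketch: look up one user page, pin it and mark it accessed.
	 * Assumes the caller holds down_read(&mm->mmap_sem); note that
	 * mm->page_table_lock is no longer needed around follow_page.
	 */
	static struct page *pin_one_page(struct mm_struct *mm, unsigned long addr)
	{
		struct page *page;

		page = follow_page(mm, addr, FOLL_GET | FOLL_TOUCH);
		if (!page)
			return NULL;	/* caller faults the page in and retries */
		return page;		/* caller drops the pin with put_page() */
	}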
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/memory.c')
 -rw-r--r--  mm/memory.c | 152
 1 file changed, 71 insertions(+), 81 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 51f7c0a220d4..8461e2dd91d7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 
 /*
  * Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int flags)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep, pte;
+	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;
 
-	page = follow_huge_addr(mm, address, write);
-	if (! IS_ERR(page))
-		return page;
+	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+	if (!IS_ERR(page)) {
+		BUG_ON(flags & FOLL_GET);
+		goto out;
+	}
 
+	page = NULL;
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto out;
+		goto no_page_table;
 
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto out;
+		goto no_page_table;
 
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page_table;
+
+	if (pmd_huge(*pmd)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
-	if (pmd_huge(*pmd))
-		return follow_huge_pmd(mm, address, pmd, write);
+	}
 
-	ptep = pte_offset_map(pmd, address);
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!ptep)
 		goto out;
 
 	pte = *ptep;
-	pte_unmap(ptep);
-	if (pte_present(pte)) {
-		if (write && !pte_write(pte))
-			goto out;
-		pfn = pte_pfn(pte);
-		if (pfn_valid(pfn)) {
-			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) && !PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
-			return page;
-		}
-	}
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	pfn = pte_pfn(pte);
+	if (!pfn_valid(pfn))
+		goto unlock;
 
+	page = pfn_to_page(pfn);
+	if (flags & FOLL_GET)
+		get_page(page);
+	if (flags & FOLL_TOUCH) {
+		if ((flags & FOLL_WRITE) &&
+		    !pte_dirty(pte) && !PageDirty(page))
+			set_page_dirty(page);
+		mark_page_accessed(page);
+	}
+unlock:
+	pte_unmap_unlock(ptep, ptl);
 out:
-	return NULL;
-}
-
-static inline int
-untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
-			 unsigned long address)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	/* Check if the vma is for an anonymous mapping. */
-	if (vma->vm_ops && vma->vm_ops->nopage)
-		return 0;
-
-	/* Check if page directory entry exists. */
-	pgd = pgd_offset(mm, address);
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		return 1;
-
-	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		return 1;
-
-	/* Check if page middle directory entry exists. */
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		return 1;
+	return page;
 
-	/* There is a pte slot for 'address' in 'mm'. */
-	return 0;
+no_page_table:
+	/*
+	 * When core dumping an enormous anonymous area that nobody
+	 * has touched so far, we don't want to allocate page tables.
+	 */
+	if (flags & FOLL_ANON) {
+		page = ZERO_PAGE(address);
+		if (flags & FOLL_GET)
+			get_page(page);
+		BUG_ON(flags & FOLL_WRITE);
+	}
+	return page;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int flags;
+	unsigned int vm_flags;
 
 	/*
 	 * Require read or write permissions.
 	 * If 'force' is set, we only require the "MAY" flags.
 	 */
-	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;
 
 	do {
-		struct vm_area_struct * vma;
+		struct vm_area_struct *vma;
+		unsigned int foll_flags;
 
 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}
 
 		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
-				|| !(flags & vma->vm_flags))
+				|| !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
 		if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 						&start, &len, i);
 			continue;
 		}
-		spin_lock(&mm->page_table_lock);
+
+		foll_flags = FOLL_TOUCH;
+		if (pages)
+			foll_flags |= FOLL_GET;
+		if (!write && !(vma->vm_flags & VM_LOCKED) &&
+		    (!vma->vm_ops || !vma->vm_ops->nopage))
+			foll_flags |= FOLL_ANON;
+
 		do {
-			int write_access = write;
 			struct page *page;
 
-			cond_resched_lock(&mm->page_table_lock);
-			while (!(page = follow_page(mm, start, write_access))) {
-				int ret;
-
-				/*
-				 * Shortcut for anonymous pages. We don't want
-				 * to force the creation of pages tables for
-				 * insanely big anonymously mapped areas that
-				 * nobody touched so far. This is important
-				 * for doing a core dump for these mappings.
-				 */
-				if (!write && untouched_anonymous_page(mm,vma,start)) {
-					page = ZERO_PAGE(start);
-					break;
-				}
-				spin_unlock(&mm->page_table_lock);
-				ret = __handle_mm_fault(mm, vma, start, write_access);
+			if (write)
+				foll_flags |= FOLL_WRITE;
 
+			cond_resched();
+			while (!(page = follow_page(mm, start, foll_flags))) {
+				int ret;
+				ret = __handle_mm_fault(mm, vma, start,
+						foll_flags & FOLL_WRITE);
 				/*
 				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
 				 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * subsequent page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
-					write_access = 0;
+					foll_flags &= ~FOLL_WRITE;
 
 				switch (ret & ~VM_FAULT_WRITE) {
 				case VM_FAULT_MINOR:
@@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				default:
 					BUG();
 				}
-				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			start += PAGE_SIZE;
 			len--;
 		} while (len && start < vma->vm_end);
-		spin_unlock(&mm->page_table_lock);
 	} while (len);
 	return i;
 }