diff options
author | Sagi Grimberg <sagig@mellanox.com> | 2012-10-08 19:33:33 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-09 03:22:58 -0400 |
commit | 2ec74c3ef2d8c58d71e0e00336fb6b891192155a (patch) | |
tree | 512b591504cdbee278c27afc50a7e3a558b4851a /mm/huge_memory.c | |
parent | 36e4f20af833d1ce196e6a4ade05dc26c44652d1 (diff) |
mm: move all mmu notifier invocations to be done outside the PT lock
In order to allow sleeping during mmu notifier calls, we need to avoid
invoking them under the page table spinlock. This patch solves the
problem by calling invalidate_page notification after releasing the lock
(but before freeing the page itself), or by wrapping the page invalidation
with calls to invalidate_range_begin and invalidate_range_end.
To prevent accidental changes to the invalidate_range_end arguments after
the call to invalidate_range_begin, the patch introduces a convention of
saving the arguments in consistently named locals:
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
...
mmun_start = ...
mmun_end = ...
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
...
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
The patch changes code to use this convention for all calls to
mmu_notifier_invalidate_range_start/end, except those where the calls are
close enough so that anyone who glances at the code can see the values
aren't changing.
This patchset is a preliminary step towards on-demand paging design to be
added to the RDMA stack.
Why do we want on-demand paging for Infiniband?
Applications register memory with an RDMA adapter using system calls,
and subsequently post IO operations that refer to the corresponding
virtual addresses directly to HW. Until now, this was achieved by
pinning the memory during the registration calls. The goal of on demand
paging is to avoid pinning the pages of registered memory regions (MRs).
This will allow users the same flexibility they get when swapping any
other part of their processes' address spaces. Instead of requiring the
entire MR to fit in physical memory, we can allow the MR to be larger,
and only fit the current working set in physical memory.
Why should anyone care? What problems are users currently experiencing?
This can make programming with RDMA much simpler. Today, developers
that are working with more data than their RAM can hold need either to
deregister and reregister memory regions throughout their process's
life, or keep a single memory region and copy the data to it. On demand
paging will allow these developers to register a single MR at the
beginning of their process's life, and let the operating system manage
which pages need to be fetched at a given time. In the future, we
might be able to provide a single memory access key for each process
that would provide the entire process's address space as one large memory
region, and the developers wouldn't need to register memory regions at
all.
Is there any prospect that any other subsystems will utilise these
infrastructural changes? If so, which and how, etc?
As for other subsystems, I understand that XPMEM wanted to sleep in
MMU notifiers, as Christoph Lameter wrote at
http://lkml.indiana.edu/hypermail/linux/kernel/0802.1/0460.html and
perhaps Andrea knows about other use cases.
Scheduling in mmu notifications is required since we need to sync the
hardware with changes to the secondary page tables. A TLB flush of an IO
device is inherently slower than a CPU TLB flush, so our design works by
sending the invalidation request to the device, and waiting for an
interrupt before exiting the mmu notifier handler.
Avi said:
kvm may be a buyer. kvm::mmu_lock, which serializes guest page
faults, also protects long operations such as destroying large ranges.
It would be good to convert it into a spinlock, but as it is used inside
mmu notifiers, this cannot be done.
(there are alternatives, such as keeping the spinlock and using a
generation counter to do the teardown in O(1), which is what the "may"
is doing up there).
[akpm@linux-foundation.org: possible speed tweak in hugetlb_cow(), cleanups]
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Or Gerlitz <ogerlitz@mellanox.com>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: Shachar Raindel <raindel@mellanox.com>
Cc: Liran Liss <liranl@mellanox.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Avi Kivity <avi@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 42 |
1 files changed, 36 insertions, 6 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7740923fb9..08a943b9cf95 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -787,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
787 | pmd_t _pmd; | 787 | pmd_t _pmd; |
788 | int ret = 0, i; | 788 | int ret = 0, i; |
789 | struct page **pages; | 789 | struct page **pages; |
790 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
791 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
790 | 792 | ||
791 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, | 793 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, |
792 | GFP_KERNEL); | 794 | GFP_KERNEL); |
@@ -823,12 +825,16 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
823 | cond_resched(); | 825 | cond_resched(); |
824 | } | 826 | } |
825 | 827 | ||
828 | mmun_start = haddr; | ||
829 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
830 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
831 | |||
826 | spin_lock(&mm->page_table_lock); | 832 | spin_lock(&mm->page_table_lock); |
827 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 833 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
828 | goto out_free_pages; | 834 | goto out_free_pages; |
829 | VM_BUG_ON(!PageHead(page)); | 835 | VM_BUG_ON(!PageHead(page)); |
830 | 836 | ||
831 | pmdp_clear_flush_notify(vma, haddr, pmd); | 837 | pmdp_clear_flush(vma, haddr, pmd); |
832 | /* leave pmd empty until pte is filled */ | 838 | /* leave pmd empty until pte is filled */ |
833 | 839 | ||
834 | pgtable = pgtable_trans_huge_withdraw(mm); | 840 | pgtable = pgtable_trans_huge_withdraw(mm); |
@@ -851,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
851 | page_remove_rmap(page); | 857 | page_remove_rmap(page); |
852 | spin_unlock(&mm->page_table_lock); | 858 | spin_unlock(&mm->page_table_lock); |
853 | 859 | ||
860 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
861 | |||
854 | ret |= VM_FAULT_WRITE; | 862 | ret |= VM_FAULT_WRITE; |
855 | put_page(page); | 863 | put_page(page); |
856 | 864 | ||
@@ -859,6 +867,7 @@ out: | |||
859 | 867 | ||
860 | out_free_pages: | 868 | out_free_pages: |
861 | spin_unlock(&mm->page_table_lock); | 869 | spin_unlock(&mm->page_table_lock); |
870 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
862 | mem_cgroup_uncharge_start(); | 871 | mem_cgroup_uncharge_start(); |
863 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 872 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
864 | mem_cgroup_uncharge_page(pages[i]); | 873 | mem_cgroup_uncharge_page(pages[i]); |
@@ -875,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
875 | int ret = 0; | 884 | int ret = 0; |
876 | struct page *page, *new_page; | 885 | struct page *page, *new_page; |
877 | unsigned long haddr; | 886 | unsigned long haddr; |
887 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
888 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
878 | 889 | ||
879 | VM_BUG_ON(!vma->anon_vma); | 890 | VM_BUG_ON(!vma->anon_vma); |
880 | spin_lock(&mm->page_table_lock); | 891 | spin_lock(&mm->page_table_lock); |
@@ -925,20 +936,24 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
925 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 936 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); |
926 | __SetPageUptodate(new_page); | 937 | __SetPageUptodate(new_page); |
927 | 938 | ||
939 | mmun_start = haddr; | ||
940 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
941 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
942 | |||
928 | spin_lock(&mm->page_table_lock); | 943 | spin_lock(&mm->page_table_lock); |
929 | put_page(page); | 944 | put_page(page); |
930 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 945 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
931 | spin_unlock(&mm->page_table_lock); | 946 | spin_unlock(&mm->page_table_lock); |
932 | mem_cgroup_uncharge_page(new_page); | 947 | mem_cgroup_uncharge_page(new_page); |
933 | put_page(new_page); | 948 | put_page(new_page); |
934 | goto out; | 949 | goto out_mn; |
935 | } else { | 950 | } else { |
936 | pmd_t entry; | 951 | pmd_t entry; |
937 | VM_BUG_ON(!PageHead(page)); | 952 | VM_BUG_ON(!PageHead(page)); |
938 | entry = mk_pmd(new_page, vma->vm_page_prot); | 953 | entry = mk_pmd(new_page, vma->vm_page_prot); |
939 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 954 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
940 | entry = pmd_mkhuge(entry); | 955 | entry = pmd_mkhuge(entry); |
941 | pmdp_clear_flush_notify(vma, haddr, pmd); | 956 | pmdp_clear_flush(vma, haddr, pmd); |
942 | page_add_new_anon_rmap(new_page, vma, haddr); | 957 | page_add_new_anon_rmap(new_page, vma, haddr); |
943 | set_pmd_at(mm, haddr, pmd, entry); | 958 | set_pmd_at(mm, haddr, pmd, entry); |
944 | update_mmu_cache(vma, address, pmd); | 959 | update_mmu_cache(vma, address, pmd); |
@@ -946,10 +961,14 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
946 | put_page(page); | 961 | put_page(page); |
947 | ret |= VM_FAULT_WRITE; | 962 | ret |= VM_FAULT_WRITE; |
948 | } | 963 | } |
949 | out_unlock: | ||
950 | spin_unlock(&mm->page_table_lock); | 964 | spin_unlock(&mm->page_table_lock); |
965 | out_mn: | ||
966 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
951 | out: | 967 | out: |
952 | return ret; | 968 | return ret; |
969 | out_unlock: | ||
970 | spin_unlock(&mm->page_table_lock); | ||
971 | return ret; | ||
953 | } | 972 | } |
954 | 973 | ||
955 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, | 974 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, |
@@ -1162,7 +1181,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1162 | struct mm_struct *mm = vma->vm_mm; | 1181 | struct mm_struct *mm = vma->vm_mm; |
1163 | pmd_t *pmd; | 1182 | pmd_t *pmd; |
1164 | int ret = 0; | 1183 | int ret = 0; |
1184 | /* For mmu_notifiers */ | ||
1185 | const unsigned long mmun_start = address; | ||
1186 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; | ||
1165 | 1187 | ||
1188 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1166 | spin_lock(&mm->page_table_lock); | 1189 | spin_lock(&mm->page_table_lock); |
1167 | pmd = page_check_address_pmd(page, mm, address, | 1190 | pmd = page_check_address_pmd(page, mm, address, |
1168 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | 1191 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); |
@@ -1174,10 +1197,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1174 | * and it won't wait on the anon_vma->root->mutex to | 1197 | * and it won't wait on the anon_vma->root->mutex to |
1175 | * serialize against split_huge_page*. | 1198 | * serialize against split_huge_page*. |
1176 | */ | 1199 | */ |
1177 | pmdp_splitting_flush_notify(vma, address, pmd); | 1200 | pmdp_splitting_flush(vma, address, pmd); |
1178 | ret = 1; | 1201 | ret = 1; |
1179 | } | 1202 | } |
1180 | spin_unlock(&mm->page_table_lock); | 1203 | spin_unlock(&mm->page_table_lock); |
1204 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1181 | 1205 | ||
1182 | return ret; | 1206 | return ret; |
1183 | } | 1207 | } |
@@ -1898,6 +1922,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1898 | spinlock_t *ptl; | 1922 | spinlock_t *ptl; |
1899 | int isolated; | 1923 | int isolated; |
1900 | unsigned long hstart, hend; | 1924 | unsigned long hstart, hend; |
1925 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1926 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1901 | 1927 | ||
1902 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 1928 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
1903 | 1929 | ||
@@ -1952,6 +1978,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1952 | pte = pte_offset_map(pmd, address); | 1978 | pte = pte_offset_map(pmd, address); |
1953 | ptl = pte_lockptr(mm, pmd); | 1979 | ptl = pte_lockptr(mm, pmd); |
1954 | 1980 | ||
1981 | mmun_start = address; | ||
1982 | mmun_end = address + HPAGE_PMD_SIZE; | ||
1983 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1955 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | 1984 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ |
1956 | /* | 1985 | /* |
1957 | * After this gup_fast can't run anymore. This also removes | 1986 | * After this gup_fast can't run anymore. This also removes |
@@ -1959,8 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1959 | * huge and small TLB entries for the same virtual address | 1988 | * huge and small TLB entries for the same virtual address |
1960 | * to avoid the risk of CPU bugs in that area. | 1989 | * to avoid the risk of CPU bugs in that area. |
1961 | */ | 1990 | */ |
1962 | _pmd = pmdp_clear_flush_notify(vma, address, pmd); | 1991 | _pmd = pmdp_clear_flush(vma, address, pmd); |
1963 | spin_unlock(&mm->page_table_lock); | 1992 | spin_unlock(&mm->page_table_lock); |
1993 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1964 | 1994 | ||
1965 | spin_lock(ptl); | 1995 | spin_lock(ptl); |
1966 | isolated = __collapse_huge_page_isolate(vma, address, pte); | 1996 | isolated = __collapse_huge_page_isolate(vma, address, pte); |