author		Hugh Dickins <hugh@veritas.com>	2005-10-29 21:16:26 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-10-30 00:40:40 -0400
commit		8f4e2101fd7df9031a754eedb82e2060b51f8c45 (patch)
tree		624db00c6160d70376a57447b45b935b293e396b
parent		b462705ac679f6195d1b23a752cda592d9107495 (diff)
[PATCH] mm: page fault handler locking
On the page fault path, the patch before last pushed acquiring the page_table_lock down to the head of handle_pte_fault (though it's also taken and dropped earlier when a new page table has to be allocated).

Now delete that line, read "entry = *pte" without it, and go off to this or that page fault handler on the basis of this unlocked peek. Usually the handler can proceed without the lock, relying on the subsequent locked pte_same or pte_none test to back out when necessary; though do_wp_page needs the lock immediately, and do_file_page doesn't check (if there's a race, install_page just zaps the entry and reinstalls it).

But on those architectures (notably i386 with PAE) whose pte is too big to be read atomically, if SMP or preemption is enabled, do_swap_page and do_file_page might cause irretrievable damage if passed a Frankenstein entry stitched together from unrelated parts. In those configs, "pte_unmap_same" has to take page_table_lock, validate orig_pte still the same, and drop page_table_lock before unmapping, before proceeding.

Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock avoidance leaves more lone maps and unmaps than elsewhere.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--	mm/memory.c	150
1 files changed, 90 insertions, 60 deletions
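For orientation before reading the diff: after this change the handlers converge on one back-out pattern — do any sleeping work (allocation, swap-in I/O) with the pte lock dropped, retake it with pte_offset_map_lock, and bail out quietly if the entry changed underneath. The sketch below is illustrative only and not part of the patch; example_fault() and its body are hypothetical, while pte_offset_map_lock(), pte_unmap_unlock() and pte_same() are the real helpers the patch itself uses.

	/*
	 * Illustrative sketch only (not from the patch): the back-out
	 * pattern the fault handlers follow after this change.
	 */
	static int example_fault(struct mm_struct *mm, pmd_t *pmd,
				 unsigned long address, pte_t orig_pte)
	{
		spinlock_t *ptl;
		pte_t *page_table;
		int ret = VM_FAULT_MINOR;

		/* ... sleeping work (allocation, swap-in I/O) done unlocked ... */

		/* Retake the pte lock and check nobody else handled the fault. */
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
		if (unlikely(!pte_same(*page_table, orig_pte)))
			goto unlock;	/* raced with another fault: back out */

		/* ... install the new pte here with set_pte_at() ... */
	unlock:
		pte_unmap_unlock(page_table, ptl);
		return ret;
	}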
diff --git a/mm/memory.c b/mm/memory.c
index a40e4b1cee4f..24ba688876d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1219,6 +1219,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 EXPORT_SYMBOL(remap_pfn_range);
 
 /*
+ * handle_pte_fault chooses page fault handler according to an entry
+ * which was read non-atomically. Before making any commitment, on
+ * those architectures or configurations (e.g. i386 with PAE) which
+ * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * must check under lock before unmapping the pte and proceeding
+ * (but do_wp_page is only called after already making such a check;
+ * and do_anonymous_page and do_no_page can safely check later on).
+ */
+static inline int pte_unmap_same(struct mm_struct *mm,
+				pte_t *page_table, pte_t orig_pte)
+{
+	int same = 1;
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+	if (sizeof(pte_t) > sizeof(unsigned long)) {
+		spin_lock(&mm->page_table_lock);
+		same = pte_same(*page_table, orig_pte);
+		spin_unlock(&mm->page_table_lock);
+	}
+#endif
+	pte_unmap(page_table);
+	return same;
+}
+
+/*
  * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
  * servicing faults for write access. In the normal case, do always want
  * pte_mkwrite. But get_user_pages can cause write faults for mappings
@@ -1245,12 +1269,13 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
  * change only once the write actually happens. This avoids a few races,
  * and potentially makes it more efficient.
  *
- * We hold the mm semaphore and the page_table_lock on entry and exit
- * with the page_table_lock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		pte_t orig_pte)
+		spinlock_t *ptl, pte_t orig_pte)
 {
 	struct page *old_page, *new_page;
 	unsigned long pfn = pte_pfn(orig_pte);
@@ -1288,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Ok, we need to copy. Oh, well..
 	 */
 	page_cache_get(old_page);
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
@@ -1307,8 +1331,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	spin_lock(&mm->page_table_lock);
-	page_table = pte_offset_map(pmd, address);
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		page_remove_rmap(old_page);
 		if (!PageAnon(old_page)) {
@@ -1321,7 +1344,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		ptep_establish(vma, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
-
 		lru_cache_add_active(new_page);
 		page_add_anon_rmap(new_page, vma, address);
 
@@ -1332,8 +1354,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 unlock:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 	return ret;
 oom:
 	page_cache_release(old_page);
@@ -1660,20 +1681,22 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
 }
 
 /*
- * We hold the mm semaphore and the page_table_lock on entry and
- * should release the pagetable lock on exit..
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access, pte_t orig_pte)
 {
+	spinlock_t *ptl;
 	struct page *page;
 	swp_entry_t entry;
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	if (!pte_unmap_same(mm, page_table, orig_pte))
+		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
 	page = lookup_swap_cache(entry);
@@ -1682,11 +1705,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = read_swap_cache_async(entry, vma, address);
 		if (!page) {
 			/*
-			 * Back out if somebody else faulted in this pte while
-			 * we released the page table lock.
+			 * Back out if somebody else faulted in this pte
+			 * while we released the pte lock.
 			 */
-			spin_lock(&mm->page_table_lock);
-			page_table = pte_offset_map(pmd, address);
+			page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 			if (likely(pte_same(*page_table, orig_pte)))
 				ret = VM_FAULT_OOM;
 			goto unlock;
@@ -1702,11 +1724,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	lock_page(page);
 
 	/*
-	 * Back out if somebody else faulted in this pte while we
-	 * released the page table lock.
+	 * Back out if somebody else already faulted in this pte.
 	 */
-	spin_lock(&mm->page_table_lock);
-	page_table = pte_offset_map(pmd, address);
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (unlikely(!pte_same(*page_table, orig_pte)))
 		goto out_nomap;
 
@@ -1735,7 +1755,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (write_access) {
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, pte) == VM_FAULT_OOM)
+				page_table, pmd, ptl, pte) == VM_FAULT_OOM)
 			ret = VM_FAULT_OOM;
 		goto out;
 	}
@@ -1744,37 +1764,32 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	update_mmu_cache(vma, address, pte);
 	lazy_mmu_prot_update(pte);
 unlock:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 out:
 	return ret;
 out_nomap:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 	unlock_page(page);
 	page_cache_release(page);
 	return ret;
 }
 
 /*
- * We are called with the MM semaphore and page_table_lock
- * spinlock held to protect against concurrent faults in
- * multithreaded programs.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access)
 {
-	struct page *page = ZERO_PAGE(addr);
+	struct page *page;
+	spinlock_t *ptl;
 	pte_t entry;
 
-	/* Mapping of ZERO_PAGE - vm_page_prot is readonly */
-	entry = mk_pte(page, vma->vm_page_prot);
-
 	if (write_access) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
-		spin_unlock(&mm->page_table_lock);
 
 		if (unlikely(anon_vma_prepare(vma)))
 			goto oom;
@@ -1782,23 +1797,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (!page)
 			goto oom;
 
-		spin_lock(&mm->page_table_lock);
-		page_table = pte_offset_map(pmd, address);
-
-		if (!pte_none(*page_table)) {
-			page_cache_release(page);
-			goto unlock;
-		}
-		inc_mm_counter(mm, anon_rss);
 		entry = mk_pte(page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+
+		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+		if (!pte_none(*page_table))
+			goto release;
+		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(page);
 		SetPageReferenced(page);
 		page_add_anon_rmap(page, vma, address);
 	} else {
+		/* Map the ZERO_PAGE - vm_page_prot is readonly */
+		page = ZERO_PAGE(address);
+		page_cache_get(page);
+		entry = mk_pte(page, vma->vm_page_prot);
+
+		ptl = &mm->page_table_lock;
+		spin_lock(ptl);
+		if (!pte_none(*page_table))
+			goto release;
 		inc_mm_counter(mm, file_rss);
 		page_add_file_rmap(page);
-		page_cache_get(page);
 	}
 
 	set_pte_at(mm, address, page_table, entry);
@@ -1807,9 +1827,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
 unlock:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 	return VM_FAULT_MINOR;
+release:
+	page_cache_release(page);
+	goto unlock;
 oom:
 	return VM_FAULT_OOM;
 }
@@ -1823,13 +1845,15 @@ oom:
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
  *
- * This is called with the MM semaphore held and the page table
- * spinlock held. Exit with the spinlock released.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access)
 {
+	spinlock_t *ptl;
 	struct page *new_page;
 	struct address_space *mapping = NULL;
 	pte_t entry;
@@ -1838,7 +1862,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int anon = 0;
 
 	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
 
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
@@ -1878,21 +1901,20 @@ retry:
 		anon = 1;
 	}
 
-	spin_lock(&mm->page_table_lock);
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	/*
 	 * For a file-backed vma, someone could have truncated or otherwise
 	 * invalidated this page. If unmap_mapping_range got called,
 	 * retry getting the page.
 	 */
 	if (mapping && unlikely(sequence != mapping->truncate_count)) {
-		spin_unlock(&mm->page_table_lock);
+		pte_unmap_unlock(page_table, ptl);
 		page_cache_release(new_page);
 		cond_resched();
 		sequence = mapping->truncate_count;
 		smp_rmb();
 		goto retry;
 	}
-	page_table = pte_offset_map(pmd, address);
 
 	/*
 	 * This silly early PAGE_DIRTY setting removes a race
@@ -1929,8 +1951,7 @@ retry:
 	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
 unlock:
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(page_table, ptl);
 	return ret;
 oom:
 	page_cache_release(new_page);
@@ -1941,6 +1962,10 @@ oom:
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
  * nonlinear vmas.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
@@ -1949,8 +1974,8 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgoff_t pgoff;
 	int err;
 
-	pte_unmap(page_table);
-	spin_unlock(&mm->page_table_lock);
+	if (!pte_unmap_same(mm, page_table, orig_pte))
+		return VM_FAULT_MINOR;
 
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
@@ -1989,8 +2014,8 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		pte_t *pte, pmd_t *pmd, int write_access)
 {
 	pte_t entry;
+	spinlock_t *ptl;
 
-	spin_lock(&mm->page_table_lock);
 	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
@@ -2007,17 +2032,22 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 				pte, pmd, write_access, entry);
 	}
 
+	ptl = &mm->page_table_lock;
+	spin_lock(ptl);
+	if (unlikely(!pte_same(*pte, entry)))
+		goto unlock;
 	if (write_access) {
 		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address, pte, pmd, entry);
+			return do_wp_page(mm, vma, address,
+					pte, pmd, ptl, entry);
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
 	ptep_set_access_flags(vma, address, pte, entry, write_access);
 	update_mmu_cache(vma, address, entry);
 	lazy_mmu_prot_update(entry);
-	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+unlock:
+	pte_unmap_unlock(pte, ptl);
 	return VM_FAULT_MINOR;
 }
 