summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorDavid Herrmann <dh.herrmann@gmail.com>2014-08-08 17:25:36 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-08-08 18:57:31 -0400
commit05f65b5c70909ef686f865f0a85406d74d75f70f (patch)
tree4d412f64bb993713d8896fc334db87f1d026d86c /mm
parent87b2d44026e0e315a7401551e95b189ac4b28217 (diff)
shm: wait for pins to be released when sealing
If we set SEAL_WRITE on a file, we must make sure there cannot be any ongoing write-operations on the file. For write() calls, we simply lock the inode mutex, for mmap() we simply verify there're no writable mappings. However, there might be pages pinned by AIO, Direct-IO and similar operations via GUP. We must make sure those do not write to the memfd file after we set SEAL_WRITE. As there is no way to notify GUP users to drop pages or to wait for them to be done, we implement the wait ourselves: When setting SEAL_WRITE, we check all pages for their ref-count. If it's bigger than 1, we know there's some user of the page. We then mark the page and wait for up to 150ms for those ref-counts to be dropped. If the ref-counts are not dropped in time, we refuse the seal operation. Signed-off-by: David Herrmann <dh.herrmann@gmail.com> Acked-by: Hugh Dickins <hughd@google.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Ryan Lortie <desrt@desrt.ca> Cc: Lennart Poettering <lennart@poettering.net> Cc: Daniel Mack <zonque@gmail.com> Cc: Andy Lutomirski <luto@amacapital.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/shmem.c110
1 file changed, 109 insertions, 1 deletion
diff --git a/mm/shmem.c b/mm/shmem.c
index 4a5498795a2b..a42add14331c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1828,9 +1828,117 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1828 return offset; 1828 return offset;
1829} 1829}
1830 1830
1831/*
1832 * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
1833 * so reuse a tag which we firmly believe is never set or cleared on shmem.
1834 */
1835#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE
1836#define LAST_SCAN 4 /* about 150ms max */
1837
/*
 * shmem_tag_pins - mark pages of @mapping that appear pinned.
 *
 * Walks every page-cache slot of @mapping and tags with SHMEM_TAG_PINNED
 * each page whose refcount exceeds its mapcount by more than the single
 * page-cache reference, i.e. pages with extra pins (GUP users such as
 * AIO/Direct-IO).  shmem_wait_for_pins() later polls only tagged entries.
 */
1838static void shmem_tag_pins(struct address_space *mapping)
1839{
1840 struct radix_tree_iter iter;
1841 void **slot;
1842 pgoff_t start;
1843 struct page *page;
1844
 /* Flush per-CPU LRU pagevecs; they hold transient extra page refs
  * that would otherwise be mistaken for pins. */
1845 lru_add_drain();
1846 start = 0;
1847 rcu_read_lock();
1848
1849restart:
1850 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
1851 page = radix_tree_deref_slot(slot);
1852 if (!page || radix_tree_exception(page)) {
 /* Slot changed under us (deref retry entry): redo the walk. */
1853 if (radix_tree_deref_retry(page))
1854 goto restart;
1855 } else if (page_count(page) - page_mapcount(page) > 1) {
 /* Extra reference beyond mappings + page cache: tag as pinned.
  * tree_lock is required to modify radix-tree tags. */
1856 spin_lock_irq(&mapping->tree_lock);
1857 radix_tree_tag_set(&mapping->page_tree, iter.index,
1858 SHMEM_TAG_PINNED);
1859 spin_unlock_irq(&mapping->tree_lock);
1860 }
1861
1862 if (need_resched()) {
 /* Drop out of RCU briefly, then resume past the last index seen. */
1863 cond_resched_rcu();
1864 start = iter.index + 1;
1865 goto restart;
1866 }
1867 }
1868 rcu_read_unlock();
1869}
1870
1871/*
1872 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
1873 * via get_user_pages(), drivers might have some pending I/O without any active
1874 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
1875 * and see whether it has an elevated ref-count. If so, we tag them and wait for
1876 * them to be dropped.
1877 * The caller must guarantee that no new user will acquire writable references
1878 * to those pages to avoid races.
1879 */
/*
 * shmem_wait_for_pins - wait for tagged pins on @mapping to be released.
 *
 * Tags pinned pages via shmem_tag_pins(), then rescans the tagged entries
 * up to LAST_SCAN + 1 times (~150ms total per the LAST_SCAN comment),
 * sleeping with exponential backoff between scans, until every tagged
 * page's extra reference has been dropped.
 *
 * Returns 0 when no pins remain, -EBUSY if pinned pages survive the
 * final scan (all tags are cleared either way).
 */
1831static int shmem_wait_for_pins(struct address_space *mapping) 1880static int shmem_wait_for_pins(struct address_space *mapping)
1832{ 1881{
1833 return 0; 1882 struct radix_tree_iter iter;
1883 void **slot;
1884 pgoff_t start;
1885 struct page *page;
1886 int error, scan;
1887
1888 shmem_tag_pins(mapping);
1889
1890 error = 0;
1891 for (scan = 0; scan <= LAST_SCAN; scan++) {
 /* Done as soon as no tagged (still-pinned) entries remain. */
1892 if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
1893 break;
1894
 /* First pass: drain LRU pagevecs on all CPUs instead of sleeping.
  * Later passes: back off exponentially ((HZ << scan) / 200); a
  * fatal signal cuts straight to the final cleanup scan. */
1895 if (!scan)
1896 lru_add_drain_all();
1897 else if (schedule_timeout_killable((HZ << scan) / 200))
1898 scan = LAST_SCAN;
1899
1900 start = 0;
1901 rcu_read_lock();
1902restart:
1903 radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
1904 start, SHMEM_TAG_PINNED) {
1905
1906 page = radix_tree_deref_slot(slot);
1907 if (radix_tree_exception(page)) {
1908 if (radix_tree_deref_retry(page))
1909 goto restart;
1910
 /* Exceptional entry (presumably a swap entry — confirm): no
  * page to check, treat as not pinned. */
1911 page = NULL;
1912 }
1913
1914 if (page &&
1915 page_count(page) - page_mapcount(page) != 1) {
 /* Still pinned: keep the tag and rescan on a later pass ... */
1916 if (scan < LAST_SCAN)
1917 goto continue_resched;
1918
1919 /*
1920 * On the last scan, we clean up all those tags
1921 * we inserted; but make a note that we still
1922 * found pages pinned.
1923 */
1924 error = -EBUSY;
1925 }
1926
 /* ... otherwise (or on the final scan) drop the tag now. */
1927 spin_lock_irq(&mapping->tree_lock);
1928 radix_tree_tag_clear(&mapping->page_tree,
1929 iter.index, SHMEM_TAG_PINNED);
1930 spin_unlock_irq(&mapping->tree_lock);
1931continue_resched:
1932 if (need_resched()) {
 /* Yield RCU and resume the tagged walk past this index. */
1933 cond_resched_rcu();
1934 start = iter.index + 1;
1935 goto restart;
1936 }
1937 }
1938 rcu_read_unlock();
1939 }
1940
1941 return error;
1834} 1942}
1835 1943
1836#define F_ALL_SEALS (F_SEAL_SEAL | \ 1944#define F_ALL_SEALS (F_SEAL_SEAL | \