shmem: fix faulting into a hole, not taking i_mutex

Commit f00cdc6df7d7 ("shmem: fix faulting into a hole while it's punched") was buggy: Sasha sent a lockdep report to remind us that grabbing i_mutex in the fault path is a no-no (write syscall may already hold i_mutex while faulting user buffer). We tried a completely different approach (see following patch) but that proved inadequate: good enough for a rational workload, but not good enough against trinity - which forks off so many mappings of the object that contention on i_mmap_mutex while hole-puncher holds i_mutex builds into serious starvation when concurrent faults force the puncher to fall back to single-page unmap_mapping_range() searches of the i_mmap tree. So return to the original umbrella approach, but keep away from i_mutex this time. We really don't want to bloat every shmem inode with a new mutex or completion, just to protect this unlikely case from trinity. So extend the original with wait_queue_head on stack at the hole-punch end, and wait_queue item on the stack at the fault end. This involves further use of i_lock to guard against the races: lockdep has been happy so far, and I see fs/inode.c:unlock_new_inode() holds i_lock around wake_up_bit(), which is comparable to what we do here. i_lock is more convenient, but we could switch to shmem's info->lock. This issue has been tagged with CVE-2014-4171, which will require commit f00cdc6df7d7 and this and the following patch to be backported: we suggest to 3.1+, though in fact the trinity forkbomb effect might go back as far as 2.6.16, when madvise(,,MADV_REMOVE) came in - or might not, since much has changed, with i_mmap_mutex a spinlock before 3.0. Anyone running trinity on 3.0 and earlier? I don't think we need care. 
Signed-off-by: Hugh Dickins <hughd@google.com> Reported-by: Sasha Levin <sasha.levin@oracle.com> Tested-by: Sasha Levin <sasha.levin@oracle.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Konstantin Khlebnikov <koct9i@gmail.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Lukas Czerner <lczerner@redhat.com> Cc: Dave Jones <davej@redhat.com> Cc: <stable@vger.kernel.org> [3.1+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Hugh Dickins <hughd@google.com> 2014-07-23 17:00:10 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-07-23 18:10:54 -0400
commit: 8e205f779d1443a94b5ae81aa359cb535dd3021e (patch)
tree: e29022b5d5b9acc5e862c7c46100c89e888a5387 /mm/shmem.c
parent: c118678bc79e8241f9d3434d9324c6400d72f48a (diff)
1 files changed, 52 insertions, 26 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index 1140f49b6ded..c0719f082246 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -85,7 +85,7 @@ static struct vfsmount *shm_mnt;
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
 struct shmem_falloc {
-        int     mode;           /* FALLOC_FL mode currently operating */
+        wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
        pgoff_t start;          /* start of range currently being fallocated */
        pgoff_t next;           /* the next page offset to be fallocated */
        pgoff_t nr_falloced;    /* how many new pages have been fallocated */
@@ -760,7 +760,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
                        spin_lock(&inode->i_lock);
                        shmem_falloc = inode->i_private;
                        if (shmem_falloc &&
-                            !shmem_falloc->mode &&
+                            !shmem_falloc->waitq &&
                            index >= shmem_falloc->start &&
                            index < shmem_falloc->next)
                                shmem_falloc->nr_unswapped++;
@@ -1248,38 +1248,58 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
         * Trinity finds that probing a hole which tmpfs is punching can
         * prevent the hole-punch from ever completing: which in turn
         * locks writers out with its hold on i_mutex.  So refrain from
-         * faulting pages into the hole while it's being punched, and
+         * faulting pages into the hole while it's being punched.  Although
-         * wait on i_mutex to be released if vmf->flags permits.
+         * shmem_undo_range() does remove the additions, it may be unable to
+         * keep up, as each new page needs its own unmap_mapping_range() call,
+         * and the i_mmap tree grows ever slower to scan if new vmas are added.
+         *
+         * It does not matter if we sometimes reach this check just before the
+         * hole-punch begins, so that one fault then races with the punch:
+         * we just need to make racing faults a rare case.
+         *
+         * The implementation below would be much simpler if we just used a
+         * standard mutex or completion: but we cannot take i_mutex in fault,
+         * and bloating every shmem inode for this unlikely case would be sad.
         */
        if (unlikely(inode->i_private)) {
                struct shmem_falloc *shmem_falloc;
                spin_lock(&inode->i_lock);
                shmem_falloc = inode->i_private;
-                if (!shmem_falloc ||
+                if (shmem_falloc &&
-                    shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE ||
+                    shmem_falloc->waitq &&
-                    vmf->pgoff < shmem_falloc->start ||
+                    vmf->pgoff >= shmem_falloc->start &&
-                    vmf->pgoff >= shmem_falloc->next)
+                    vmf->pgoff < shmem_falloc->next) {
-                        shmem_falloc = NULL;
+                        wait_queue_head_t *shmem_falloc_waitq;
-                spin_unlock(&inode->i_lock);
+                        DEFINE_WAIT(shmem_fault_wait);
-                /*
-                 * i_lock has protected us from taking shmem_falloc seriously
+                        ret = VM_FAULT_NOPAGE;
-                 * once return from shmem_fallocate() went back up that stack.
-                 * i_lock does not serialize with i_mutex at all, but it does
-                 * not matter if sometimes we wait unnecessarily, or sometimes
-                 * miss out on waiting: we just need to make those cases rare.
-                 */
-                if (shmem_falloc) {
                        if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
                           !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+                                /* It's polite to up mmap_sem if we can */
                                up_read(&vma->vm_mm->mmap_sem);
-                                mutex_lock(&inode->i_mutex);
+                                ret = VM_FAULT_RETRY;
-                                mutex_unlock(&inode->i_mutex);
-                                return VM_FAULT_RETRY;
                        }
-                        /* cond_resched? Leave that to GUP or return to user */
-                        return VM_FAULT_NOPAGE;
+                        shmem_falloc_waitq = shmem_falloc->waitq;
+                        prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+                                        TASK_UNINTERRUPTIBLE);
+                        spin_unlock(&inode->i_lock);
+                        schedule();
+                        /*
+                         * shmem_falloc_waitq points into the shmem_fallocate()
+                         * stack of the hole-punching task: shmem_falloc_waitq
+                         * is usually invalid by the time we reach here, but
+                         * finish_wait() does not dereference it in that case;
+                         * though i_lock needed lest racing with wake_up_all().
+                         */
+                        spin_lock(&inode->i_lock);
+                        finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+                        spin_unlock(&inode->i_lock);
+                        return ret;
                }
+                spin_unlock(&inode->i_lock);
        }
        error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
@@ -1774,13 +1794,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
        mutex_lock(&inode->i_mutex);
-        shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE;
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                struct address_space *mapping = file->f_mapping;
                loff_t unmap_start = round_up(offset, PAGE_SIZE);
                loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+                DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+                shmem_falloc.waitq = &shmem_falloc_waitq;
                shmem_falloc.start = unmap_start >> PAGE_SHIFT;
                shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
                spin_lock(&inode->i_lock);
@@ -1792,8 +1812,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                                            1 + unmap_end - unmap_start, 0);
                shmem_truncate_range(inode, offset, offset + len - 1);
                /* No need to unmap again: hole-punching leaves COWed pages */
+                spin_lock(&inode->i_lock);
+                inode->i_private = NULL;
+                wake_up_all(&shmem_falloc_waitq);
+                spin_unlock(&inode->i_lock);
                error = 0;
-                goto undone;
+                goto out;
        }
        /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
@@ -1809,6 +1834,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                goto out;
        }
+        shmem_falloc.waitq = NULL;
        shmem_falloc.start = start;
        shmem_falloc.next  = start;
        shmem_falloc.nr_falloced = 0;
author	Hugh Dickins <hughd@google.com>	2014-07-23 17:00:10 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-07-23 18:10:54 -0400
commit	8e205f779d1443a94b5ae81aa359cb535dd3021e (patch)
tree	e29022b5d5b9acc5e862c7c46100c89e888a5387 /mm/shmem.c
parent	c118678bc79e8241f9d3434d9324c6400d72f48a (diff)

diff --git a/mm/shmem.c b/mm/shmem.c index 1140f49b6ded..c0719f082246 100644 --- a/mm/shmem.c +++ b/mm/shmem.c
@@ -85,7 +85,7 @@ static struct vfsmount *shm_mnt;
85	* a time): we would prefer not to enlarge the shmem inode just for that.	85	* a time): we would prefer not to enlarge the shmem inode just for that.
86	*/	86	*/
87	struct shmem_falloc {	87	struct shmem_falloc {
88	int mode; /* FALLOC_FL mode currently operating */	88	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
89	pgoff_t start; /* start of range currently being fallocated */	89	pgoff_t start; /* start of range currently being fallocated */
90	pgoff_t next; /* the next page offset to be fallocated */	90	pgoff_t next; /* the next page offset to be fallocated */
91	pgoff_t nr_falloced; /* how many new pages have been fallocated */	91	pgoff_t nr_falloced; /* how many new pages have been fallocated */
@@ -760,7 +760,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
760	spin_lock(&inode->i_lock);	760	spin_lock(&inode->i_lock);
761	shmem_falloc = inode->i_private;	761	shmem_falloc = inode->i_private;
762	if (shmem_falloc &&	762	if (shmem_falloc &&
763	!shmem_falloc->mode &&	763	!shmem_falloc->waitq &&
764	index >= shmem_falloc->start &&	764	index >= shmem_falloc->start &&
765	index < shmem_falloc->next)	765	index < shmem_falloc->next)
766	shmem_falloc->nr_unswapped++;	766	shmem_falloc->nr_unswapped++;
@@ -1248,38 +1248,58 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1248	* Trinity finds that probing a hole which tmpfs is punching can	1248	* Trinity finds that probing a hole which tmpfs is punching can
1249	* prevent the hole-punch from ever completing: which in turn	1249	* prevent the hole-punch from ever completing: which in turn
1250	* locks writers out with its hold on i_mutex. So refrain from	1250	* locks writers out with its hold on i_mutex. So refrain from
1251	* faulting pages into the hole while it's being punched, and	1251	* faulting pages into the hole while it's being punched. Although
1252	* wait on i_mutex to be released if vmf->flags permits.	1252	* shmem_undo_range() does remove the additions, it may be unable to
		1253	* keep up, as each new page needs its own unmap_mapping_range() call,
		1254	* and the i_mmap tree grows ever slower to scan if new vmas are added.
		1255	*
		1256	* It does not matter if we sometimes reach this check just before the
		1257	* hole-punch begins, so that one fault then races with the punch:
		1258	* we just need to make racing faults a rare case.
		1259	*
		1260	* The implementation below would be much simpler if we just used a
		1261	* standard mutex or completion: but we cannot take i_mutex in fault,
		1262	* and bloating every shmem inode for this unlikely case would be sad.
1253	*/	1263	*/
1254	if (unlikely(inode->i_private)) {	1264	if (unlikely(inode->i_private)) {
1255	struct shmem_falloc *shmem_falloc;	1265	struct shmem_falloc *shmem_falloc;
1256		1266
1257	spin_lock(&inode->i_lock);	1267	spin_lock(&inode->i_lock);
1258	shmem_falloc = inode->i_private;	1268	shmem_falloc = inode->i_private;
1259	if (!shmem_falloc ||	1269	if (shmem_falloc &&
1260	shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE ||	1270	shmem_falloc->waitq &&
1261	vmf->pgoff < shmem_falloc->start ||	1271	vmf->pgoff >= shmem_falloc->start &&
1262	vmf->pgoff >= shmem_falloc->next)	1272	vmf->pgoff < shmem_falloc->next) {
1263	shmem_falloc = NULL;	1273	wait_queue_head_t *shmem_falloc_waitq;
1264	spin_unlock(&inode->i_lock);	1274	DEFINE_WAIT(shmem_fault_wait);
1265	/*	1275
1266	* i_lock has protected us from taking shmem_falloc seriously	1276	ret = VM_FAULT_NOPAGE;
1267	* once return from shmem_fallocate() went back up that stack.
1268	* i_lock does not serialize with i_mutex at all, but it does
1269	* not matter if sometimes we wait unnecessarily, or sometimes
1270	* miss out on waiting: we just need to make those cases rare.
1271	*/
1272	if (shmem_falloc) {
1273	if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&	1277	if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
1274	!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {	1278	!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
		1279	/* It's polite to up mmap_sem if we can */
1275	up_read(&vma->vm_mm->mmap_sem);	1280	up_read(&vma->vm_mm->mmap_sem);
1276	mutex_lock(&inode->i_mutex);	1281	ret = VM_FAULT_RETRY;
1277	mutex_unlock(&inode->i_mutex);
1278	return VM_FAULT_RETRY;
1279	}	1282	}
1280	/* cond_resched? Leave that to GUP or return to user */	1283
1281	return VM_FAULT_NOPAGE;	1284	shmem_falloc_waitq = shmem_falloc->waitq;
		1285	prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
		1286	TASK_UNINTERRUPTIBLE);
		1287	spin_unlock(&inode->i_lock);
		1288	schedule();
		1289
		1290	/*
		1291	* shmem_falloc_waitq points into the shmem_fallocate()
		1292	* stack of the hole-punching task: shmem_falloc_waitq
		1293	* is usually invalid by the time we reach here, but
		1294	* finish_wait() does not dereference it in that case;
		1295	* though i_lock needed lest racing with wake_up_all().
		1296	*/
		1297	spin_lock(&inode->i_lock);
		1298	finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
		1299	spin_unlock(&inode->i_lock);
		1300	return ret;
1282	}	1301	}
		1302	spin_unlock(&inode->i_lock);
1283	}	1303	}
1284		1304
1285	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);	1305	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
@@ -1774,13 +1794,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1774		1794
1775	mutex_lock(&inode->i_mutex);	1795	mutex_lock(&inode->i_mutex);
1776		1796
1777	shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE;
1778
1779	if (mode & FALLOC_FL_PUNCH_HOLE) {	1797	if (mode & FALLOC_FL_PUNCH_HOLE) {
1780	struct address_space *mapping = file->f_mapping;	1798	struct address_space *mapping = file->f_mapping;
1781	loff_t unmap_start = round_up(offset, PAGE_SIZE);	1799	loff_t unmap_start = round_up(offset, PAGE_SIZE);
1782	loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;	1800	loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
		1801	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
1783		1802
		1803	shmem_falloc.waitq = &shmem_falloc_waitq;
1784	shmem_falloc.start = unmap_start >> PAGE_SHIFT;	1804	shmem_falloc.start = unmap_start >> PAGE_SHIFT;
1785	shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;	1805	shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
1786	spin_lock(&inode->i_lock);	1806	spin_lock(&inode->i_lock);
@@ -1792,8 +1812,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1792	1 + unmap_end - unmap_start, 0);	1812	1 + unmap_end - unmap_start, 0);
1793	shmem_truncate_range(inode, offset, offset + len - 1);	1813	shmem_truncate_range(inode, offset, offset + len - 1);
1794	/* No need to unmap again: hole-punching leaves COWed pages */	1814	/* No need to unmap again: hole-punching leaves COWed pages */
		1815
		1816	spin_lock(&inode->i_lock);
		1817	inode->i_private = NULL;
		1818	wake_up_all(&shmem_falloc_waitq);
		1819	spin_unlock(&inode->i_lock);
1795	error = 0;	1820	error = 0;
1796	goto undone;	1821	goto out;
1797	}	1822	}
1798		1823
1799	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */	1824	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
@@ -1809,6 +1834,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1809	goto out;	1834	goto out;
1810	}	1835	}
1811		1836
		1837	shmem_falloc.waitq = NULL;
1812	shmem_falloc.start = start;	1838	shmem_falloc.start = start;
1813	shmem_falloc.next = start;	1839	shmem_falloc.next = start;
1814	shmem_falloc.nr_falloced = 0;	1840	shmem_falloc.nr_falloced = 0;