diff options
author | Hugh Dickins <hughd@google.com> | 2014-07-23 17:00:10 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-07-23 18:10:54 -0400 |
commit | 8e205f779d1443a94b5ae81aa359cb535dd3021e (patch) | |
tree | e29022b5d5b9acc5e862c7c46100c89e888a5387 /mm/shmem.c | |
parent | c118678bc79e8241f9d3434d9324c6400d72f48a (diff) |
shmem: fix faulting into a hole, not taking i_mutex
Commit f00cdc6df7d7 ("shmem: fix faulting into a hole while it's
punched") was buggy: Sasha sent a lockdep report to remind us that
grabbing i_mutex in the fault path is a no-no (write syscall may already
hold i_mutex while faulting user buffer).
We tried a completely different approach (see following patch) but that
proved inadequate: good enough for a rational workload, but not good
enough against trinity - which forks off so many mappings of the object
that contention on i_mmap_mutex while hole-puncher holds i_mutex builds
into serious starvation when concurrent faults force the puncher to fall
back to single-page unmap_mapping_range() searches of the i_mmap tree.
So return to the original umbrella approach, but keep away from i_mutex
this time. We really don't want to bloat every shmem inode with a new
mutex or completion, just to protect this unlikely case from trinity.
So extend the original with a wait_queue_head on the stack at the hole-punch
end, and a wait_queue item on the stack at the fault end.
This involves further use of i_lock to guard against the races: lockdep
has been happy so far, and I see fs/inode.c:unlock_new_inode() holds
i_lock around wake_up_bit(), which is comparable to what we do here.
i_lock is more convenient, but we could switch to shmem's info->lock.
This issue has been tagged with CVE-2014-4171, which will require commit
f00cdc6df7d7 and this and the following patch to be backported: we
suggest backporting to 3.1+, though in fact the trinity forkbomb effect might go
back as far as 2.6.16, when madvise(,,MADV_REMOVE) came in - or might
not, since much has changed, with i_mmap_mutex a spinlock before 3.0.
Is anyone running trinity on 3.0 and earlier? I don't think we need to care.
Signed-off-by: Hugh Dickins <hughd@google.com>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lukas Czerner <lczerner@redhat.com>
Cc: Dave Jones <davej@redhat.com>
Cc: <stable@vger.kernel.org> [3.1+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/shmem.c')
-rw-r--r-- | mm/shmem.c | 78 |
1 file changed, 52 insertions, 26 deletions
diff --git a/mm/shmem.c b/mm/shmem.c index 1140f49b6ded..c0719f082246 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -85,7 +85,7 @@ static struct vfsmount *shm_mnt; | |||
85 | * a time): we would prefer not to enlarge the shmem inode just for that. | 85 | * a time): we would prefer not to enlarge the shmem inode just for that. |
86 | */ | 86 | */ |
87 | struct shmem_falloc { | 87 | struct shmem_falloc { |
88 | int mode; /* FALLOC_FL mode currently operating */ | 88 | wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ |
89 | pgoff_t start; /* start of range currently being fallocated */ | 89 | pgoff_t start; /* start of range currently being fallocated */ |
90 | pgoff_t next; /* the next page offset to be fallocated */ | 90 | pgoff_t next; /* the next page offset to be fallocated */ |
91 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ | 91 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ |
@@ -760,7 +760,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
760 | spin_lock(&inode->i_lock); | 760 | spin_lock(&inode->i_lock); |
761 | shmem_falloc = inode->i_private; | 761 | shmem_falloc = inode->i_private; |
762 | if (shmem_falloc && | 762 | if (shmem_falloc && |
763 | !shmem_falloc->mode && | 763 | !shmem_falloc->waitq && |
764 | index >= shmem_falloc->start && | 764 | index >= shmem_falloc->start && |
765 | index < shmem_falloc->next) | 765 | index < shmem_falloc->next) |
766 | shmem_falloc->nr_unswapped++; | 766 | shmem_falloc->nr_unswapped++; |
@@ -1248,38 +1248,58 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1248 | * Trinity finds that probing a hole which tmpfs is punching can | 1248 | * Trinity finds that probing a hole which tmpfs is punching can |
1249 | * prevent the hole-punch from ever completing: which in turn | 1249 | * prevent the hole-punch from ever completing: which in turn |
1250 | * locks writers out with its hold on i_mutex. So refrain from | 1250 | * locks writers out with its hold on i_mutex. So refrain from |
1251 | * faulting pages into the hole while it's being punched, and | 1251 | * faulting pages into the hole while it's being punched. Although |
1252 | * wait on i_mutex to be released if vmf->flags permits. | 1252 | * shmem_undo_range() does remove the additions, it may be unable to |
1253 | * keep up, as each new page needs its own unmap_mapping_range() call, | ||
1254 | * and the i_mmap tree grows ever slower to scan if new vmas are added. | ||
1255 | * | ||
1256 | * It does not matter if we sometimes reach this check just before the | ||
1257 | * hole-punch begins, so that one fault then races with the punch: | ||
1258 | * we just need to make racing faults a rare case. | ||
1259 | * | ||
1260 | * The implementation below would be much simpler if we just used a | ||
1261 | * standard mutex or completion: but we cannot take i_mutex in fault, | ||
1262 | * and bloating every shmem inode for this unlikely case would be sad. | ||
1253 | */ | 1263 | */ |
1254 | if (unlikely(inode->i_private)) { | 1264 | if (unlikely(inode->i_private)) { |
1255 | struct shmem_falloc *shmem_falloc; | 1265 | struct shmem_falloc *shmem_falloc; |
1256 | 1266 | ||
1257 | spin_lock(&inode->i_lock); | 1267 | spin_lock(&inode->i_lock); |
1258 | shmem_falloc = inode->i_private; | 1268 | shmem_falloc = inode->i_private; |
1259 | if (!shmem_falloc || | 1269 | if (shmem_falloc && |
1260 | shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE || | 1270 | shmem_falloc->waitq && |
1261 | vmf->pgoff < shmem_falloc->start || | 1271 | vmf->pgoff >= shmem_falloc->start && |
1262 | vmf->pgoff >= shmem_falloc->next) | 1272 | vmf->pgoff < shmem_falloc->next) { |
1263 | shmem_falloc = NULL; | 1273 | wait_queue_head_t *shmem_falloc_waitq; |
1264 | spin_unlock(&inode->i_lock); | 1274 | DEFINE_WAIT(shmem_fault_wait); |
1265 | /* | 1275 | |
1266 | * i_lock has protected us from taking shmem_falloc seriously | 1276 | ret = VM_FAULT_NOPAGE; |
1267 | * once return from shmem_fallocate() went back up that stack. | ||
1268 | * i_lock does not serialize with i_mutex at all, but it does | ||
1269 | * not matter if sometimes we wait unnecessarily, or sometimes | ||
1270 | * miss out on waiting: we just need to make those cases rare. | ||
1271 | */ | ||
1272 | if (shmem_falloc) { | ||
1273 | if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && | 1277 | if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && |
1274 | !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { | 1278 | !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { |
1279 | /* It's polite to up mmap_sem if we can */ | ||
1275 | up_read(&vma->vm_mm->mmap_sem); | 1280 | up_read(&vma->vm_mm->mmap_sem); |
1276 | mutex_lock(&inode->i_mutex); | 1281 | ret = VM_FAULT_RETRY; |
1277 | mutex_unlock(&inode->i_mutex); | ||
1278 | return VM_FAULT_RETRY; | ||
1279 | } | 1282 | } |
1280 | /* cond_resched? Leave that to GUP or return to user */ | 1283 | |
1281 | return VM_FAULT_NOPAGE; | 1284 | shmem_falloc_waitq = shmem_falloc->waitq; |
1285 | prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, | ||
1286 | TASK_UNINTERRUPTIBLE); | ||
1287 | spin_unlock(&inode->i_lock); | ||
1288 | schedule(); | ||
1289 | |||
1290 | /* | ||
1291 | * shmem_falloc_waitq points into the shmem_fallocate() | ||
1292 | * stack of the hole-punching task: shmem_falloc_waitq | ||
1293 | * is usually invalid by the time we reach here, but | ||
1294 | * finish_wait() does not dereference it in that case; | ||
1295 | * though i_lock needed lest racing with wake_up_all(). | ||
1296 | */ | ||
1297 | spin_lock(&inode->i_lock); | ||
1298 | finish_wait(shmem_falloc_waitq, &shmem_fault_wait); | ||
1299 | spin_unlock(&inode->i_lock); | ||
1300 | return ret; | ||
1282 | } | 1301 | } |
1302 | spin_unlock(&inode->i_lock); | ||
1283 | } | 1303 | } |
1284 | 1304 | ||
1285 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1305 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
@@ -1774,13 +1794,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, | |||
1774 | 1794 | ||
1775 | mutex_lock(&inode->i_mutex); | 1795 | mutex_lock(&inode->i_mutex); |
1776 | 1796 | ||
1777 | shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE; | ||
1778 | |||
1779 | if (mode & FALLOC_FL_PUNCH_HOLE) { | 1797 | if (mode & FALLOC_FL_PUNCH_HOLE) { |
1780 | struct address_space *mapping = file->f_mapping; | 1798 | struct address_space *mapping = file->f_mapping; |
1781 | loff_t unmap_start = round_up(offset, PAGE_SIZE); | 1799 | loff_t unmap_start = round_up(offset, PAGE_SIZE); |
1782 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; | 1800 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; |
1801 | DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); | ||
1783 | 1802 | ||
1803 | shmem_falloc.waitq = &shmem_falloc_waitq; | ||
1784 | shmem_falloc.start = unmap_start >> PAGE_SHIFT; | 1804 | shmem_falloc.start = unmap_start >> PAGE_SHIFT; |
1785 | shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; | 1805 | shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; |
1786 | spin_lock(&inode->i_lock); | 1806 | spin_lock(&inode->i_lock); |
@@ -1792,8 +1812,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, | |||
1792 | 1 + unmap_end - unmap_start, 0); | 1812 | 1 + unmap_end - unmap_start, 0); |
1793 | shmem_truncate_range(inode, offset, offset + len - 1); | 1813 | shmem_truncate_range(inode, offset, offset + len - 1); |
1794 | /* No need to unmap again: hole-punching leaves COWed pages */ | 1814 | /* No need to unmap again: hole-punching leaves COWed pages */ |
1815 | |||
1816 | spin_lock(&inode->i_lock); | ||
1817 | inode->i_private = NULL; | ||
1818 | wake_up_all(&shmem_falloc_waitq); | ||
1819 | spin_unlock(&inode->i_lock); | ||
1795 | error = 0; | 1820 | error = 0; |
1796 | goto undone; | 1821 | goto out; |
1797 | } | 1822 | } |
1798 | 1823 | ||
1799 | /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ | 1824 | /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ |
@@ -1809,6 +1834,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, | |||
1809 | goto out; | 1834 | goto out; |
1810 | } | 1835 | } |
1811 | 1836 | ||
1837 | shmem_falloc.waitq = NULL; | ||
1812 | shmem_falloc.start = start; | 1838 | shmem_falloc.start = start; |
1813 | shmem_falloc.next = start; | 1839 | shmem_falloc.next = start; |
1814 | shmem_falloc.nr_falloced = 0; | 1840 | shmem_falloc.nr_falloced = 0; |