aboutsummaryrefslogtreecommitdiffstats
path: root/mm/shmem.c
diff options
context:
space:
mode:
author: Hugh Dickins <hughd@google.com> 2012-05-29 18:06:41 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-05-29 19:22:23 -0400
commit: e2d12e22c59ce714008aa5266d769f8568d74eac (patch)
tree: c4c52d14887c54304af947c2a6c75cb8d1bb7826 /mm/shmem.c
parent: 17cf28afea2a1112f240a3a2da8af883be024811 (diff)
tmpfs: support fallocate preallocation
The systemd plumbers expressed a wish that tmpfs support preallocation. Cong Wang wrote a patch, but several kernel guys expressed scepticism: https://lkml.org/lkml/2011/11/18/137

Christoph Hellwig: What for exactly? Please explain why preallocating on tmpfs would make any sense.

Kay Sievers: To be able to safely use mmap(), regarding SIGBUS, on files on the /dev/shm filesystem. The glibc fallback loop for -ENOSYS [or -EOPNOTSUPP] on fallocate is just ugly.

Hugh Dickins: If tmpfs is going to support fallocate(FALLOC_FL_PUNCH_HOLE), it would seem perverse to permit the deallocation but fail the allocation.

Christoph Hellwig: Agreed.

Now that we do have shmem_fallocate() for hole-punching, plumb in basic support for preallocation mode too. It's fairly straightforward (though quite a few details needed attention), except for when it fails part way through. What a pity that fallocate(2) was not specified to return the length allocated, permitting short fallocations!

As it is, when it fails part way through, we ought to free what has just been allocated by this system call; but must be very sure not to free any allocated earlier, or any allocated by racing accesses (not all excluded by i_mutex).

But we cannot distinguish them: so in this patch simply leak allocations on partial failure (they will be freed later if the file is removed).

An attractive alternative approach would have been for fallocate() not to allocate pages at all, but note reservations by entries in the radix-tree. But that would give less assurance, and, critically, would be hard to fit with mem cgroups (who owns the reservations?): allocating pages lets fallocate() behave in just the same way as write().
Based-on-patch-by: Cong Wang <amwang@redhat.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Cong Wang <amwang@redhat.com>
Cc: Kay Sievers <kay@vrfy.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/shmem.c')
-rw-r--r-- mm/shmem.c 61
1 files changed, 60 insertions, 1 deletions
diff --git a/mm/shmem.c b/mm/shmem.c
index f368d0acb52c..9b90d89e54ce 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1602,7 +1602,9 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1602 loff_t len) 1602 loff_t len)
1603{ 1603{
1604 struct inode *inode = file->f_path.dentry->d_inode; 1604 struct inode *inode = file->f_path.dentry->d_inode;
1605 int error = -EOPNOTSUPP; 1605 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1606 pgoff_t start, index, end;
1607 int error;
1606 1608
1607 mutex_lock(&inode->i_mutex); 1609 mutex_lock(&inode->i_mutex);
1608 1610
@@ -1617,8 +1619,65 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1617 shmem_truncate_range(inode, offset, offset + len - 1); 1619 shmem_truncate_range(inode, offset, offset + len - 1);
1618 /* No need to unmap again: hole-punching leaves COWed pages */ 1620 /* No need to unmap again: hole-punching leaves COWed pages */
1619 error = 0; 1621 error = 0;
1622 goto out;
1620 } 1623 }
1621 1624
1625 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
1626 error = inode_newsize_ok(inode, offset + len);
1627 if (error)
1628 goto out;
1629
1630 start = offset >> PAGE_CACHE_SHIFT;
1631 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1632 /* Try to avoid a swapstorm if len is impossible to satisfy */
1633 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
1634 error = -ENOSPC;
1635 goto out;
1636 }
1637
1638 for (index = start; index < end; index++) {
1639 struct page *page;
1640
1641 /*
1642 * Good, the fallocate(2) manpage permits EINTR: we may have
1643 * been interrupted because we are using up too much memory.
1644 */
1645 if (signal_pending(current))
1646 error = -EINTR;
1647 else
1648 error = shmem_getpage(inode, index, &page, SGP_WRITE,
1649 NULL);
1650 if (error) {
1651 /*
1652 * We really ought to free what we allocated so far,
1653 * but it would be wrong to free pages allocated
1654 * earlier, or already now in use: i_mutex does not
1655 * exclude all cases. We do not know what to free.
1656 */
1657 goto ctime;
1658 }
1659
1660 if (!PageUptodate(page)) {
1661 clear_highpage(page);
1662 flush_dcache_page(page);
1663 SetPageUptodate(page);
1664 }
1665 /*
1666 * set_page_dirty so that memory pressure will swap rather
1667 * than free the pages we are allocating (and SGP_CACHE pages
1668 * might still be clean: we now need to mark those dirty too).
1669 */
1670 set_page_dirty(page);
1671 unlock_page(page);
1672 page_cache_release(page);
1673 cond_resched();
1674 }
1675
1676 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
1677 i_size_write(inode, offset + len);
1678ctime:
1679 inode->i_ctime = CURRENT_TIME;
1680out:
1622 mutex_unlock(&inode->i_mutex); 1681 mutex_unlock(&inode->i_mutex);
1623 return error; 1682 return error;
1624} 1683}