aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDavid Gibson <david@gibson.dropbear.id.au>2012-03-21 19:34:12 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-03-21 20:54:59 -0400
commit90481622d75715bfcb68501280a917dbfe516029 (patch)
tree63f7d9e4455366ab326ee74e6b39acf76b618fcf /fs
parenta1d776ee3147cec2a54a645e92eb2e3e2f65a137 (diff)
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the general quota handling code, and they don't much resemble its behaviour. Rather than being about maintaining limits on on-disk block usage by particular users, they are instead about maintaining limits on in-memory page usage (including anonymous MAP_PRIVATE copy-on-write pages) associated with a particular hugetlbfs filesystem instance. Worse, they work by having callbacks to the hugetlbfs filesystem code from the low-level page handling code, in particular from free_huge_page(). This is a layering violation in itself, but more importantly, if the kernel does a get_user_pages() on hugepages (which can happen from KVM amongst others), then the free_huge_page() can be delayed until after the associated inode has already been freed. If an unmount occurs at the wrong time, even the hugetlbfs superblock where the "quota" limits are stored may have been freed. Andrew Barry proposed a patch to fix this by having hugepages, instead of storing a pointer to their address_space and reaching the superblock from there, store pointers directly to the superblock, bumping the reference count as appropriate to avoid it being freed. Andrew Morton rejected that version, however, on the grounds that it made the existing layering violation worse. This is a reworked version of Andrew Barry's patch, which removes the extra, and some of the existing, layering violation. It works by introducing the concept of a hugepage "subpool" at the lower hugepage mm layer - that is, a finite logical pool of hugepages to allocate from. hugetlbfs now creates a subpool for each filesystem instance with a page limit set, and a pointer to the subpool gets added to each allocated hugepage, instead of the address_space pointer used now. The subpool has its own lifetime and is only freed once all pages in it _and_ all other references to it (i.e. superblocks) are gone. 
Subpools are optional - a NULL subpool pointer is taken by the code to mean that no subpool limits are in effect. Previous discussion of this bug can be found in: "Fix refcounting in hugetlbfs quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or http://marc.info/?l=linux-mm&m=126928970510627&w=1 v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to alloc_huge_page() - since it already takes the vma, it is not necessary. Signed-off-by: Andrew Barry <abarry@cray.com> Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Cc: Hugh Dickins <hughd@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Minchan Kim <minchan.kim@gmail.com> Cc: Hillf Danton <dhillf@gmail.com> Cc: Paul Mackerras <paulus@samba.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/hugetlbfs/inode.c54
1 files changed, 21 insertions, 33 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4fbd9fccd550..7913e3252167 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -626,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
626 spin_lock(&sbinfo->stat_lock); 626 spin_lock(&sbinfo->stat_lock);
627 /* If no limits set, just report 0 for max/free/used 627 /* If no limits set, just report 0 for max/free/used
628 * blocks, like simple_statfs() */ 628 * blocks, like simple_statfs() */
629 if (sbinfo->max_blocks >= 0) { 629 if (sbinfo->spool) {
630 buf->f_blocks = sbinfo->max_blocks; 630 long free_pages;
631 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 631
632 spin_lock(&sbinfo->spool->lock);
633 buf->f_blocks = sbinfo->spool->max_hpages;
634 free_pages = sbinfo->spool->max_hpages
635 - sbinfo->spool->used_hpages;
636 buf->f_bavail = buf->f_bfree = free_pages;
637 spin_unlock(&sbinfo->spool->lock);
632 buf->f_files = sbinfo->max_inodes; 638 buf->f_files = sbinfo->max_inodes;
633 buf->f_ffree = sbinfo->free_inodes; 639 buf->f_ffree = sbinfo->free_inodes;
634 } 640 }
@@ -644,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
644 650
645 if (sbi) { 651 if (sbi) {
646 sb->s_fs_info = NULL; 652 sb->s_fs_info = NULL;
653
654 if (sbi->spool)
655 hugepage_put_subpool(sbi->spool);
656
647 kfree(sbi); 657 kfree(sbi);
648 } 658 }
649} 659}
@@ -874,10 +884,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
874 sb->s_fs_info = sbinfo; 884 sb->s_fs_info = sbinfo;
875 sbinfo->hstate = config.hstate; 885 sbinfo->hstate = config.hstate;
876 spin_lock_init(&sbinfo->stat_lock); 886 spin_lock_init(&sbinfo->stat_lock);
877 sbinfo->max_blocks = config.nr_blocks;
878 sbinfo->free_blocks = config.nr_blocks;
879 sbinfo->max_inodes = config.nr_inodes; 887 sbinfo->max_inodes = config.nr_inodes;
880 sbinfo->free_inodes = config.nr_inodes; 888 sbinfo->free_inodes = config.nr_inodes;
889 sbinfo->spool = NULL;
890 if (config.nr_blocks != -1) {
891 sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
892 if (!sbinfo->spool)
893 goto out_free;
894 }
881 sb->s_maxbytes = MAX_LFS_FILESIZE; 895 sb->s_maxbytes = MAX_LFS_FILESIZE;
882 sb->s_blocksize = huge_page_size(config.hstate); 896 sb->s_blocksize = huge_page_size(config.hstate);
883 sb->s_blocksize_bits = huge_page_shift(config.hstate); 897 sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -896,38 +910,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
896 sb->s_root = root; 910 sb->s_root = root;
897 return 0; 911 return 0;
898out_free: 912out_free:
913 if (sbinfo->spool)
914 kfree(sbinfo->spool);
899 kfree(sbinfo); 915 kfree(sbinfo);
900 return -ENOMEM; 916 return -ENOMEM;
901} 917}
902 918
903int hugetlb_get_quota(struct address_space *mapping, long delta)
904{
905 int ret = 0;
906 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
907
908 if (sbinfo->free_blocks > -1) {
909 spin_lock(&sbinfo->stat_lock);
910 if (sbinfo->free_blocks - delta >= 0)
911 sbinfo->free_blocks -= delta;
912 else
913 ret = -ENOMEM;
914 spin_unlock(&sbinfo->stat_lock);
915 }
916
917 return ret;
918}
919
920void hugetlb_put_quota(struct address_space *mapping, long delta)
921{
922 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
923
924 if (sbinfo->free_blocks > -1) {
925 spin_lock(&sbinfo->stat_lock);
926 sbinfo->free_blocks += delta;
927 spin_unlock(&sbinfo->stat_lock);
928 }
929}
930
931static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, 919static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
932 int flags, const char *dev_name, void *data) 920 int flags, const char *dev_name, void *data)
933{ 921{