author     David Gibson <david@gibson.dropbear.id.au>        2012-03-21 19:34:12 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-03-21 20:54:59 -0400
commit     90481622d75715bfcb68501280a917dbfe516029
tree       63f7d9e4455366ab326ee74e6b39acf76b618fcf
parent     a1d776ee3147cec2a54a645e92eb2e3e2f65a137
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.

Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation in itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the wrong
time, even the hugetlbfs superblock where the "quota" limits are stored
may have been freed.

Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, store pointers directly to the superblock, bumping the reference
count as appropriate to avoid it being freed. Andrew Morton rejected that
version, however, on the grounds that it made the existing layering
violation worse.

This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is,
a finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.

Subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.

Previous discussion of this bug can be found in "Fix refcounting in
hugetlbfs quota handling." See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1

v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter
to alloc_huge_page() - since it already takes the vma, it is not
necessary.

Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
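The lifetime rule at the heart of the fix - a subpool stays allocated
while either a handle reference or in-flight pages remain - can be
modelled outside the kernel. Below is a minimal userspace sketch of just
that rule, with pthreads standing in for kernel spinlocks; the names
mirror the patch, but this is illustrative code, not the kernel
implementation (which appears in the mm/hugetlb.c hunks further down):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>

    /* Userspace model of the patch's hugepage_subpool: a finite pool of
     * pages whose lifetime is governed by two counters - handle
     * references (count) and pages currently charged to it (used_pages). */
    struct subpool {
            pthread_mutex_t lock;
            long count;                     /* handles, e.g. the superblock */
            long max_pages, used_pages;
    };

    struct subpool *subpool_new(long max_pages)
    {
            struct subpool *sp = malloc(sizeof(*sp));

            if (!sp)
                    return NULL;
            pthread_mutex_init(&sp->lock, NULL);
            sp->count = 1;                  /* creator's reference */
            sp->max_pages = max_pages;
            sp->used_pages = 0;
            return sp;
    }

    /* Called with sp->lock held.  The pool is freed only when no handles
     * AND no charged pages remain - the invariant that closes the
     * use-after-free described above. */
    static void unlock_or_release(struct subpool *sp)
    {
            bool free_it = (sp->count == 0) && (sp->used_pages == 0);

            pthread_mutex_unlock(&sp->lock);
            if (free_it) {
                    pthread_mutex_destroy(&sp->lock);
                    free(sp);
            }
    }

    void subpool_put(struct subpool *sp)    /* drop a handle ("unmount") */
    {
            pthread_mutex_lock(&sp->lock);
            sp->count--;
            unlock_or_release(sp);
    }

    int subpool_get_pages(struct subpool *sp, long delta)
    {
            int ret = 0;

            pthread_mutex_lock(&sp->lock);
            if (sp->used_pages + delta <= sp->max_pages)
                    sp->used_pages += delta;
            else
                    ret = -1;               /* over the pool's limit */
            pthread_mutex_unlock(&sp->lock);
            return ret;
    }

    void subpool_put_pages(struct subpool *sp, long delta)
    {
            pthread_mutex_lock(&sp->lock);
            sp->used_pages -= delta;
            unlock_or_release(sp);          /* may free a post-unmount pool */
    }

    int main(void)
    {
            struct subpool *sp = subpool_new(2);

            if (!sp)
                    return 1;
            subpool_get_pages(sp, 2);       /* two pages in flight */
            subpool_put(sp);                /* "unmount": pool survives */
            subpool_put_pages(sp, 2);       /* last page back: pool freed */
            return 0;
    }

Because used_pages > 0 keeps the pool alive on its own, a page pinned
past unmount (for example by get_user_pages()) still has valid accounting
to return to - exactly the window the original code left open.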
-rw-r--r--   fs/hugetlbfs/inode.c       54
-rw-r--r--   include/linux/hugetlb.h    14
-rw-r--r--   mm/hugetlb.c              135
3 files changed, 139 insertions(+), 64 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4fbd9fccd550..7913e3252167 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -626,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	spin_lock(&sbinfo->stat_lock);
 	/* If no limits set, just report 0 for max/free/used
 	 * blocks, like simple_statfs() */
-	if (sbinfo->max_blocks >= 0) {
-		buf->f_blocks = sbinfo->max_blocks;
-		buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+	if (sbinfo->spool) {
+		long free_pages;
+
+		spin_lock(&sbinfo->spool->lock);
+		buf->f_blocks = sbinfo->spool->max_hpages;
+		free_pages = sbinfo->spool->max_hpages
+			- sbinfo->spool->used_hpages;
+		buf->f_bavail = buf->f_bfree = free_pages;
+		spin_unlock(&sbinfo->spool->lock);
 		buf->f_files = sbinfo->max_inodes;
 		buf->f_ffree = sbinfo->free_inodes;
 	}
@@ -644,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
 
 	if (sbi) {
 		sb->s_fs_info = NULL;
+
+		if (sbi->spool)
+			hugepage_put_subpool(sbi->spool);
+
 		kfree(sbi);
 	}
 }
@@ -874,10 +884,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbinfo;
 	sbinfo->hstate = config.hstate;
 	spin_lock_init(&sbinfo->stat_lock);
-	sbinfo->max_blocks = config.nr_blocks;
-	sbinfo->free_blocks = config.nr_blocks;
 	sbinfo->max_inodes = config.nr_inodes;
 	sbinfo->free_inodes = config.nr_inodes;
+	sbinfo->spool = NULL;
+	if (config.nr_blocks != -1) {
+		sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+		if (!sbinfo->spool)
+			goto out_free;
+	}
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = huge_page_size(config.hstate);
 	sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -896,38 +910,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_root = root;
 	return 0;
 out_free:
+	if (sbinfo->spool)
+		kfree(sbinfo->spool);
 	kfree(sbinfo);
 	return -ENOMEM;
 }
 
-int hugetlb_get_quota(struct address_space *mapping, long delta)
-{
-	int ret = 0;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
-		if (sbinfo->free_blocks - delta >= 0)
-			sbinfo->free_blocks -= delta;
-		else
-			ret = -ENOMEM;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
-	return ret;
-}
-
-void hugetlb_put_quota(struct address_space *mapping, long delta)
-{
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_blocks += delta;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-}
-
 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7adc4923e7ac..cf0181738c9e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -14,6 +14,15 @@ struct user_struct;
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
 
+struct hugepage_subpool {
+	spinlock_t lock;
+	long count;
+	long max_hpages, used_hpages;
+};
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
+void hugepage_put_subpool(struct hugepage_subpool *spool);
+
 int PageHuge(struct page *page);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
@@ -129,12 +138,11 @@ enum {
 
 #ifdef CONFIG_HUGETLBFS
 struct hugetlbfs_sb_info {
-	long	max_blocks;	/* blocks allowed */
-	long	free_blocks;	/* blocks free */
 	long	max_inodes;	/* inodes allowed */
 	long	free_inodes;	/* inodes free */
 	spinlock_t	stat_lock;
 	struct hstate *hstate;
+	struct hugepage_subpool *spool;
 };
 
 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
@@ -146,8 +154,6 @@ extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags);
-int hugetlb_get_quota(struct address_space *mapping, long delta);
-void hugetlb_put_quota(struct address_space *mapping, long delta);
 
 static inline int is_file_hugepages(struct file *file)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b1c314877334..afa057a1d3fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+	bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+	spin_unlock(&spool->lock);
+
+	/* If no pages are used, and no other handles to the subpool
+	 * remain, free the subpool */
+	if (free)
+		kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+	struct hugepage_subpool *spool;
+
+	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+	if (!spool)
+		return NULL;
+
+	spin_lock_init(&spool->lock);
+	spool->count = 1;
+	spool->max_hpages = nr_blocks;
+	spool->used_hpages = 0;
+
+	return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+	spin_lock(&spool->lock);
+	BUG_ON(!spool->count);
+	spool->count--;
+	unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+				      long delta)
+{
+	int ret = 0;
+
+	if (!spool)
+		return 0;
+
+	spin_lock(&spool->lock);
+	if ((spool->used_hpages + delta) <= spool->max_hpages) {
+		spool->used_hpages += delta;
+	} else {
+		ret = -ENOMEM;
+	}
+	spin_unlock(&spool->lock);
+
+	return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+				       long delta)
+{
+	if (!spool)
+		return;
+
+	spin_lock(&spool->lock);
+	spool->used_hpages -= delta;
+	/* If hugetlbfs_put_super couldn't free spool due to
+	 * an outstanding quota reference, free it now. */
+	unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+	return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+	return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
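The hunk above is the entire subpool API this patch introduces. Purely as
a reading aid - the function below is hypothetical, not code from the
patch - a caller pairing the operations the way hugetlbfs does would look
like:

    /* Hypothetical caller, mirroring hugetlbfs_fill_super() and
     * hugetlbfs_put_super(); not part of this patch. */
    static int example_mount_cycle(long nr_blocks)
    {
            struct hugepage_subpool *spool = NULL;

            if (nr_blocks != -1) {          /* -1 means "no limit" */
                    spool = hugepage_new_subpool(nr_blocks);
                    if (!spool)
                            return -ENOMEM;
            }

            /* While mounted, each allocation charges the pool with
             * hugepage_subpool_get_pages(spool, 1) - a NULL spool is
             * accepted and means unlimited - and stashes spool in
             * page_private(); free_huge_page() later returns the page
             * via hugepage_subpool_put_pages(spool, 1). */

            if (spool)                      /* put_subpool needs non-NULL */
                    hugepage_put_subpool(spool);
            return 0;
    }

Note that hugepage_put_subpool() drops only the superblock's reference:
if used_hpages is still nonzero because pages outlive the unmount,
unlock_or_release_subpool() defers the kfree() until the final
hugepage_subpool_put_pages() call.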
@@ -540,9 +618,9 @@ static void free_huge_page(struct page *page)
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct address_space *mapping;
+	struct hugepage_subpool *spool =
+		(struct hugepage_subpool *)page_private(page);
 
-	mapping = (struct address_space *) page_private(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
@@ -558,8 +636,7 @@ static void free_huge_page(struct page *page)
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	if (mapping)
-		hugetlb_put_quota(mapping, 1);
+	hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -977,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
 /*
  * Determine if the huge page at addr within the vma has an associated
  * reservation.  Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed.  Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur.  Where any new reservation would be required the
+ * reservation change is prepared, but not committed.  Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation.  No action is required on
+ * failure.
  */
 static long vma_needs_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
@@ -1030,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	struct inode *inode = mapping->host;
 	long chg;
 
 	/*
-	 * Processes that did not create the mapping will have no reserves and
-	 * will not have accounted against quota.  Check that the quota can be
-	 * made before satisfying the allocation
-	 * MAP_NORESERVE mappings may also need pages and quota allocated
-	 * if no reserve mapping overlaps.
+	 * Processes that did not create the mapping will have no
+	 * reserves and will not have accounted against the subpool
+	 * limit.  Check that the subpool limit can be made before
+	 * satisfying the allocation.  MAP_NORESERVE mappings may also
+	 * need pages and subpool limit allocated if no reserve
+	 * mapping overlaps.
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
 		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
+		if (hugepage_subpool_get_pages(spool, chg))
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
@@ -1057,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			hugetlb_put_quota(inode->i_mapping, chg);
+			hugepage_subpool_put_pages(spool, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_private(page, (unsigned long) mapping);
+	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
 
@@ -2083,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct resv_map *reservations = vma_resv_map(vma);
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve;
 	unsigned long start;
 	unsigned long end;
@@ -2098,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
-			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+			hugepage_subpool_put_pages(spool, reserve);
 		}
 	}
 }
@@ -2331,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	address = address & huge_page_mask(h);
 	pgoff = vma_hugecache_offset(h, vma, address);
-	mapping = (struct address_space *)page_private(page);
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 
 	/*
 	 * Take the mapping lock for the duration of the table walk.  As
@@ -2884,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	/*
 	 * Only apply hugepage reservation if asked.  At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
-	 * and filesystem quota without using reserves
+	 * without using reserves
 	 */
 	if (vm_flags & VM_NORESERVE)
 		return 0;
@@ -2915,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (chg < 0)
 		return chg;
 
-	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
+	/* There must be enough pages in the subpool for the mapping */
+	if (hugepage_subpool_get_pages(spool, chg))
 		return -ENOSPC;
 
 	/*
 	 * Check enough hugepages are available for the reservation.
-	 * Hand back the quota if there are not
+	 * Hand the pages back to the subpool if there are not
 	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
-		hugetlb_put_quota(inode->i_mapping, chg);
+		hugepage_subpool_put_pages(spool, chg);
 		return ret;
 	}
 
@@ -2949,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
 	struct hstate *h = hstate_inode(inode);
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	spin_lock(&inode->i_lock);
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugepage_subpool_put_pages(spool, (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 