author	David Gibson <david@gibson.dropbear.id.au>	2012-03-21 19:34:12 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-03-21 20:54:59 -0400
commit	90481622d75715bfcb68501280a917dbfe516029 (patch)
tree	63f7d9e4455366ab326ee74e6b39acf76b618fcf /mm
parent	a1d776ee3147cec2a54a645e92eb2e3e2f65a137 (diff)
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the general quota handling code, and they don't much resemble its behaviour. Rather than being about maintaining limits on on-disk block usage by particular users, they are instead about maintaining limits on in-memory page usage (including anonymous MAP_PRIVATE copied-on-write pages) associated with a particular hugetlbfs filesystem instance.

Worse, they work by having callbacks to the hugetlbfs filesystem code from the low-level page handling code, in particular from free_huge_page(). This is a layering violation in itself, but more importantly, if the kernel does a get_user_pages() on hugepages (which can happen from KVM, amongst others), then free_huge_page() can be delayed until after the associated inode has already been freed. If an unmount occurs at the wrong time, even the hugetlbfs superblock where the "quota" limits are stored may have been freed.

Andrew Barry proposed a patch to fix this by having hugepages, instead of storing a pointer to their address_space and reaching the superblock from there, store pointers directly to the superblock, bumping the reference count as appropriate to avoid it being freed. Andrew Morton rejected that version, however, on the grounds that it made the existing layering violation worse.

This is a reworked version of Andrew's patch, which removes the extra, and some of the existing, layering violations. It works by introducing the concept of a hugepage "subpool" at the lower hugepage mm layer - that is, a finite logical pool of hugepages to allocate from. hugetlbfs now creates a subpool for each filesystem instance with a page limit set, and a pointer to the subpool gets added to each allocated hugepage, instead of the address_space pointer used now. The subpool has its own lifetime and is only freed once all pages in it _and_ all other references to it (i.e. superblocks) are gone. Subpools are optional - a NULL subpool pointer is taken by the code to mean that no subpool limits are in effect.

Previous discussion of this bug can be found in "Fix refcounting in hugetlbfs quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or http://marc.info/?l=linux-mm&m=126928970510627&w=1

v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to alloc_huge_page() - since it already takes the vma, it is not necessary.

Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
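For reference, the struct hugepage_subpool definition itself lives in include/linux/hugetlb.h and is not shown in this mm-limited view. Inferred from how the fields are used in mm/hugetlb.c below, it looks roughly like the following sketch; the field comments are added here for illustration:

struct hugepage_subpool {
	spinlock_t lock;	/* protects the counters below */
	long count;		/* references held by superblocks and the like */
	long max_hpages;	/* page limit for this filesystem instance */
	long used_hpages;	/* pages currently charged against the limit */
};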
Diffstat (limited to 'mm')
-rw-r--r--   mm/hugetlb.c   135
1 file changed, 108 insertions(+), 27 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b1c31487733..afa057a1d3f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+	bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+	spin_unlock(&spool->lock);
+
+	/* If no pages are used, and no other handles to the subpool
+	 * remain, free the subpool. */
+	if (free)
+		kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+	struct hugepage_subpool *spool;
+
+	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+	if (!spool)
+		return NULL;
+
+	spin_lock_init(&spool->lock);
+	spool->count = 1;
+	spool->max_hpages = nr_blocks;
+	spool->used_hpages = 0;
+
+	return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+	spin_lock(&spool->lock);
+	BUG_ON(!spool->count);
+	spool->count--;
+	unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+				      long delta)
+{
+	int ret = 0;
+
+	if (!spool)
+		return 0;
+
+	spin_lock(&spool->lock);
+	if ((spool->used_hpages + delta) <= spool->max_hpages) {
+		spool->used_hpages += delta;
+	} else {
+		ret = -ENOMEM;
+	}
+	spin_unlock(&spool->lock);
+
+	return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+				       long delta)
+{
+	if (!spool)
+		return;
+
+	spin_lock(&spool->lock);
+	spool->used_hpages -= delta;
+	/* If hugetlbfs_put_super couldn't free spool due to
+	 * an outstanding quota reference, free it now. */
+	unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+	return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+	return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  * across the pages in a mapping.
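The functions above form the whole subpool API: hugepage_new_subpool()/hugepage_put_subpool() manage the subpool's lifetime, and hugepage_subpool_get_pages()/hugepage_subpool_put_pages() charge and uncharge pages against its limit. The hugetlbfs side of the patch (fs/hugetlbfs/inode.c, not included in this mm-limited diff) is expected to use it roughly as in the hypothetical sketch below; the example_* function names and the -1 "no limit" convention are illustrative assumptions, but the sbinfo->spool field is the same one dereferenced by subpool_inode() above.

/* Hypothetical mount/unmount usage of the subpool API. */
static int example_fill_super(struct super_block *sb, long nr_pages_limit)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);

	sbinfo->spool = NULL;
	if (nr_pages_limit != -1) {
		/* One subpool per mount that was given a page limit. */
		sbinfo->spool = hugepage_new_subpool(nr_pages_limit);
		if (!sbinfo->spool)
			return -ENOMEM;
	}
	return 0;
}

static void example_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);

	/* Drop the superblock's reference.  The subpool is only freed
	 * once used_hpages also reaches zero, so hugepages still in
	 * flight (e.g. pinned via get_user_pages()) keep it alive until
	 * their delayed free_huge_page() uncharges them. */
	if (sbinfo->spool)
		hugepage_put_subpool(sbinfo->spool);
}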
@@ -540,9 +618,9 @@ static void free_huge_page(struct page *page)
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct address_space *mapping;
+	struct hugepage_subpool *spool =
+		(struct hugepage_subpool *)page_private(page);
 
-	mapping = (struct address_space *) page_private(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
@@ -558,8 +636,7 @@ static void free_huge_page(struct page *page)
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	if (mapping)
-		hugetlb_put_quota(mapping, 1);
+	hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -977,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
 /*
  * Determine if the huge page at addr within the vma has an associated
  * reservation. Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed. Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur. Where any new reservation would be required the
+ * reservation change is prepared, but not committed. Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation. No action is required on
+ * failure.
  */
 static long vma_needs_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
@@ -1030,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	struct inode *inode = mapping->host;
 	long chg;
 
 	/*
-	 * Processes that did not create the mapping will have no reserves and
-	 * will not have accounted against quota. Check that the quota can be
-	 * made before satisfying the allocation
-	 * MAP_NORESERVE mappings may also need pages and quota allocated
-	 * if no reserve mapping overlaps.
+	 * Processes that did not create the mapping will have no
+	 * reserves and will not have accounted against the subpool
+	 * limit. Check that the subpool limit can be made before
+	 * satisfying the allocation. MAP_NORESERVE mappings may also
+	 * need pages and subpool limit allocated if no reserve
+	 * mapping overlaps.
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
 		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
+		if (hugepage_subpool_get_pages(spool, chg))
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
@@ -1057,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			hugetlb_put_quota(inode->i_mapping, chg);
+			hugepage_subpool_put_pages(spool, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_private(page, (unsigned long) mapping);
+	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
 
@@ -2083,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct resv_map *reservations = vma_resv_map(vma);
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve;
 	unsigned long start;
 	unsigned long end;
@@ -2098,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
-			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+			hugepage_subpool_put_pages(spool, reserve);
 		}
 	}
 }
@@ -2331,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	address = address & huge_page_mask(h);
 	pgoff = vma_hugecache_offset(h, vma, address);
-	mapping = (struct address_space *)page_private(page);
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 
 	/*
 	 * Take the mapping lock for the duration of the table walk. As
@@ -2884,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
-	 * and filesystem quota without using reserves
+	 * without using reserves
 	 */
 	if (vm_flags & VM_NORESERVE)
 		return 0;
@@ -2915,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (chg < 0)
 		return chg;
 
-	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
+	/* There must be enough pages in the subpool for the mapping */
+	if (hugepage_subpool_get_pages(spool, chg))
 		return -ENOSPC;
 
 	/*
 	 * Check enough hugepages are available for the reservation.
-	 * Hand back the quota if there are not
+	 * Hand the pages back to the subpool if there are not
 	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
-		hugetlb_put_quota(inode->i_mapping, chg);
+		hugepage_subpool_put_pages(spool, chg);
 		return ret;
 	}
 
@@ -2949,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
 	struct hstate *h = hstate_inode(inode);
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	spin_lock(&inode->i_lock);
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugepage_subpool_put_pages(spool, (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 