diff options
author | David Gibson <david@gibson.dropbear.id.au> | 2012-03-21 19:34:12 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-03-21 20:54:59 -0400 |
commit | 90481622d75715bfcb68501280a917dbfe516029 (patch) | |
tree | 63f7d9e4455366ab326ee74e6b39acf76b618fcf /mm | |
parent | a1d776ee3147cec2a54a645e92eb2e3e2f65a137 (diff) |
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, store pointers directly to the superblock, bumping the reference
count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
Subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/hugetlb.c | 135 |
1 files changed, 108 insertions, 27 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b1c31487733..afa057a1d3f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size; | |||
53 | */ | 53 | */ |
54 | static DEFINE_SPINLOCK(hugetlb_lock); | 54 | static DEFINE_SPINLOCK(hugetlb_lock); |
55 | 55 | ||
56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | ||
57 | { | ||
58 | bool free = (spool->count == 0) && (spool->used_hpages == 0); | ||
59 | |||
60 | spin_unlock(&spool->lock); | ||
61 | |||
62 | /* If no pages are used, and no other handles to the subpool | ||
63 | * remain, free the subpool the subpool remain */ | ||
64 | if (free) | ||
65 | kfree(spool); | ||
66 | } | ||
67 | |||
68 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) | ||
69 | { | ||
70 | struct hugepage_subpool *spool; | ||
71 | |||
72 | spool = kmalloc(sizeof(*spool), GFP_KERNEL); | ||
73 | if (!spool) | ||
74 | return NULL; | ||
75 | |||
76 | spin_lock_init(&spool->lock); | ||
77 | spool->count = 1; | ||
78 | spool->max_hpages = nr_blocks; | ||
79 | spool->used_hpages = 0; | ||
80 | |||
81 | return spool; | ||
82 | } | ||
83 | |||
84 | void hugepage_put_subpool(struct hugepage_subpool *spool) | ||
85 | { | ||
86 | spin_lock(&spool->lock); | ||
87 | BUG_ON(!spool->count); | ||
88 | spool->count--; | ||
89 | unlock_or_release_subpool(spool); | ||
90 | } | ||
91 | |||
92 | static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, | ||
93 | long delta) | ||
94 | { | ||
95 | int ret = 0; | ||
96 | |||
97 | if (!spool) | ||
98 | return 0; | ||
99 | |||
100 | spin_lock(&spool->lock); | ||
101 | if ((spool->used_hpages + delta) <= spool->max_hpages) { | ||
102 | spool->used_hpages += delta; | ||
103 | } else { | ||
104 | ret = -ENOMEM; | ||
105 | } | ||
106 | spin_unlock(&spool->lock); | ||
107 | |||
108 | return ret; | ||
109 | } | ||
110 | |||
111 | static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, | ||
112 | long delta) | ||
113 | { | ||
114 | if (!spool) | ||
115 | return; | ||
116 | |||
117 | spin_lock(&spool->lock); | ||
118 | spool->used_hpages -= delta; | ||
119 | /* If hugetlbfs_put_super couldn't free spool due to | ||
120 | * an outstanding quota reference, free it now. */ | ||
121 | unlock_or_release_subpool(spool); | ||
122 | } | ||
123 | |||
124 | static inline struct hugepage_subpool *subpool_inode(struct inode *inode) | ||
125 | { | ||
126 | return HUGETLBFS_SB(inode->i_sb)->spool; | ||
127 | } | ||
128 | |||
129 | static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) | ||
130 | { | ||
131 | return subpool_inode(vma->vm_file->f_dentry->d_inode); | ||
132 | } | ||
133 | |||
56 | /* | 134 | /* |
57 | * Region tracking -- allows tracking of reservations and instantiated pages | 135 | * Region tracking -- allows tracking of reservations and instantiated pages |
58 | * across the pages in a mapping. | 136 | * across the pages in a mapping. |
@@ -540,9 +618,9 @@ static void free_huge_page(struct page *page) | |||
540 | */ | 618 | */ |
541 | struct hstate *h = page_hstate(page); | 619 | struct hstate *h = page_hstate(page); |
542 | int nid = page_to_nid(page); | 620 | int nid = page_to_nid(page); |
543 | struct address_space *mapping; | 621 | struct hugepage_subpool *spool = |
622 | (struct hugepage_subpool *)page_private(page); | ||
544 | 623 | ||
545 | mapping = (struct address_space *) page_private(page); | ||
546 | set_page_private(page, 0); | 624 | set_page_private(page, 0); |
547 | page->mapping = NULL; | 625 | page->mapping = NULL; |
548 | BUG_ON(page_count(page)); | 626 | BUG_ON(page_count(page)); |
@@ -558,8 +636,7 @@ static void free_huge_page(struct page *page) | |||
558 | enqueue_huge_page(h, page); | 636 | enqueue_huge_page(h, page); |
559 | } | 637 | } |
560 | spin_unlock(&hugetlb_lock); | 638 | spin_unlock(&hugetlb_lock); |
561 | if (mapping) | 639 | hugepage_subpool_put_pages(spool, 1); |
562 | hugetlb_put_quota(mapping, 1); | ||
563 | } | 640 | } |
564 | 641 | ||
565 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 642 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
@@ -977,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
977 | /* | 1054 | /* |
978 | * Determine if the huge page at addr within the vma has an associated | 1055 | * Determine if the huge page at addr within the vma has an associated |
979 | * reservation. Where it does not we will need to logically increase | 1056 | * reservation. Where it does not we will need to logically increase |
980 | * reservation and actually increase quota before an allocation can occur. | 1057 | * reservation and actually increase subpool usage before an allocation |
981 | * Where any new reservation would be required the reservation change is | 1058 | * can occur. Where any new reservation would be required the |
982 | * prepared, but not committed. Once the page has been quota'd allocated | 1059 | * reservation change is prepared, but not committed. Once the page |
983 | * an instantiated the change should be committed via vma_commit_reservation. | 1060 | * has been allocated from the subpool and instantiated the change should |
984 | * No action is required on failure. | 1061 | * be committed via vma_commit_reservation. No action is required on |
1062 | * failure. | ||
985 | */ | 1063 | */ |
986 | static long vma_needs_reservation(struct hstate *h, | 1064 | static long vma_needs_reservation(struct hstate *h, |
987 | struct vm_area_struct *vma, unsigned long addr) | 1065 | struct vm_area_struct *vma, unsigned long addr) |
@@ -1030,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h, | |||
1030 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 1108 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
1031 | unsigned long addr, int avoid_reserve) | 1109 | unsigned long addr, int avoid_reserve) |
1032 | { | 1110 | { |
1111 | struct hugepage_subpool *spool = subpool_vma(vma); | ||
1033 | struct hstate *h = hstate_vma(vma); | 1112 | struct hstate *h = hstate_vma(vma); |
1034 | struct page *page; | 1113 | struct page *page; |
1035 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
1036 | struct inode *inode = mapping->host; | ||
1037 | long chg; | 1114 | long chg; |
1038 | 1115 | ||
1039 | /* | 1116 | /* |
1040 | * Processes that did not create the mapping will have no reserves and | 1117 | * Processes that did not create the mapping will have no |
1041 | * will not have accounted against quota. Check that the quota can be | 1118 | * reserves and will not have accounted against subpool |
1042 | * made before satisfying the allocation | 1119 | * limit. Check that the subpool limit can be made before |
1043 | * MAP_NORESERVE mappings may also need pages and quota allocated | 1120 | * satisfying the allocation MAP_NORESERVE mappings may also |
1044 | * if no reserve mapping overlaps. | 1121 | * need pages and subpool limit allocated allocated if no reserve |
1122 | * mapping overlaps. | ||
1045 | */ | 1123 | */ |
1046 | chg = vma_needs_reservation(h, vma, addr); | 1124 | chg = vma_needs_reservation(h, vma, addr); |
1047 | if (chg < 0) | 1125 | if (chg < 0) |
1048 | return ERR_PTR(-VM_FAULT_OOM); | 1126 | return ERR_PTR(-VM_FAULT_OOM); |
1049 | if (chg) | 1127 | if (chg) |
1050 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 1128 | if (hugepage_subpool_get_pages(spool, chg)) |
1051 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1129 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1052 | 1130 | ||
1053 | spin_lock(&hugetlb_lock); | 1131 | spin_lock(&hugetlb_lock); |
@@ -1057,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1057 | if (!page) { | 1135 | if (!page) { |
1058 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1136 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1059 | if (!page) { | 1137 | if (!page) { |
1060 | hugetlb_put_quota(inode->i_mapping, chg); | 1138 | hugepage_subpool_put_pages(spool, chg); |
1061 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1139 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1062 | } | 1140 | } |
1063 | } | 1141 | } |
1064 | 1142 | ||
1065 | set_page_private(page, (unsigned long) mapping); | 1143 | set_page_private(page, (unsigned long)spool); |
1066 | 1144 | ||
1067 | vma_commit_reservation(h, vma, addr); | 1145 | vma_commit_reservation(h, vma, addr); |
1068 | 1146 | ||
@@ -2083,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2083 | { | 2161 | { |
2084 | struct hstate *h = hstate_vma(vma); | 2162 | struct hstate *h = hstate_vma(vma); |
2085 | struct resv_map *reservations = vma_resv_map(vma); | 2163 | struct resv_map *reservations = vma_resv_map(vma); |
2164 | struct hugepage_subpool *spool = subpool_vma(vma); | ||
2086 | unsigned long reserve; | 2165 | unsigned long reserve; |
2087 | unsigned long start; | 2166 | unsigned long start; |
2088 | unsigned long end; | 2167 | unsigned long end; |
@@ -2098,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2098 | 2177 | ||
2099 | if (reserve) { | 2178 | if (reserve) { |
2100 | hugetlb_acct_memory(h, -reserve); | 2179 | hugetlb_acct_memory(h, -reserve); |
2101 | hugetlb_put_quota(vma->vm_file->f_mapping, reserve); | 2180 | hugepage_subpool_put_pages(spool, reserve); |
2102 | } | 2181 | } |
2103 | } | 2182 | } |
2104 | } | 2183 | } |
@@ -2331,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2331 | */ | 2410 | */ |
2332 | address = address & huge_page_mask(h); | 2411 | address = address & huge_page_mask(h); |
2333 | pgoff = vma_hugecache_offset(h, vma, address); | 2412 | pgoff = vma_hugecache_offset(h, vma, address); |
2334 | mapping = (struct address_space *)page_private(page); | 2413 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; |
2335 | 2414 | ||
2336 | /* | 2415 | /* |
2337 | * Take the mapping lock for the duration of the table walk. As | 2416 | * Take the mapping lock for the duration of the table walk. As |
@@ -2884,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2884 | { | 2963 | { |
2885 | long ret, chg; | 2964 | long ret, chg; |
2886 | struct hstate *h = hstate_inode(inode); | 2965 | struct hstate *h = hstate_inode(inode); |
2966 | struct hugepage_subpool *spool = subpool_inode(inode); | ||
2887 | 2967 | ||
2888 | /* | 2968 | /* |
2889 | * Only apply hugepage reservation if asked. At fault time, an | 2969 | * Only apply hugepage reservation if asked. At fault time, an |
2890 | * attempt will be made for VM_NORESERVE to allocate a page | 2970 | * attempt will be made for VM_NORESERVE to allocate a page |
2891 | * and filesystem quota without using reserves | 2971 | * without using reserves |
2892 | */ | 2972 | */ |
2893 | if (vm_flags & VM_NORESERVE) | 2973 | if (vm_flags & VM_NORESERVE) |
2894 | return 0; | 2974 | return 0; |
@@ -2915,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2915 | if (chg < 0) | 2995 | if (chg < 0) |
2916 | return chg; | 2996 | return chg; |
2917 | 2997 | ||
2918 | /* There must be enough filesystem quota for the mapping */ | 2998 | /* There must be enough pages in the subpool for the mapping */ |
2919 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 2999 | if (hugepage_subpool_get_pages(spool, chg)) |
2920 | return -ENOSPC; | 3000 | return -ENOSPC; |
2921 | 3001 | ||
2922 | /* | 3002 | /* |
2923 | * Check enough hugepages are available for the reservation. | 3003 | * Check enough hugepages are available for the reservation. |
2924 | * Hand back the quota if there are not | 3004 | * Hand the pages back to the subpool if there are not |
2925 | */ | 3005 | */ |
2926 | ret = hugetlb_acct_memory(h, chg); | 3006 | ret = hugetlb_acct_memory(h, chg); |
2927 | if (ret < 0) { | 3007 | if (ret < 0) { |
2928 | hugetlb_put_quota(inode->i_mapping, chg); | 3008 | hugepage_subpool_put_pages(spool, chg); |
2929 | return ret; | 3009 | return ret; |
2930 | } | 3010 | } |
2931 | 3011 | ||
@@ -2949,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
2949 | { | 3029 | { |
2950 | struct hstate *h = hstate_inode(inode); | 3030 | struct hstate *h = hstate_inode(inode); |
2951 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | 3031 | long chg = region_truncate(&inode->i_mapping->private_list, offset); |
3032 | struct hugepage_subpool *spool = subpool_inode(inode); | ||
2952 | 3033 | ||
2953 | spin_lock(&inode->i_lock); | 3034 | spin_lock(&inode->i_lock); |
2954 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); | 3035 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); |
2955 | spin_unlock(&inode->i_lock); | 3036 | spin_unlock(&inode->i_lock); |
2956 | 3037 | ||
2957 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 3038 | hugepage_subpool_put_pages(spool, (chg - freed)); |
2958 | hugetlb_acct_memory(h, -(chg - freed)); | 3039 | hugetlb_acct_memory(h, -(chg - freed)); |
2959 | } | 3040 | } |
2960 | 3041 | ||