author     David Gibson <david@gibson.dropbear.id.au>        2012-03-21 19:34:12 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2012-03-21 20:54:59 -0400
commit     90481622d75715bfcb68501280a917dbfe516029
tree       63f7d9e4455366ab326ee74e6b39acf76b618fcf
parent     a1d776ee3147cec2a54a645e92eb2e3e2f65a137
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.

Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation in itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the wrong
time, even the hugetlbfs superblock where the "quota" limits are stored
may have been freed.

Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, store pointers directly to the superblock, bumping the reference
count as appropriate to avoid it being freed. Andrew Morton rejected that
version, however, on the grounds that it made the existing layering
violation worse.

This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is,
a finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.

Subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.

Previous discussion of this bug can be found in "Fix refcounting in
hugetlbfs quota handling." See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1

v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter
to alloc_huge_page() - since it already takes the vma, it is not
necessary.

Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
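The lifetime rule at the heart of the fix - a subpool stays allocated
while either a handle reference or in-flight pages remain - can be
modelled outside the kernel. Below is a minimal userspace sketch of just
that rule, with pthreads standing in for kernel spinlocks; the names
mirror the patch, but this is illustrative code, not the kernel
implementation (which appears in the mm/hugetlb.c hunks further down):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>

    /* Userspace model of the patch's hugepage_subpool: a finite pool of
     * pages whose lifetime is governed by two counters - handle
     * references (count) and pages currently charged to it (used_pages). */
    struct subpool {
            pthread_mutex_t lock;
            long count;                     /* handles, e.g. the superblock */
            long max_pages, used_pages;
    };

    struct subpool *subpool_new(long max_pages)
    {
            struct subpool *sp = malloc(sizeof(*sp));

            if (!sp)
                    return NULL;
            pthread_mutex_init(&sp->lock, NULL);
            sp->count = 1;                  /* creator's reference */
            sp->max_pages = max_pages;
            sp->used_pages = 0;
            return sp;
    }

    /* Called with sp->lock held.  The pool is freed only when no handles
     * AND no charged pages remain - the invariant that closes the
     * use-after-free described above. */
    static void unlock_or_release(struct subpool *sp)
    {
            bool free_it = (sp->count == 0) && (sp->used_pages == 0);

            pthread_mutex_unlock(&sp->lock);
            if (free_it) {
                    pthread_mutex_destroy(&sp->lock);
                    free(sp);
            }
    }

    void subpool_put(struct subpool *sp)    /* drop a handle ("unmount") */
    {
            pthread_mutex_lock(&sp->lock);
            sp->count--;
            unlock_or_release(sp);
    }

    int subpool_get_pages(struct subpool *sp, long delta)
    {
            int ret = 0;

            pthread_mutex_lock(&sp->lock);
            if (sp->used_pages + delta <= sp->max_pages)
                    sp->used_pages += delta;
            else
                    ret = -1;               /* over the pool's limit */
            pthread_mutex_unlock(&sp->lock);
            return ret;
    }

    void subpool_put_pages(struct subpool *sp, long delta)
    {
            pthread_mutex_lock(&sp->lock);
            sp->used_pages -= delta;
            unlock_or_release(sp);          /* may free a post-unmount pool */
    }

    int main(void)
    {
            struct subpool *sp = subpool_new(2);

            if (!sp)
                    return 1;
            subpool_get_pages(sp, 2);       /* two pages in flight */
            subpool_put(sp);                /* "unmount": pool survives */
            subpool_put_pages(sp, 2);       /* last page back: pool freed */
            return 0;
    }

Because used_pages > 0 keeps the pool alive on its own, a page pinned
past unmount (for example by get_user_pages()) still has valid accounting
to return to - exactly the window the original code left open.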
-rw-r--r--   fs/hugetlbfs/inode.c       54
-rw-r--r--   include/linux/hugetlb.h    14
-rw-r--r--   mm/hugetlb.c              135
3 files changed, 139 insertions(+), 64 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4fbd9fccd550..7913e3252167 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -626,9 +626,15 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	spin_lock(&sbinfo->stat_lock);
 	/* If no limits set, just report 0 for max/free/used
 	 * blocks, like simple_statfs() */
-	if (sbinfo->max_blocks >= 0) {
-		buf->f_blocks = sbinfo->max_blocks;
-		buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+	if (sbinfo->spool) {
+		long free_pages;
+
+		spin_lock(&sbinfo->spool->lock);
+		buf->f_blocks = sbinfo->spool->max_hpages;
+		free_pages = sbinfo->spool->max_hpages
+			- sbinfo->spool->used_hpages;
+		buf->f_bavail = buf->f_bfree = free_pages;
+		spin_unlock(&sbinfo->spool->lock);
 		buf->f_files = sbinfo->max_inodes;
 		buf->f_ffree = sbinfo->free_inodes;
 	}
@@ -644,6 +650,10 @@ static void hugetlbfs_put_super(struct super_block *sb)
 
 	if (sbi) {
 		sb->s_fs_info = NULL;
+
+		if (sbi->spool)
+			hugepage_put_subpool(sbi->spool);
+
 		kfree(sbi);
 	}
 }
@@ -874,10 +884,14 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbinfo;
 	sbinfo->hstate = config.hstate;
 	spin_lock_init(&sbinfo->stat_lock);
-	sbinfo->max_blocks = config.nr_blocks;
-	sbinfo->free_blocks = config.nr_blocks;
 	sbinfo->max_inodes = config.nr_inodes;
 	sbinfo->free_inodes = config.nr_inodes;
+	sbinfo->spool = NULL;
+	if (config.nr_blocks != -1) {
+		sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+		if (!sbinfo->spool)
+			goto out_free;
+	}
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = huge_page_size(config.hstate);
 	sb->s_blocksize_bits = huge_page_shift(config.hstate);
@@ -896,38 +910,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_root = root;
 	return 0;
 out_free:
+	if (sbinfo->spool)
+		kfree(sbinfo->spool);
 	kfree(sbinfo);
 	return -ENOMEM;
 }
 
-int hugetlb_get_quota(struct address_space *mapping, long delta)
-{
-	int ret = 0;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
-		if (sbinfo->free_blocks - delta >= 0)
-			sbinfo->free_blocks -= delta;
-		else
-			ret = -ENOMEM;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-
-	return ret;
-}
-
-void hugetlb_put_quota(struct address_space *mapping, long delta)
-{
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
-
-	if (sbinfo->free_blocks > -1) {
-		spin_lock(&sbinfo->stat_lock);
-		sbinfo->free_blocks += delta;
-		spin_unlock(&sbinfo->stat_lock);
-	}
-}
-
 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7adc4923e7ac..cf0181738c9e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -14,6 +14,15 @@ struct user_struct;
 #include <linux/shm.h>
 #include <asm/tlbflush.h>
 
+struct hugepage_subpool {
+	spinlock_t lock;
+	long count;
+	long max_hpages, used_hpages;
+};
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
+void hugepage_put_subpool(struct hugepage_subpool *spool);
+
 int PageHuge(struct page *page);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
@@ -129,12 +138,11 @@ enum {
 
 #ifdef CONFIG_HUGETLBFS
 struct hugetlbfs_sb_info {
-	long	max_blocks;	/* blocks allowed */
-	long	free_blocks;	/* blocks free */
 	long	max_inodes;	/* inodes allowed */
 	long	free_inodes;	/* inodes free */
 	spinlock_t	stat_lock;
 	struct hstate *hstate;
+	struct hugepage_subpool *spool;
 };
 
 static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
@@ -146,8 +154,6 @@ extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags);
-int hugetlb_get_quota(struct address_space *mapping, long delta);
-void hugetlb_put_quota(struct address_space *mapping, long delta);
 
 static inline int is_file_hugepages(struct file *file)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b1c314877334..afa057a1d3fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+{
+	bool free = (spool->count == 0) && (spool->used_hpages == 0);
+
+	spin_unlock(&spool->lock);
+
+	/* If no pages are used, and no other handles to the subpool
+	 * remain, free the subpool */
+	if (free)
+		kfree(spool);
+}
+
+struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+{
+	struct hugepage_subpool *spool;
+
+	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+	if (!spool)
+		return NULL;
+
+	spin_lock_init(&spool->lock);
+	spool->count = 1;
+	spool->max_hpages = nr_blocks;
+	spool->used_hpages = 0;
+
+	return spool;
+}
+
+void hugepage_put_subpool(struct hugepage_subpool *spool)
+{
+	spin_lock(&spool->lock);
+	BUG_ON(!spool->count);
+	spool->count--;
+	unlock_or_release_subpool(spool);
+}
+
+static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+				      long delta)
+{
+	int ret = 0;
+
+	if (!spool)
+		return 0;
+
+	spin_lock(&spool->lock);
+	if ((spool->used_hpages + delta) <= spool->max_hpages) {
+		spool->used_hpages += delta;
+	} else {
+		ret = -ENOMEM;
+	}
+	spin_unlock(&spool->lock);
+
+	return ret;
+}
+
+static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+				       long delta)
+{
+	if (!spool)
+		return;
+
+	spin_lock(&spool->lock);
+	spool->used_hpages -= delta;
+	/* If hugetlbfs_put_super couldn't free spool due to
+	 * an outstanding quota reference, free it now. */
+	unlock_or_release_subpool(spool);
+}
+
+static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
+{
+	return HUGETLBFS_SB(inode->i_sb)->spool;
+}
+
+static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
+{
+	return subpool_inode(vma->vm_file->f_dentry->d_inode);
+}
+
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
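The hunk above is the entire subpool API this patch introduces. Purely as
a reading aid - the function below is hypothetical, not code from the
patch - a caller pairing the operations the way hugetlbfs does would look
like:

    /* Hypothetical caller, mirroring hugetlbfs_fill_super() and
     * hugetlbfs_put_super(); not part of this patch. */
    static int example_mount_cycle(long nr_blocks)
    {
            struct hugepage_subpool *spool = NULL;

            if (nr_blocks != -1) {          /* -1 means "no limit" */
                    spool = hugepage_new_subpool(nr_blocks);
                    if (!spool)
                            return -ENOMEM;
            }

            /* While mounted, each allocation charges the pool with
             * hugepage_subpool_get_pages(spool, 1) - a NULL spool is
             * accepted and means unlimited - and stashes spool in
             * page_private(); free_huge_page() later returns the page
             * via hugepage_subpool_put_pages(spool, 1). */

            if (spool)                      /* put_subpool needs non-NULL */
                    hugepage_put_subpool(spool);
            return 0;
    }

Note that hugepage_put_subpool() drops only the superblock's reference:
if used_hpages is still nonzero because pages outlive the unmount,
unlock_or_release_subpool() defers the kfree() until the final
hugepage_subpool_put_pages() call.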
@@ -540,9 +618,9 @@ static void free_huge_page(struct page *page)
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct address_space *mapping;
+	struct hugepage_subpool *spool =
+		(struct hugepage_subpool *)page_private(page);
 
-	mapping = (struct address_space *) page_private(page);
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
@@ -558,8 +636,7 @@ static void free_huge_page(struct page *page)
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	if (mapping)
-		hugetlb_put_quota(mapping, 1);
+	hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -977,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
 /*
  * Determine if the huge page at addr within the vma has an associated
  * reservation.  Where it does not we will need to logically increase
- * reservation and actually increase quota before an allocation can occur.
- * Where any new reservation would be required the reservation change is
- * prepared, but not committed.  Once the page has been quota'd allocated
- * an instantiated the change should be committed via vma_commit_reservation.
- * No action is required on failure.
+ * reservation and actually increase subpool usage before an allocation
+ * can occur.  Where any new reservation would be required the
+ * reservation change is prepared, but not committed.  Once the page
+ * has been allocated from the subpool and instantiated the change should
+ * be committed via vma_commit_reservation.  No action is required on
+ * failure.
  */
 static long vma_needs_reservation(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long addr)
@@ -1030,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	struct inode *inode = mapping->host;
 	long chg;
 
 	/*
-	 * Processes that did not create the mapping will have no reserves and
-	 * will not have accounted against quota.  Check that the quota can be
-	 * made before satisfying the allocation
-	 * MAP_NORESERVE mappings may also need pages and quota allocated
-	 * if no reserve mapping overlaps.
+	 * Processes that did not create the mapping will have no
+	 * reserves and will not have accounted against the subpool
+	 * limit.  Check that the subpool limit can be made before
+	 * satisfying the allocation.  MAP_NORESERVE mappings may also
+	 * need pages and subpool limit allocated if no reserve
+	 * mapping overlaps.
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
 		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
+		if (hugepage_subpool_get_pages(spool, chg))
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
@@ -1057,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			hugetlb_put_quota(inode->i_mapping, chg);
+			hugepage_subpool_put_pages(spool, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_private(page, (unsigned long) mapping);
+	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
 
@@ -2083,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
 	struct resv_map *reservations = vma_resv_map(vma);
+	struct hugepage_subpool *spool = subpool_vma(vma);
 	unsigned long reserve;
 	unsigned long start;
 	unsigned long end;
@@ -2098,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
-			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+			hugepage_subpool_put_pages(spool, reserve);
 		}
 	}
 }
@@ -2331,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	address = address & huge_page_mask(h);
 	pgoff = vma_hugecache_offset(h, vma, address);
-	mapping = (struct address_space *)page_private(page);
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 
 	/*
 	 * Take the mapping lock for the duration of the table walk.  As
@@ -2884,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	/*
 	 * Only apply hugepage reservation if asked.  At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
-	 * and filesystem quota without using reserves
+	 * without using reserves
 	 */
 	if (vm_flags & VM_NORESERVE)
 		return 0;
@@ -2915,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (chg < 0)
 		return chg;
 
-	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
+	/* There must be enough pages in the subpool for the mapping */
+	if (hugepage_subpool_get_pages(spool, chg))
 		return -ENOSPC;
 
 	/*
 	 * Check enough hugepages are available for the reservation.
-	 * Hand back the quota if there are not
+	 * Hand the pages back to the subpool if there are not
 	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
-		hugetlb_put_quota(inode->i_mapping, chg);
+		hugepage_subpool_put_pages(spool, chg);
 		return ret;
 	}
 
@@ -2949,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
 	struct hstate *h = hstate_inode(inode);
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
+	struct hugepage_subpool *spool = subpool_inode(inode);
 
 	spin_lock(&inode->i_lock);
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugepage_subpool_put_pages(spool, (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 