From c6a918200c4f4ebf74b7e1ae4fea9115c7b297f8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 15 Apr 2015 16:13:36 -0700 Subject: hugetlbfs: add minimum size tracking fields to subpool structure hugetlbfs allocates huge pages from the global pool as needed. Even if the global pool contains a sufficient number pages for the filesystem size at mount time, those global pages could be grabbed for some other use. As a result, filesystem huge page allocations may fail due to lack of pages. Applications such as a database want to use huge pages for performance reasons. hugetlbfs filesystem semantics with ownership and modes work well to manage access to a pool of huge pages. However, the application would like some reasonable assurance that allocations will not fail due to a lack of huge pages. At application startup time, the application would like to configure itself to use a specific number of huge pages. Before starting, the application can check to make sure that enough huge pages exist in the system global pools. However, there are no guarantees that those pages will be available when needed by the application. What the application wants is exclusive use of a subset of huge pages. Add a new hugetlbfs mount option 'min_size=' to indicate that the specified number of pages will be available for use by the filesystem. At mount time, this number of huge pages will be reserved for exclusive use of the filesystem. If there is not a sufficient number of free pages, the mount will fail. As pages are allocated to and freeed from the filesystem, the number of reserved pages is adjusted so that the specified minimum is maintained. This patch (of 4): Add a field to the subpool structure to indicate the minimimum number of huge pages to always be used by this subpool. This minimum count includes allocated pages as well as reserved pages. If the minimum number of pages for the subpool have not been allocated, pages are reserved up to this minimum. An additional field (rsv_hpages) is used to track the number of pages reserved to meet this minimum size. The hstate pointer in the subpool is convenient to have when reserving and unreserving the pages. Signed-off-by: Mike Kravetz Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8874c8ad55aa..4494976c2042 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -77,14 +77,13 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) { struct hugepage_subpool *spool; - spool = kmalloc(sizeof(*spool), GFP_KERNEL); + spool = kzalloc(sizeof(*spool), GFP_KERNEL); if (!spool) return NULL; spin_lock_init(&spool->lock); spool->count = 1; spool->max_hpages = nr_blocks; - spool->used_hpages = 0; return spool; } -- cgit v1.2.2 From 1c5ecae3a93fa1ab51a784d77e9c9ed54e67c65f Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 15 Apr 2015 16:13:39 -0700 Subject: hugetlbfs: add minimum size accounting to subpools The same routines that perform subpool maximum size accounting hugepage_subpool_get/put_pages() are modified to also perform minimum size accounting. When a delta value is passed to these routines, calculate how global reservations must be adjusted to maintain the subpool minimum size. The routines now return this global reserve count adjustment. This global reserve count adjustment is then passed to the global accounting routine hugetlb_acct_memory(). Signed-off-by: Mike Kravetz Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 100 insertions(+), 23 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4494976c2042..499cb72c74b1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -96,36 +96,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) unlock_or_release_subpool(spool); } -static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, +/* + * Subpool accounting for allocating and reserving pages. + * Return -ENOMEM if there are not enough resources to satisfy the + * the request. Otherwise, return the number of pages by which the + * global pools must be adjusted (upward). The returned value may + * only be different than the passed value (delta) in the case where + * a subpool minimum size must be manitained. + */ +static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, long delta) { - int ret = 0; + long ret = delta; if (!spool) - return 0; + return ret; spin_lock(&spool->lock); - if ((spool->used_hpages + delta) <= spool->max_hpages) { - spool->used_hpages += delta; - } else { - ret = -ENOMEM; + + if (spool->max_hpages != -1) { /* maximum size accounting */ + if ((spool->used_hpages + delta) <= spool->max_hpages) + spool->used_hpages += delta; + else { + ret = -ENOMEM; + goto unlock_ret; + } } - spin_unlock(&spool->lock); + if (spool->min_hpages != -1) { /* minimum size accounting */ + if (delta > spool->rsv_hpages) { + /* + * Asking for more reserves than those already taken on + * behalf of subpool. Return difference. + */ + ret = delta - spool->rsv_hpages; + spool->rsv_hpages = 0; + } else { + ret = 0; /* reserves already accounted for */ + spool->rsv_hpages -= delta; + } + } + +unlock_ret: + spin_unlock(&spool->lock); return ret; } -static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, +/* + * Subpool accounting for freeing and unreserving pages. + * Return the number of global page reservations that must be dropped. + * The return value may only be different than the passed value (delta) + * in the case where a subpool minimum size must be maintained. + */ +static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { + long ret = delta; + if (!spool) - return; + return delta; spin_lock(&spool->lock); - spool->used_hpages -= delta; - /* If hugetlbfs_put_super couldn't free spool due to - * an outstanding quota reference, free it now. */ + + if (spool->max_hpages != -1) /* maximum size accounting */ + spool->used_hpages -= delta; + + if (spool->min_hpages != -1) { /* minimum size accounting */ + if (spool->rsv_hpages + delta <= spool->min_hpages) + ret = 0; + else + ret = spool->rsv_hpages + delta - spool->min_hpages; + + spool->rsv_hpages += delta; + if (spool->rsv_hpages > spool->min_hpages) + spool->rsv_hpages = spool->min_hpages; + } + + /* + * If hugetlbfs_put_super couldn't free spool due to an outstanding + * quota reference, free it now. + */ unlock_or_release_subpool(spool); + + return ret; } static inline struct hugepage_subpool *subpool_inode(struct inode *inode) @@ -873,6 +926,14 @@ void free_huge_page(struct page *page) restore_reserve = PagePrivate(page); ClearPagePrivate(page); + /* + * A return code of zero implies that the subpool will be under its + * minimum size if the reservation is not restored after page is free. + * Therefore, force restore_reserve operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); @@ -890,7 +951,6 @@ void free_huge_page(struct page *page) enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); - hugepage_subpool_put_pages(spool, 1); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) @@ -1385,7 +1445,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (chg < 0) return ERR_PTR(-ENOMEM); if (chg || avoid_reserve) - if (hugepage_subpool_get_pages(spool, 1)) + if (hugepage_subpool_get_pages(spool, 1) < 0) return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); @@ -2453,6 +2513,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; + long gbl_reserve; if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -2465,8 +2526,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&resv->refs, resv_map_release); if (reserve) { - hugetlb_acct_memory(h, -reserve); - hugepage_subpool_put_pages(spool, reserve); + /* + * Decrement reserve counts. The global reserve count may be + * adjusted if the subpool has a minimum size. + */ + gbl_reserve = hugepage_subpool_put_pages(spool, reserve); + hugetlb_acct_memory(h, -gbl_reserve); } } @@ -3446,6 +3511,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; + long gbl_reserve; /* * Only apply hugepage reservation if asked. At fault time, an @@ -3482,8 +3548,13 @@ int hugetlb_reserve_pages(struct inode *inode, goto out_err; } - /* There must be enough pages in the subpool for the mapping */ - if (hugepage_subpool_get_pages(spool, chg)) { + /* + * There must be enough pages in the subpool for the mapping. If + * the subpool has a minimum size, there may be some global + * reservations already in place (gbl_reserve). + */ + gbl_reserve = hugepage_subpool_get_pages(spool, chg); + if (gbl_reserve < 0) { ret = -ENOSPC; goto out_err; } @@ -3492,9 +3563,10 @@ int hugetlb_reserve_pages(struct inode *inode, * Check enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not */ - ret = hugetlb_acct_memory(h, chg); + ret = hugetlb_acct_memory(h, gbl_reserve); if (ret < 0) { - hugepage_subpool_put_pages(spool, chg); + /* put back original number of pages, chg */ + (void)hugepage_subpool_put_pages(spool, chg); goto out_err; } @@ -3524,6 +3596,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) struct resv_map *resv_map = inode_resv_map(inode); long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); + long gbl_reserve; if (resv_map) chg = region_truncate(resv_map, offset); @@ -3531,8 +3604,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); - hugepage_subpool_put_pages(spool, (chg - freed)); - hugetlb_acct_memory(h, -(chg - freed)); + /* + * If the subpool has a minimum size, the number of global + * reservations to be released may be adjusted. + */ + gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); + hugetlb_acct_memory(h, -gbl_reserve); } #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE -- cgit v1.2.2 From 7ca02d0ae586fe7df59632966a64f3f1a756ef05 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 15 Apr 2015 16:13:42 -0700 Subject: hugetlbfs: accept subpool min_size mount option and setup accordingly Make 'min_size=' be an option when mounting a hugetlbfs. This option takes the same value as the 'size' option. min_size can be specified without specifying size. If both are specified, min_size must be less that or equal to size else the mount will fail. If min_size is specified, then at mount time an attempt is made to reserve min_size pages. If the reservation fails, the mount fails. At umount time, the reserved pages are released. Signed-off-by: Mike Kravetz Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 499cb72c74b1..995c8d65a95c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock); static int num_fault_mutexes; static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; +/* Forward declaration */ +static int hugetlb_acct_memory(struct hstate *h, long delta); + static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { bool free = (spool->count == 0) && (spool->used_hpages == 0); @@ -68,12 +71,18 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) spin_unlock(&spool->lock); /* If no pages are used, and no other handles to the subpool - * remain, free the subpool the subpool remain */ - if (free) + * remain, give up any reservations mased on minimum size and + * free the subpool */ + if (free) { + if (spool->min_hpages != -1) + hugetlb_acct_memory(spool->hstate, + -spool->min_hpages); kfree(spool); + } } -struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) +struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, + long min_hpages) { struct hugepage_subpool *spool; @@ -83,7 +92,15 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) spin_lock_init(&spool->lock); spool->count = 1; - spool->max_hpages = nr_blocks; + spool->max_hpages = max_hpages; + spool->hstate = h; + spool->min_hpages = min_hpages; + + if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { + kfree(spool); + return NULL; + } + spool->rsv_hpages = min_hpages; return spool; } -- cgit v1.2.2 From bcc54222309c70ebcb6c69c156fba4a13dee0a3b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 15 Apr 2015 16:14:38 -0700 Subject: mm: hugetlb: introduce page_huge_active We are not safe from calling isolate_huge_page() on a hugepage concurrently, which can make the victim hugepage in invalid state and results in BUG_ON(). The root problem of this is that we don't have any information on struct page (so easily accessible) about hugepages' activeness. Note that hugepages' activeness means just being linked to hstate->hugepage_activelist, which is not the same as normal pages' activeness represented by PageActive flag. Normal pages are isolated by isolate_lru_page() which prechecks PageLRU before isolation, so let's do similarly for hugetlb with a new paeg_huge_active(). set/clear_page_huge_active() should be called within hugetlb_lock. But hugetlb_cow() and hugetlb_no_page() don't do this, being justified because in these functions set_page_huge_active() is called right after the hugepage is allocated and no other thread tries to isolate it. [akpm@linux-foundation.org: s/PageHugeActive/page_huge_active/, make it return bool] [fengguang.wu@intel.com: set_page_huge_active() can be static] Signed-off-by: Naoya Horiguchi Cc: Hugh Dickins Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 995c8d65a95c..05407831016b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -924,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } +/* + * Test to determine whether the hugepage is "active/in-use" (i.e. being linked + * to hstate->hugepage_activelist.) + * + * This function can be called for tail pages, but never returns true for them. + */ +bool page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHuge(page), page); + return PageHead(page) && PagePrivate(&page[1]); +} + +/* never called for tail page */ +static void set_page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHeadHuge(page), page); + SetPagePrivate(&page[1]); +} + +static void clear_page_huge_active(struct page *page) +{ + VM_BUG_ON_PAGE(!PageHeadHuge(page), page); + ClearPagePrivate(&page[1]); +} + void free_huge_page(struct page *page) { /* @@ -952,6 +977,7 @@ void free_huge_page(struct page *page) restore_reserve = true; spin_lock(&hugetlb_lock); + clear_page_huge_active(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); if (restore_reserve) @@ -2972,6 +2998,7 @@ retry_avoidcopy: copy_user_huge_page(new_page, old_page, address, vma, pages_per_huge_page(h)); __SetPageUptodate(new_page); + set_page_huge_active(new_page); mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); @@ -3084,6 +3111,7 @@ retry: } clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); + set_page_huge_active(page); if (vma->vm_flags & VM_MAYSHARE) { int err; @@ -3913,19 +3941,26 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) bool isolate_huge_page(struct page *page, struct list_head *list) { + bool ret = true; + VM_BUG_ON_PAGE(!PageHead(page), page); - if (!get_page_unless_zero(page)) - return false; spin_lock(&hugetlb_lock); + if (!page_huge_active(page) || !get_page_unless_zero(page)) { + ret = false; + goto unlock; + } + clear_page_huge_active(page); list_move_tail(&page->lru, list); +unlock: spin_unlock(&hugetlb_lock); - return true; + return ret; } void putback_active_hugepage(struct page *page) { VM_BUG_ON_PAGE(!PageHead(page), page); spin_lock(&hugetlb_lock); + set_page_huge_active(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); spin_unlock(&hugetlb_lock); put_page(page); -- cgit v1.2.2 From 7e1f049efb86bd86c06b80eeac0ef80cdeb8c0e7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 15 Apr 2015 16:14:41 -0700 Subject: mm: hugetlb: cleanup using paeg_huge_active() Now we have an easy access to hugepages' activeness, so existing helpers to get the information can be cleaned up. [akpm@linux-foundation.org: s/PageHugeActive/page_huge_active/] Signed-off-by: Naoya Horiguchi Cc: Hugh Dickins Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 42 +++++------------------------------------- 1 file changed, 5 insertions(+), 37 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 05407831016b..271e4432734c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3896,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, #ifdef CONFIG_MEMORY_FAILURE -/* Should be called in hugetlb_lock */ -static int is_hugepage_on_freelist(struct page *hpage) -{ - struct page *page; - struct page *tmp; - struct hstate *h = page_hstate(hpage); - int nid = page_to_nid(hpage); - - list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) - if (page == hpage) - return 1; - return 0; -} - /* * This function is called from memory failure code. * Assume the caller holds page lock of the head page. @@ -3921,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) int ret = -EBUSY; spin_lock(&hugetlb_lock); - if (is_hugepage_on_freelist(hpage)) { + /* + * Just checking !page_huge_active is not enough, because that could be + * an isolated/hwpoisoned hugepage (which have >0 refcount). + */ + if (!page_huge_active(hpage) && !page_count(hpage)) { /* * Hwpoisoned hugepage isn't linked to activelist or freelist, * but dangling hpage->lru can trigger list-debug warnings @@ -3965,25 +3955,3 @@ void putback_active_hugepage(struct page *page) spin_unlock(&hugetlb_lock); put_page(page); } - -bool is_hugepage_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHuge(page), page); - /* - * This function can be called for a tail page because the caller, - * scan_movable_pages, scans through a given pfn-range which typically - * covers one memory block. In systems using gigantic hugepage (1GB - * for x86_64,) a hugepage is larger than a memory block, and we don't - * support migrating such large hugepages for now, so return false - * when called for tail pages. - */ - if (PageTail(page)) - return false; - /* - * Refcount of a hwpoisoned hugepages is 1, but they are not active, - * so we should return false for them. - */ - if (unlikely(PageHWPoison(page))) - return false; - return page_count(page) > 0; -} -- cgit v1.2.2