From 75353bed36cfbbfb55bbde0896bbf5a02d9ba355 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:27:03 -0700 Subject: mm/hugetlb.c: fix duplicate variable It's confusing that set_max_huge_pages() contained two different variables named "ret", and although the code works correctly this should be fixed. The inner of the two variables can simply be removed. Spotted by sparse. Signed-off-by: Adrian Bunk Cc: "KOSAKI Motohiro" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab171274ef21..2c5c9ee4220d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -603,7 +603,6 @@ static unsigned long set_max_huge_pages(unsigned long count) } while (count > persistent_huge_pages) { - int ret; /* * If this allocation races such that we no longer need the * page, free_huge_page will handle it by freeing the page -- cgit v1.2.2 From fc1b8a73dd71226902a11928dd5500326e101df9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:22 -0700 Subject: hugetlb: move hugetlb_acct_memory() This is a patchset to give reliable behaviour to a process that successfully calls mmap(MAP_PRIVATE) on a hugetlbfs file. Currently, it is possible for the process to be killed due to a small hugepage pool size even if it calls mlock(). MAP_SHARED mappings on hugetlbfs reserve huge pages at mmap() time. This guarantees all future faults against the mapping will succeed. This allows local allocations at first use improving NUMA locality whilst retaining reliability. MAP_PRIVATE mappings do not reserve pages. This can result in an application being SIGKILLed later if a huge page is not available at fault time. This makes huge pages usage very ill-advised in some cases as the unexpected application failure cannot be detected and handled as it is immediately fatal. Although an application may force instantiation of the pages using mlock(), this may lead to poor memory placement and the process may still be killed when performing COW. This patchset introduces a reliability guarantee for the process which creates a private mapping, i.e. the process that calls mmap() on a hugetlbfs file successfully. The first patch of the set is purely mechanical code move to make later diffs easier to read. The second patch will guarantee faults up until the process calls fork(). After patch two, as long as the child keeps the mappings, the parent is no longer guaranteed to be reliable. Patch 3 guarantees that the parent will always successfully COW by unmapping the pages from the child in the event there are insufficient pages in the hugepage pool in allocate a new page, be it via a static or dynamic pool. Existing hugepage-aware applications are unlikely to be affected by this change. For much of hugetlbfs's history, pages were pre-faulted at mmap() time or mmap() failed which acts in a reserve-like manner. If the pool is sized correctly already so that parent and child can fault reliably, the application will not even notice the reserves. It's only when the pool is too small for the application to function perfectly reliably that the reserves come into play. Credit goes to Andy Whitcroft for cleaning up a number of mistakes during review before the patches were released. This patch: A later patch in this set needs to call hugetlb_acct_memory() before it is defined. This patch moves the function without modification. This makes later diffs easier to read. Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 82 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 41 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2c5c9ee4220d..a4dbba8965f3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -716,6 +716,47 @@ unsigned long hugetlb_total_pages(void) return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); } +static int hugetlb_acct_memory(long delta) +{ + int ret = -ENOMEM; + + spin_lock(&hugetlb_lock); + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + if (delta > 0) { + if (gather_surplus_pages(delta) < 0) + goto out; + + if (delta > cpuset_mems_nr(free_huge_pages_node)) { + return_unused_surplus_pages(delta); + goto out; + } + } + + ret = 0; + if (delta < 0) + return_unused_surplus_pages((unsigned long) -delta); + +out: + spin_unlock(&hugetlb_lock); + return ret; +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -1248,47 +1289,6 @@ static long region_truncate(struct list_head *head, long end) return chg; } -static int hugetlb_acct_memory(long delta) -{ - int ret = -ENOMEM; - - spin_lock(&hugetlb_lock); - /* - * When cpuset is configured, it breaks the strict hugetlb page - * reservation as the accounting is done on a global variable. Such - * reservation is completely rubbish in the presence of cpuset because - * the reservation is not checked against page availability for the - * current cpuset. Application can still potentially OOM'ed by kernel - * with lack of free htlb page in cpuset that the task is in. - * Attempt to enforce strict accounting with cpuset is almost - * impossible (or too ugly) because cpuset is too fluid that - * task or memory node can be dynamically moved between cpusets. - * - * The change of semantics for shared hugetlb mapping with cpuset is - * undesirable. However, in order to preserve some of the semantics, - * we fall back to check against current free page availability as - * a best attempt and hopefully to minimize the impact of changing - * semantics that cpuset has. - */ - if (delta > 0) { - if (gather_surplus_pages(delta) < 0) - goto out; - - if (delta > cpuset_mems_nr(free_huge_pages_node)) { - return_unused_surplus_pages(delta); - goto out; - } - } - - ret = 0; - if (delta < 0) - return_unused_surplus_pages((unsigned long) -delta); - -out: - spin_unlock(&hugetlb_lock); - return ret; -} - int hugetlb_reserve_pages(struct inode *inode, long from, long to) { long ret, chg; -- cgit v1.2.2 From a1e78772d72b2616ed20e54896e68e0e7044854e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:23 -0700 Subject: hugetlb: reserve huge pages for reliable MAP_PRIVATE hugetlbfs mappings until fork() This patch reserves huge pages at mmap() time for MAP_PRIVATE mappings in a similar manner to the reservations taken for MAP_SHARED mappings. The reserve count is accounted both globally and on a per-VMA basis for private mappings. This guarantees that a process that successfully calls mmap() will successfully fault all pages in the future unless fork() is called. The characteristics of private mappings of hugetlbfs files behaviour after this patch are; 1. The process calling mmap() is guaranteed to succeed all future faults until it forks(). 2. On fork(), the parent may die due to SIGKILL on writes to the private mapping if enough pages are not available for the COW. For reasonably reliable behaviour in the face of a small huge page pool, children of hugepage-aware processes should not reference the mappings; such as might occur when fork()ing to exec(). 3. On fork(), the child VMAs inherit no reserves. Reads on pages already faulted by the parent will succeed. Successful writes will depend on enough huge pages being free in the pool. 4. Quotas of the hugetlbfs mount are checked at reserve time for the mapper and at fault time otherwise. Before this patch, all reads or writes in the child potentially needs page allocations that can later lead to the death of the parent. This applies to reads and writes of uninstantiated pages as well as COW. After the patch it is only a write to an instantiated page that causes problems. Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 158 ++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 39 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a4dbba8965f3..0af500db3632 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,69 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +/* + * These helpers are used to track how many pages are reserved for + * faults in a MAP_PRIVATE mapping. Only the process that called mmap() + * is guaranteed to have their future faults succeed. + * + * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * the reserve counters are updated with the hugetlb_lock held. It is safe + * to reset the VMA at fork() time as it is not in use yet and there is no + * chance of the global counters getting corrupted as a result of the values. + */ +static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + return (unsigned long)vma->vm_private_data; + return 0; +} + +static void set_vma_resv_huge_pages(struct vm_area_struct *vma, + unsigned long reserve) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + vma->vm_private_data = (void *)reserve; +} + +/* Decrement the reserved pages in the hugepage pool by one */ +static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) { + /* Shared mappings always use reserves */ + resv_huge_pages--; + } else { + /* + * Only the process that called mmap() has reserves for + * private mappings. + */ + if (vma_resv_huge_pages(vma)) { + resv_huge_pages--; + reserve = (unsigned long)vma->vm_private_data - 1; + vma->vm_private_data = (void *)reserve; + } + } +} + +void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + vma->vm_private_data = (void *)0; +} + +/* Returns true if the VMA has associated reserve pages */ +static int vma_has_private_reserves(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) + return 0; + if (!vma_resv_huge_pages(vma)) + return 0; + return 1; +} + static void clear_huge_page(struct page *page, unsigned long addr) { int i; @@ -101,6 +164,15 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, struct zone *zone; struct zoneref *z; + /* + * A child process with MAP_PRIVATE mappings created by their parent + * have no page reserves. This check ensures that reservations are + * not "stolen". The child may still get SIGKILLed + */ + if (!vma_has_private_reserves(vma) && + free_huge_pages - resv_huge_pages == 0) + return NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); @@ -111,8 +183,8 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, list_del(&page->lru); free_huge_pages--; free_huge_pages_node[nid]--; - if (vma && vma->vm_flags & VM_MAYSHARE) - resv_huge_pages--; + decrement_hugepage_resv_vma(vma); + break; } } @@ -461,55 +533,40 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) } } - -static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, - unsigned long addr) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr) { struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned int chg = 0; + + /* + * Processes that did not create the mapping will have no reserves and + * will not have accounted against quota. Check that the quota can be + * made before satisfying the allocation + */ + if (!vma_has_private_reserves(vma)) { + chg = 1; + if (hugetlb_get_quota(inode->i_mapping, chg)) + return ERR_PTR(-ENOSPC); + } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(vma, addr); spin_unlock(&hugetlb_lock); - return page ? page : ERR_PTR(-VM_FAULT_OOM); -} -static struct page *alloc_huge_page_private(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page = NULL; - - if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) - return ERR_PTR(-VM_FAULT_SIGBUS); - - spin_lock(&hugetlb_lock); - if (free_huge_pages > resv_huge_pages) - page = dequeue_huge_page_vma(vma, addr); - spin_unlock(&hugetlb_lock); if (!page) { page = alloc_buddy_huge_page(vma, addr); if (!page) { - hugetlb_put_quota(vma->vm_file->f_mapping, 1); + hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_OOM); } } - return page; -} -static struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - struct address_space *mapping = vma->vm_file->f_mapping; - - if (vma->vm_flags & VM_MAYSHARE) - page = alloc_huge_page_shared(vma, addr); - else - page = alloc_huge_page_private(vma, addr); + set_page_refcounted(page); + set_page_private(page, (unsigned long) mapping); - if (!IS_ERR(page)) { - set_page_refcounted(page); - set_page_private(page, (unsigned long) mapping); - } return page; } @@ -757,6 +814,13 @@ out: return ret; } +static void hugetlb_vm_op_close(struct vm_area_struct *vma) +{ + unsigned long reserve = vma_resv_huge_pages(vma); + if (reserve) + hugetlb_acct_memory(-reserve); +} + /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -771,6 +835,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, + .close = hugetlb_vm_op_close, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, @@ -1289,11 +1354,25 @@ static long region_truncate(struct list_head *head, long end) return chg; } -int hugetlb_reserve_pages(struct inode *inode, long from, long to) +int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma) { long ret, chg; - chg = region_chg(&inode->i_mapping->private_list, from, to); + /* + * Shared mappings base their reservation on the number of pages that + * are already allocated on behalf of the file. Private mappings need + * to reserve the full area even if read-only as mprotect() may be + * called to make the mapping read-write. Assume !vma is a shm mapping + */ + if (!vma || vma->vm_flags & VM_SHARED) + chg = region_chg(&inode->i_mapping->private_list, from, to); + else { + chg = to - from; + set_vma_resv_huge_pages(vma, chg); + } + if (chg < 0) return chg; @@ -1304,7 +1383,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) hugetlb_put_quota(inode->i_mapping, chg); return ret; } - region_add(&inode->i_mapping->private_list, from, to); + if (!vma || vma->vm_flags & VM_SHARED) + region_add(&inode->i_mapping->private_list, from, to); return 0; } -- cgit v1.2.2 From 04f2cbe35699d22dbf428373682ead85ca1240f5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:25 -0700 Subject: hugetlb: guarantee that COW faults for a process that called mmap(MAP_PRIVATE) on hugetlbfs will succeed After patch 2 in this series, a process that successfully calls mmap() for a MAP_PRIVATE mapping will be guaranteed to successfully fault until a process calls fork(). At that point, the next write fault from the parent could fail due to COW if the child still has a reference. We only reserve pages for the parent but a copy must be made to avoid leaking data from the parent to the child after fork(). Reserves could be taken for both parent and child at fork time to guarantee faults but if the mapping is large it is highly likely we will not have sufficient pages for the reservation, and it is common to fork only to exec() immediatly after. A failure here would be very undesirable. Note that the current behaviour of mainline with MAP_PRIVATE pages is pretty bad. The following situation is allowed to occur today. 1. Process calls mmap(MAP_PRIVATE) 2. Process calls mlock() to fault all pages and makes sure it succeeds 3. Process forks() 4. Process writes to MAP_PRIVATE mapping while child still exists 5. If the COW fails at this point, the process gets SIGKILLed even though it had taken care to ensure the pages existed This patch improves the situation by guaranteeing the reliability of the process that successfully calls mmap(). When the parent performs COW, it will try to satisfy the allocation without using reserves. If that fails the parent will steal the page leaving any children without a page. Faults from the child after that point will result in failure. If the child COW happens first, an attempt will be made to allocate the page without reserves and the child will get SIGKILLed on failure. To summarise the new behaviour: 1. If the original mapper performs COW on a private mapping with multiple references, it will attempt to allocate a hugepage from the pool or the buddy allocator without using the existing reserves. On fail, VMAs mapping the same area are traversed and the page being COW'd is unmapped where found. It will then steal the original page as the last mapper in the normal way. 2. The VMAs the pages were unmapped from are flagged to note that pages with data no longer exist. Future no-page faults on those VMAs will terminate the process as otherwise it would appear that data was corrupted. A warning is printed to the console that this situation occured. 2. If the child performs COW first, it will attempt to satisfy the COW from the pool if there are enough pages or via the buddy allocator if overcommit is allowed and the buddy allocator can satisfy the request. If it fails, the child will be killed. If the pool is large enough, existing applications will not notice that the reserves were a factor. Existing applications depending on the no-reserves been set are unlikely to exist as for much of the history of hugetlbfs, pages were prefaulted at mmap(), allocating the pages at that point or failing the mmap(). [npiggin@suse.de: fix CONFIG_HUGETLB=n build] Signed-off-by: Mel Gorman Acked-by: Adam Litke Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 183 insertions(+), 18 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0af500db3632..a2d29b84501f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,9 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +#define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1)) +#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2)) +#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. Only the process that called mmap() @@ -54,17 +57,32 @@ static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); if (!(vma->vm_flags & VM_SHARED)) - return (unsigned long)vma->vm_private_data; + return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK; return 0; } static void set_vma_resv_huge_pages(struct vm_area_struct *vma, unsigned long reserve) { + unsigned long flags; VM_BUG_ON(!is_vm_hugetlb_page(vma)); VM_BUG_ON(vma->vm_flags & VM_SHARED); - vma->vm_private_data = (void *)reserve; + flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK; + vma->vm_private_data = (void *)(reserve | flags); +} + +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +{ + unsigned long reserveflags = (unsigned long)vma->vm_private_data; + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + vma->vm_private_data = (void *)(reserveflags | flags); +} + +static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + return ((unsigned long)vma->vm_private_data & flag) != 0; } /* Decrement the reserved pages in the hugepage pool by one */ @@ -78,14 +96,18 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) * Only the process that called mmap() has reserves for * private mappings. */ - if (vma_resv_huge_pages(vma)) { + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + unsigned long flags, reserve; resv_huge_pages--; + flags = (unsigned long)vma->vm_private_data & + HPAGE_RESV_MASK; reserve = (unsigned long)vma->vm_private_data - 1; - vma->vm_private_data = (void *)reserve; + vma->vm_private_data = (void *)(reserve | flags); } } } +/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); @@ -153,7 +175,7 @@ static struct page *dequeue_huge_page(void) } static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, - unsigned long address) + unsigned long address, int avoid_reserve) { int nid; struct page *page = NULL; @@ -173,6 +195,10 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, free_huge_pages - resv_huge_pages == 0) return NULL; + /* If reserves cannot be used, ensure enough pages are in the pool */ + if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) + return NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); @@ -183,7 +209,9 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, list_del(&page->lru); free_huge_pages--; free_huge_pages_node[nid]--; - decrement_hugepage_resv_vma(vma); + + if (!avoid_reserve) + decrement_hugepage_resv_vma(vma); break; } @@ -534,7 +562,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) } static struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int avoid_reserve) { struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; @@ -546,14 +574,15 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, * will not have accounted against quota. Check that the quota can be * made before satisfying the allocation */ - if (!vma_has_private_reserves(vma)) { + if (!(vma->vm_flags & VM_SHARED) && + !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { chg = 1; if (hugetlb_get_quota(inode->i_mapping, chg)) return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(vma, addr); + page = dequeue_huge_page_vma(vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); if (!page) { @@ -909,7 +938,7 @@ nomem: } void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, struct page *ref_page) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -937,6 +966,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pmd_unshare(mm, &address, ptep)) continue; + /* + * If a reference page is supplied, it is because a specific + * page is being unmapped, not a range. Ensure the page we + * are about to unmap is the actual page of interest. + */ + if (ref_page) { + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte)) + continue; + page = pte_page(pte); + if (page != ref_page) + continue; + + /* + * Mark the VMA as having unmapped its page so that + * future faults in this VMA will fail rather than + * looking like data was lost + */ + set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } + pte = huge_ptep_get_and_clear(mm, address, ptep); if (huge_pte_none(pte)) continue; @@ -955,7 +1005,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, struct page *ref_page) { /* * It is undesirable to test vma->vm_file as it should be non-null @@ -967,19 +1017,68 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, */ if (vma->vm_file) { spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); - __unmap_hugepage_range(vma, start, end); + __unmap_hugepage_range(vma, start, end, ref_page); spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); } } +/* + * This is called when the original mapper is failing to COW a MAP_PRIVATE + * mappping it owns the reserve page for. The intention is to unmap the page + * from other VMAs and let the children be SIGKILLed if they are faulting the + * same region. + */ +int unmap_ref_private(struct mm_struct *mm, + struct vm_area_struct *vma, + struct page *page, + unsigned long address) +{ + struct vm_area_struct *iter_vma; + struct address_space *mapping; + struct prio_tree_iter iter; + pgoff_t pgoff; + + /* + * vm_pgoff is in PAGE_SIZE units, hence the different calculation + * from page cache lookup which is in HPAGE_SIZE units. + */ + address = address & huge_page_mask(hstate_vma(vma)); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + (vma->vm_pgoff >> PAGE_SHIFT); + mapping = (struct address_space *)page_private(page); + + vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + /* Do not unmap the current VMA */ + if (iter_vma == vma) + continue; + + /* + * Unmap the page from other VMAs without their own reserves. + * They get marked to be SIGKILLed if they fault in these + * areas. This is because a future no-page fault on this VMA + * could insert a zeroed page instead of the data existing + * from the time of fork. This would look like data corruption + */ + if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) + unmap_hugepage_range(iter_vma, + address, address + HPAGE_SIZE, + page); + } + + return 1; +} + static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte) + unsigned long address, pte_t *ptep, pte_t pte, + struct page *pagecache_page) { struct page *old_page, *new_page; int avoidcopy; + int outside_reserve = 0; old_page = pte_page(pte); +retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ avoidcopy = (page_count(old_page) == 1); @@ -988,11 +1087,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } + /* + * If the process that created a MAP_PRIVATE mapping is about to + * perform a COW due to a shared page count, attempt to satisfy + * the allocation without using the existing reserves. The pagecache + * page is used to determine if the reserve at this address was + * consumed or not. If reserves were used, a partial faulted mapping + * at the time of fork() could consume its reserves on COW instead + * of the full address range. + */ + if (!(vma->vm_flags & VM_SHARED) && + is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + old_page != pagecache_page) + outside_reserve = 1; + page_cache_get(old_page); - new_page = alloc_huge_page(vma, address); + new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { page_cache_release(old_page); + + /* + * If a process owning a MAP_PRIVATE mapping fails to COW, + * it is due to references held by a child and an insufficient + * huge page pool. To guarantee the original mappers + * reliability, unmap the page from child processes. The child + * may get SIGKILLed if it later faults. + */ + if (outside_reserve) { + BUG_ON(huge_pte_none(pte)); + if (unmap_ref_private(mm, vma, old_page, address)) { + BUG_ON(page_count(old_page) != 1); + BUG_ON(huge_pte_none(pte)); + goto retry_avoidcopy; + } + WARN_ON_ONCE(1); + } + return -PTR_ERR(new_page); } @@ -1015,6 +1146,20 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } +/* Return the pagecache page at a given address within a VMA */ +static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, + unsigned long address) +{ + struct address_space *mapping; + unsigned long idx; + + mapping = vma->vm_file->f_mapping; + idx = ((address - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + return find_lock_page(mapping, idx); +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { @@ -1025,6 +1170,18 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping; pte_t new_pte; + /* + * Currently, we are forced to kill the process in the event the + * original mapper has unmapped pages from the child due to a failed + * COW. Warn that such a situation has occured as it may not be obvious + */ + if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { + printk(KERN_WARNING + "PID %d killed due to inadequate hugepage pool\n", + current->pid); + return ret; + } + mapping = vma->vm_file->f_mapping; idx = ((address - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); @@ -1039,7 +1196,7 @@ retry: size = i_size_read(mapping->host) >> HPAGE_SHIFT; if (idx >= size) goto out; - page = alloc_huge_page(vma, address); + page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { ret = -PTR_ERR(page); goto out; @@ -1081,7 +1238,7 @@ retry: if (write_access && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } spin_unlock(&mm->page_table_lock); @@ -1126,8 +1283,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) - ret = hugetlb_cow(mm, vma, address, ptep, entry); + if (write_access && !pte_write(entry)) { + struct page *page; + page = hugetlbfs_pagecache_page(vma, address); + ret = hugetlb_cow(mm, vma, address, ptep, entry, page); + if (page) { + unlock_page(page); + put_page(page); + } + } spin_unlock(&mm->page_table_lock); mutex_unlock(&hugetlb_instantiation_mutex); @@ -1371,6 +1535,7 @@ int hugetlb_reserve_pages(struct inode *inode, else { chg = to - from; set_vma_resv_huge_pages(vma, chg); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } if (chg < 0) -- cgit v1.2.2 From e7c4b0bfd025f71cf7624b7c1be174f63caade33 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:26 -0700 Subject: huge page private reservation review cleanups Create some new accessors for vma private data to cut down on and contain the casts. Encapsulates the huge and small page offset calculations. Also adds a couple of VM_BUG_ONs for consistency. [akpm@linux-foundation.org: Make things static] Signed-off-by: Andy Whitcroft Acked-by: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 58 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 13 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a2d29b84501f..3e873f0101fb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,28 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +/* + * Convert the address within this vma to the page offset within + * the mapping, in base page units. + */ +static pgoff_t vma_page_offset(struct vm_area_struct *vma, + unsigned long address) +{ + return ((address - vma->vm_start) >> PAGE_SHIFT) + + (vma->vm_pgoff >> PAGE_SHIFT); +} + +/* + * Convert the address within this vma to the page offset within + * the mapping, in pagecache page units; huge pages here. + */ +static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, + unsigned long address) +{ + return ((address - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); +} + #define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1)) #define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2)) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) @@ -53,36 +75,48 @@ static DEFINE_SPINLOCK(hugetlb_lock); * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. */ +static unsigned long get_vma_private_data(struct vm_area_struct *vma) +{ + return (unsigned long)vma->vm_private_data; +} + +static void set_vma_private_data(struct vm_area_struct *vma, + unsigned long value) +{ + vma->vm_private_data = (void *)value; +} + static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); if (!(vma->vm_flags & VM_SHARED)) - return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK; + return get_vma_private_data(vma) & ~HPAGE_RESV_MASK; return 0; } static void set_vma_resv_huge_pages(struct vm_area_struct *vma, unsigned long reserve) { - unsigned long flags; VM_BUG_ON(!is_vm_hugetlb_page(vma)); VM_BUG_ON(vma->vm_flags & VM_SHARED); - flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK; - vma->vm_private_data = (void *)(reserve | flags); + set_vma_private_data(vma, + (get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve); } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - unsigned long reserveflags = (unsigned long)vma->vm_private_data; VM_BUG_ON(!is_vm_hugetlb_page(vma)); - vma->vm_private_data = (void *)(reserveflags | flags); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - return ((unsigned long)vma->vm_private_data & flag) != 0; + + return (get_vma_private_data(vma) & flag) != 0; } /* Decrement the reserved pages in the hugepage pool by one */ @@ -1151,11 +1185,10 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping; - unsigned long idx; + pgoff_t idx; mapping = vma->vm_file->f_mapping; - idx = ((address - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + idx = vma_pagecache_offset(vma, address); return find_lock_page(mapping, idx); } @@ -1164,7 +1197,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; - unsigned long idx; + pgoff_t idx; unsigned long size; struct page *page; struct address_space *mapping; @@ -1183,8 +1216,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, } mapping = vma->vm_file->f_mapping; - idx = ((address - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + idx = vma_pagecache_offset(vma, address); /* * Use page lock to guard against racing truncation -- cgit v1.2.2 From 9682290484370ce68ba23cd2ec2838e301934199 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:29 -0700 Subject: hugetlb: move reservation region support earlier The following patch will require use of the reservation regions support. Move this earlier in the file. No changes have been made to this code. Signed-off-by: Andy Whitcroft Cc: Mel Gorman Acked-by: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 246 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 125 insertions(+), 121 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e873f0101fb..05bc9af4fca9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -40,6 +40,131 @@ static int hugetlb_next_nid; */ static DEFINE_SPINLOCK(hugetlb_lock); +/* + * Region tracking -- allows tracking of reservations and instantiated pages + * across the pages in a mapping. + */ +struct file_region { + struct list_head link; + long from; + long to; +}; + +static long region_add(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg, *trg; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* If this area reaches higher then extend our area to + * include it completely. If this is not the first area + * which we intend to reuse, free it. */ + if (rg->to > t) + t = rg->to; + if (rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + nrg->from = f; + nrg->to = t; + return 0; +} + +static long region_chg(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg; + long chg = 0; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* If we are below the current region then a new region is required. + * Subtle, allocate a new region at the position but make it zero + * size such that we can guarantee to record the reservation. */ + if (&rg->link == head || t < rg->from) { + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + list_add(&nrg->link, rg->link.prev); + + return t - f; + } + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + list_for_each_entry(rg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + return chg; + + /* We overlap with this area, if it extends futher than + * us then we must extend ourselves. Account for its + * existing reservation. */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + } + return chg; +} + +static long region_truncate(struct list_head *head, long end) +{ + struct file_region *rg, *trg; + long chg = 0; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (end <= rg->to) + break; + if (&rg->link == head) + return 0; + + /* If we are in the middle of a region then adjust it. */ + if (end > rg->from) { + chg = rg->to - end; + rg->to = end; + rg = list_entry(rg->link.next, typeof(*rg), link); + } + + /* Drop any remaining regions. */ + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + chg += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + } + return chg; +} + /* * Convert the address within this vma to the page offset within * the mapping, in base page units. @@ -1429,127 +1554,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } -struct file_region { - struct list_head link; - long from; - long to; -}; - -static long region_add(struct list_head *head, long f, long t) -{ - struct file_region *rg, *nrg, *trg; - - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - - /* Check for and consume any regions we now overlap with. */ - nrg = rg; - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - break; - - /* If this area reaches higher then extend our area to - * include it completely. If this is not the first area - * which we intend to reuse, free it. */ - if (rg->to > t) - t = rg->to; - if (rg != nrg) { - list_del(&rg->link); - kfree(rg); - } - } - nrg->from = f; - nrg->to = t; - return 0; -} - -static long region_chg(struct list_head *head, long f, long t) -{ - struct file_region *rg, *nrg; - long chg = 0; - - /* Locate the region we are before or in. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; - - /* If we are below the current region then a new region is required. - * Subtle, allocate a new region at the position but make it zero - * size such that we can guarantee to record the reservation. */ - if (&rg->link == head || t < rg->from) { - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - list_add(&nrg->link, rg->link.prev); - - return t - f; - } - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - chg = t - f; - - /* Check for and consume any regions we now overlap with. */ - list_for_each_entry(rg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - return chg; - - /* We overlap with this area, if it extends futher than - * us then we must extend ourselves. Account for its - * existing reservation. */ - if (rg->to > t) { - chg += rg->to - t; - t = rg->to; - } - chg -= rg->to - rg->from; - } - return chg; -} - -static long region_truncate(struct list_head *head, long end) -{ - struct file_region *rg, *trg; - long chg = 0; - - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (end <= rg->to) - break; - if (&rg->link == head) - return 0; - - /* If we are in the middle of a region then adjust it. */ - if (end > rg->from) { - chg = rg->to - end; - rg->to = end; - rg = list_entry(rg->link.next, typeof(*rg), link); - } - - /* Drop any remaining regions. */ - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - chg += rg->to - rg->from; - list_del(&rg->link); - kfree(rg); - } - return chg; -} - int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma) -- cgit v1.2.2 From c37f9fb11c976ffc08200d631dada6dcbfd07ea4 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:30 -0700 Subject: hugetlb: allow huge page mappings to be created without reservations By default all shared mappings and most private mappings now have reservations associated with them. This improves semantics by providing allocation guarentees to the mapper. However a small number of applications may attempt to make very large sparse mappings, with these strict reservations the system will never be able to honour the mapping. This patch set brings MAP_NORESERVE support to hugetlb files. This allows new mappings to be made to hugetlbfs files without an associated reservation, for both shared and private mappings. This allows applications which want to create very sparse mappings to opt-out of the reservation system. Obviously as there is no reservation they are liable to fault at runtime if the huge page pool becomes exhausted; buyer beware. Signed-off-by: Andy Whitcroft Cc: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 5 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 05bc9af4fca9..72acbb29d2cc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -247,6 +247,9 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) /* Decrement the reserved pages in the hugepage pool by one */ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) { + if (vma->vm_flags & VM_NORESERVE) + return; + if (vma->vm_flags & VM_SHARED) { /* Shared mappings always use reserves */ resv_huge_pages--; @@ -720,25 +723,65 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) } } +/* + * Determine if the huge page at addr within the vma has an associated + * reservation. Where it does not we will need to logically increase + * reservation and actually increase quota before an allocation can occur. + * Where any new reservation would be required the reservation change is + * prepared, but not committed. Once the page has been quota'd allocated + * an instantiated the change should be committed via vma_commit_reservation. + * No action is required on failure. + */ +static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_pagecache_offset(vma, addr); + return region_chg(&inode->i_mapping->private_list, + idx, idx + 1); + + } else { + if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return 1; + } + + return 0; +} +static void vma_commit_reservation(struct vm_area_struct *vma, + unsigned long addr) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_pagecache_offset(vma, addr); + region_add(&inode->i_mapping->private_list, idx, idx + 1); + } +} + static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned int chg = 0; + unsigned int chg; /* * Processes that did not create the mapping will have no reserves and * will not have accounted against quota. Check that the quota can be * made before satisfying the allocation + * MAP_NORESERVE mappings may also need pages and quota allocated + * if no reserve mapping overlaps. */ - if (!(vma->vm_flags & VM_SHARED) && - !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - chg = 1; + chg = vma_needs_reservation(vma, addr); + if (chg < 0) + return ERR_PTR(chg); + if (chg) if (hugetlb_get_quota(inode->i_mapping, chg)) return ERR_PTR(-ENOSPC); - } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(vma, addr, avoid_reserve); @@ -755,6 +798,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); + vma_commit_reservation(vma, addr); + return page; } @@ -1560,6 +1605,9 @@ int hugetlb_reserve_pages(struct inode *inode, { long ret, chg; + if (vma && vma->vm_flags & VM_NORESERVE) + return 0; + /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need -- cgit v1.2.2 From 84afd99b8398c9d73af8238aa3cd835858e3097a Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 23 Jul 2008 21:27:32 -0700 Subject: hugetlb reservations: fix hugetlb MAP_PRIVATE reservations across vma splits When a hugetlb mapping with a reservation is split, a new VMA is cloned from the original. This new VMA is a direct copy of the original including the reservation count. When this pair of VMAs are unmapped we will incorrect double account the unused reservation and the overall reservation count will be incorrect, in extreme cases it will wrap. The problem occurs when we split an existing VMA say to unmap a page in the middle. split_vma() will create a new VMA copying all fields from the original. As we are storing our reservation count in vm_private_data this is also copies, endowing the new VMA with a duplicate of the original VMA's reservation. Neither of the new VMAs can exhaust these reservations as they are too small, but when we unmap and close these VMAs we will incorrect credit the remainder twice and resv_huge_pages will become out of sync. This can lead to allocation failures on mappings with reservations and even to resv_huge_pages wrapping which prevents all subsequent hugepage allocations. The simple fix would be to correctly apportion the remaining reservation count when the split is made. However the only hook we have vm_ops->open only has the new VMA we do not know the identity of the preceeding VMA. Also even if we did have that VMA to hand we do not know how much of the reservation was consumed each side of the split. This patch therefore takes a different tack. We know that the whole of any private mapping (which has a reservation) has a reservation over its whole size. Any present pages represent consumed reservation. Therefore if we track the instantiated pages we can calculate the remaining reservation. This patch reuses the existing regions code to track the regions for which we have consumed reservation (ie. the instantiated pages), as each page is faulted in we record the consumption of reservation for the new page. When we need to return unused reservations at unmap time we simply count the consumed reservation region subtracting that from the whole of the map. During a VMA split the newly opened VMA will point to the same region map, as this map is offset oriented it remains valid for both of the split VMAs. This map is referenced counted so that it is removed when all VMAs which are part of the mmap are gone. Thanks to Adam Litke and Mel Gorman for their review feedback. Signed-off-by: Andy Whitcroft Acked-by: Mel Gorman Cc: Adam Litke Cc: Johannes Weiner Cc: Andy Whitcroft Cc: William Lee Irwin III Cc: Hugh Dickins Cc: Michael Kerrisk Cc: Jon Tollefson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 172 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 145 insertions(+), 27 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 72acbb29d2cc..65616941a383 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -43,6 +43,16 @@ static DEFINE_SPINLOCK(hugetlb_lock); /* * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. + * + * The region data structures are protected by a combination of the mmap_sem + * and the hugetlb_instantion_mutex. To access or modify a region the caller + * must either hold the mmap_sem for write, or the mmap_sem for read and + * the hugetlb_instantiation mutex: + * + * down_write(&mm->mmap_sem); + * or + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); */ struct file_region { struct list_head link; @@ -165,6 +175,30 @@ static long region_truncate(struct list_head *head, long end) return chg; } +static long region_count(struct list_head *head, long f, long t) +{ + struct file_region *rg; + long chg = 0; + + /* Locate each segment we overlap with, and count that overlap. */ + list_for_each_entry(rg, head, link) { + int seg_from; + int seg_to; + + if (rg->to <= f) + continue; + if (rg->from >= t) + break; + + seg_from = max(rg->from, f); + seg_to = min(rg->to, t); + + chg += seg_to - seg_from; + } + + return chg; +} + /* * Convert the address within this vma to the page offset within * the mapping, in base page units. @@ -187,9 +221,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); } -#define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1)) -#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2)) +/* + * Flags for MAP_PRIVATE reservations. These are stored in the bottom + * bits of the reservation map pointer, which are always clear due to + * alignment. + */ +#define HPAGE_RESV_OWNER (1UL << 0) +#define HPAGE_RESV_UNMAPPED (1UL << 1) #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) + /* * These helpers are used to track how many pages are reserved for * faults in a MAP_PRIVATE mapping. Only the process that called mmap() @@ -199,6 +239,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. + * + * The private mapping reservation is represented in a subtly different + * manner to a shared mapping. A shared mapping has a region map associated + * with the underlying file, this region map represents the backing file + * pages which have ever had a reservation assigned which this persists even + * after the page is instantiated. A private mapping has a region map + * associated with the original mmap which is attached to all VMAs which + * reference it, this region map represents those offsets which have consumed + * reservation ie. where pages have been instantiated. */ static unsigned long get_vma_private_data(struct vm_area_struct *vma) { @@ -211,22 +260,48 @@ static void set_vma_private_data(struct vm_area_struct *vma, vma->vm_private_data = (void *)value; } -static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma) +struct resv_map { + struct kref refs; + struct list_head regions; +}; + +struct resv_map *resv_map_alloc(void) +{ + struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); + if (!resv_map) + return NULL; + + kref_init(&resv_map->refs); + INIT_LIST_HEAD(&resv_map->regions); + + return resv_map; +} + +void resv_map_release(struct kref *ref) +{ + struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + + /* Clear out any active regions before we release the map. */ + region_truncate(&resv_map->regions, 0); + kfree(resv_map); +} + +static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); if (!(vma->vm_flags & VM_SHARED)) - return get_vma_private_data(vma) & ~HPAGE_RESV_MASK; + return (struct resv_map *)(get_vma_private_data(vma) & + ~HPAGE_RESV_MASK); return 0; } -static void set_vma_resv_huge_pages(struct vm_area_struct *vma, - unsigned long reserve) +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); VM_BUG_ON(vma->vm_flags & VM_SHARED); - set_vma_private_data(vma, - (get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve); + set_vma_private_data(vma, (get_vma_private_data(vma) & + HPAGE_RESV_MASK) | (unsigned long)map); } static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) @@ -253,19 +328,12 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) if (vma->vm_flags & VM_SHARED) { /* Shared mappings always use reserves */ resv_huge_pages--; - } else { + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { /* * Only the process that called mmap() has reserves for * private mappings. */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - unsigned long flags, reserve; - resv_huge_pages--; - flags = (unsigned long)vma->vm_private_data & - HPAGE_RESV_MASK; - reserve = (unsigned long)vma->vm_private_data - 1; - vma->vm_private_data = (void *)(reserve | flags); - } + resv_huge_pages--; } } @@ -282,7 +350,7 @@ static int vma_has_private_reserves(struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) return 0; - if (!vma_resv_huge_pages(vma)) + if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 0; return 1; } @@ -742,12 +810,19 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) return region_chg(&inode->i_mapping->private_list, idx, idx + 1); - } else { - if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 1; - } + } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + return 1; - return 0; + } else { + int err; + pgoff_t idx = vma_pagecache_offset(vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + err = region_chg(&reservations->regions, idx, idx + 1); + if (err < 0) + return err; + return 0; + } } static void vma_commit_reservation(struct vm_area_struct *vma, unsigned long addr) @@ -758,6 +833,13 @@ static void vma_commit_reservation(struct vm_area_struct *vma, if (vma->vm_flags & VM_SHARED) { pgoff_t idx = vma_pagecache_offset(vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); + + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + pgoff_t idx = vma_pagecache_offset(vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + /* Mark this page used in the map. */ + region_add(&reservations->regions, idx, idx + 1); } } @@ -1047,11 +1129,41 @@ out: return ret; } +static void hugetlb_vm_op_open(struct vm_area_struct *vma) +{ + struct resv_map *reservations = vma_resv_map(vma); + + /* + * This new VMA should share its siblings reservation map if present. + * The VMA will only ever have a valid reservation map pointer where + * it is being copied for another still existing VMA. As that VMA + * has a reference to the reservation map it cannot dissappear until + * after this open call completes. It is therefore safe to take a + * new reference here without additional locking. + */ + if (reservations) + kref_get(&reservations->refs); +} + static void hugetlb_vm_op_close(struct vm_area_struct *vma) { - unsigned long reserve = vma_resv_huge_pages(vma); - if (reserve) - hugetlb_acct_memory(-reserve); + struct resv_map *reservations = vma_resv_map(vma); + unsigned long reserve; + unsigned long start; + unsigned long end; + + if (reservations) { + start = vma_pagecache_offset(vma, vma->vm_start); + end = vma_pagecache_offset(vma, vma->vm_end); + + reserve = (end - start) - + region_count(&reservations->regions, start, end); + + kref_put(&reservations->refs, resv_map_release); + + if (reserve) + hugetlb_acct_memory(-reserve); + } } /* @@ -1068,6 +1180,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, + .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, }; @@ -1617,8 +1730,13 @@ int hugetlb_reserve_pages(struct inode *inode, if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); else { + struct resv_map *resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; + chg = to - from; - set_vma_resv_huge_pages(vma, chg); + + set_vma_resv_map(vma, resv_map); set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } -- cgit v1.2.2 From a858f7b2e9bb4eb665176dde5cf32eeaaf90f153 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:27:33 -0700 Subject: vma_page_offset() has no callees: drop it Hugh adds: vma_pagecache_offset() has a dangerously misleading name, since it's using hugepage units: rename it to vma_hugecache_offset(). [apw@shadowen.org: restack onto fixed MAP_PRIVATE reservations] [akpm@linux-foundation.org: vma_split conversion] Signed-off-by: Johannes Weiner Signed-off-by: Hugh Dickins Cc: Adam Litke Cc: Nishanth Aravamudan Cc: Andi Kleen Cc: Nick Piggin Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65616941a383..eda9642254a0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -199,22 +199,11 @@ static long region_count(struct list_head *head, long f, long t) return chg; } -/* - * Convert the address within this vma to the page offset within - * the mapping, in base page units. - */ -static pgoff_t vma_page_offset(struct vm_area_struct *vma, - unsigned long address) -{ - return ((address - vma->vm_start) >> PAGE_SHIFT) + - (vma->vm_pgoff >> PAGE_SHIFT); -} - /* * Convert the address within this vma to the page offset within * the mapping, in pagecache page units; huge pages here. */ -static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma, +static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma, unsigned long address) { return ((address - vma->vm_start) >> HPAGE_SHIFT) + @@ -806,7 +795,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_pagecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -815,7 +804,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) } else { int err; - pgoff_t idx = vma_pagecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(vma, addr); struct resv_map *reservations = vma_resv_map(vma); err = region_chg(&reservations->regions, idx, idx + 1); @@ -831,11 +820,11 @@ static void vma_commit_reservation(struct vm_area_struct *vma, struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_pagecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_pagecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(vma, addr); struct resv_map *reservations = vma_resv_map(vma); /* Mark this page used in the map. */ @@ -1153,8 +1142,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) unsigned long end; if (reservations) { - start = vma_pagecache_offset(vma, vma->vm_start); - end = vma_pagecache_offset(vma, vma->vm_end); + start = vma_hugecache_offset(vma, vma->vm_start); + end = vma_hugecache_offset(vma, vma->vm_end); reserve = (end - start) - region_count(&reservations->regions, start, end); @@ -1471,7 +1460,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, pgoff_t idx; mapping = vma->vm_file->f_mapping; - idx = vma_pagecache_offset(vma, address); + idx = vma_hugecache_offset(vma, address); return find_lock_page(mapping, idx); } @@ -1499,7 +1488,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, } mapping = vma->vm_file->f_mapping; - idx = vma_pagecache_offset(vma, address); + idx = vma_hugecache_offset(vma, address); /* * Use page lock to guard against racing truncation -- cgit v1.2.2 From b7ba30c679ed1eb7ed3ed8f281f6493282042bd4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:40 -0700 Subject: hugetlb: factor out prep_new_huge_page Needed to avoid code duplication in follow up patches. Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eda9642254a0..32dff4290c66 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -513,6 +513,16 @@ static int adjust_pool_surplus(int delta) return ret; } +static void prep_new_huge_page(struct page *page, int nid) +{ + set_compound_page_dtor(page, free_huge_page); + spin_lock(&hugetlb_lock); + nr_huge_pages++; + nr_huge_pages_node[nid]++; + spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ +} + static struct page *alloc_fresh_huge_page_node(int nid) { struct page *page; @@ -526,12 +536,7 @@ static struct page *alloc_fresh_huge_page_node(int nid) __free_pages(page, HUGETLB_PAGE_ORDER); return NULL; } - set_compound_page_dtor(page, free_huge_page); - spin_lock(&hugetlb_lock); - nr_huge_pages++; - nr_huge_pages_node[nid]++; - spin_unlock(&hugetlb_lock); - put_page(page); /* free it into the hugepage allocator */ + prep_new_huge_page(page, nid); } return page; -- cgit v1.2.2 From a5516438959d90b071ff0a484ce4f3f523dc3152 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:41 -0700 Subject: hugetlb: modular state for hugetlb page size The goal of this patchset is to support multiple hugetlb page sizes. This is achieved by introducing a new struct hstate structure, which encapsulates the important hugetlb state and constants (eg. huge page size, number of huge pages currently allocated, etc). The hstate structure is then passed around the code which requires these fields, they will do the right thing regardless of the exact hstate they are operating on. This patch adds the hstate structure, with a single global instance of it (default_hstate), and does the basic work of converting hugetlb to use the hstate. Future patches will add more hstate structures to allow for different hugetlbfs mounts to have different page sizes. [akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 368 ++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 201 insertions(+), 167 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 32dff4290c66..0d8153e25f09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,18 +22,12 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; -static unsigned long surplus_huge_pages; -static unsigned long nr_overcommit_huge_pages; unsigned long max_huge_pages; unsigned long sysctl_overcommit_huge_pages; -static struct list_head hugepage_freelists[MAX_NUMNODES]; -static unsigned int nr_huge_pages_node[MAX_NUMNODES]; -static unsigned int free_huge_pages_node[MAX_NUMNODES]; -static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int hugetlb_next_nid; + +struct hstate default_hstate; /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages @@ -203,11 +197,11 @@ static long region_count(struct list_head *head, long f, long t) * Convert the address within this vma to the page offset within * the mapping, in pagecache page units; huge pages here. */ -static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma, - unsigned long address) +static pgoff_t vma_hugecache_offset(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { - return ((address - vma->vm_start) >> HPAGE_SHIFT) + - (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); } /* @@ -309,20 +303,21 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) } /* Decrement the reserved pages in the hugepage pool by one */ -static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) +static void decrement_hugepage_resv_vma(struct hstate *h, + struct vm_area_struct *vma) { if (vma->vm_flags & VM_NORESERVE) return; if (vma->vm_flags & VM_SHARED) { /* Shared mappings always use reserves */ - resv_huge_pages--; + h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { /* * Only the process that called mmap() has reserves for * private mappings. */ - resv_huge_pages--; + h->resv_huge_pages--; } } @@ -344,12 +339,13 @@ static int vma_has_private_reserves(struct vm_area_struct *vma) return 1; } -static void clear_huge_page(struct page *page, unsigned long addr) +static void clear_huge_page(struct page *page, + unsigned long addr, unsigned long sz) { int i; might_sleep(); - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + for (i = 0; i < sz/PAGE_SIZE; i++) { cond_resched(); clear_user_highpage(page + i, addr + i * PAGE_SIZE); } @@ -359,41 +355,43 @@ static void copy_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; + struct hstate *h = hstate_vma(vma); might_sleep(); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + for (i = 0; i < pages_per_huge_page(h); i++) { cond_resched(); copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } -static void enqueue_huge_page(struct page *page) +static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); - list_add(&page->lru, &hugepage_freelists[nid]); - free_huge_pages++; - free_huge_pages_node[nid]++; + list_add(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct hstate *h) { int nid; struct page *page = NULL; for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; break; } } return page; } -static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, +static struct page *dequeue_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { int nid; @@ -411,26 +409,26 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, * not "stolen". The child may still get SIGKILLed */ if (!vma_has_private_reserves(vma) && - free_huge_pages - resv_huge_pages == 0) + h->free_huge_pages - h->resv_huge_pages == 0) return NULL; /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) + if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) return NULL; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + !list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; if (!avoid_reserve) - decrement_hugepage_resv_vma(vma); + decrement_hugepage_resv_vma(h, vma); break; } @@ -439,12 +437,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, return page; } -static void update_and_free_page(struct page *page) +static void update_and_free_page(struct hstate *h, struct page *page) { int i; - nr_huge_pages--; - nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { + + h->nr_huge_pages--; + h->nr_huge_pages_node[page_to_nid(page)]--; + for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); @@ -452,11 +451,16 @@ static void update_and_free_page(struct page *page) set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); } static void free_huge_page(struct page *page) { + /* + * Can't pass hstate in here because it is called from the + * compound page destructor. + */ + struct hstate *h = &default_hstate; int nid = page_to_nid(page); struct address_space *mapping; @@ -466,12 +470,12 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (surplus_huge_pages_node[nid]) { - update_and_free_page(page); - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + if (h->surplus_huge_pages_node[nid]) { + update_and_free_page(h, page); + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; } else { - enqueue_huge_page(page); + enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); if (mapping) @@ -483,7 +487,7 @@ static void free_huge_page(struct page *page) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. */ -static int adjust_pool_surplus(int delta) +static int adjust_pool_surplus(struct hstate *h, int delta) { static int prev_nid; int nid = prev_nid; @@ -496,15 +500,15 @@ static int adjust_pool_surplus(int delta) nid = first_node(node_online_map); /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !surplus_huge_pages_node[nid]) + if (delta < 0 && !h->surplus_huge_pages_node[nid]) continue; /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && surplus_huge_pages_node[nid] >= - nr_huge_pages_node[nid]) + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) continue; - surplus_huge_pages += delta; - surplus_huge_pages_node[nid] += delta; + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; ret = 1; break; } while (nid != prev_nid); @@ -513,46 +517,46 @@ static int adjust_pool_surplus(int delta) return ret; } -static void prep_new_huge_page(struct page *page, int nid) +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); - nr_huge_pages++; - nr_huge_pages_node[nid]++; + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); put_page(page); /* free it into the hugepage allocator */ } -static struct page *alloc_fresh_huge_page_node(int nid) +static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { __free_pages(page, HUGETLB_PAGE_ORDER); return NULL; } - prep_new_huge_page(page, nid); + prep_new_huge_page(h, page, nid); } return page; } -static int alloc_fresh_huge_page(void) +static int alloc_fresh_huge_page(struct hstate *h) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = hugetlb_next_nid; + start_nid = h->hugetlb_next_nid; do { - page = alloc_fresh_huge_page_node(hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); if (page) ret = 1; /* @@ -566,11 +570,11 @@ static int alloc_fresh_huge_page(void) * if we just successfully allocated a hugepage so that * the next caller gets hugepages on the next node. */ - next_nid = next_node(hugetlb_next_nid, node_online_map); + next_nid = next_node(h->hugetlb_next_nid, node_online_map); if (next_nid == MAX_NUMNODES) next_nid = first_node(node_online_map); - hugetlb_next_nid = next_nid; - } while (!page && hugetlb_next_nid != start_nid); + h->hugetlb_next_nid = next_nid; + } while (!page && h->hugetlb_next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -580,8 +584,8 @@ static int alloc_fresh_huge_page(void) return ret; } -static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct page *page; unsigned int nid; @@ -610,18 +614,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * per-node value is checked there. */ spin_lock(&hugetlb_lock); - if (surplus_huge_pages >= nr_overcommit_huge_pages) { + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { spin_unlock(&hugetlb_lock); return NULL; } else { - nr_huge_pages++; - surplus_huge_pages++; + h->nr_huge_pages++; + h->surplus_huge_pages++; } spin_unlock(&hugetlb_lock); page = alloc_pages(htlb_alloc_mask|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); spin_lock(&hugetlb_lock); if (page) { @@ -636,12 +640,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, /* * We incremented the global counters already */ - nr_huge_pages_node[nid]++; - surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[nid]++; + h->surplus_huge_pages_node[nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { - nr_huge_pages--; - surplus_huge_pages--; + h->nr_huge_pages--; + h->surplus_huge_pages--; __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); @@ -653,16 +657,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ -static int gather_surplus_pages(int delta) +static int gather_surplus_pages(struct hstate *h, int delta) { struct list_head surplus_list; struct page *page, *tmp; int ret, i; int needed, allocated; - needed = (resv_huge_pages + delta) - free_huge_pages; + needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { - resv_huge_pages += delta; + h->resv_huge_pages += delta; return 0; } @@ -673,7 +677,7 @@ static int gather_surplus_pages(int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(NULL, 0); + page = alloc_buddy_huge_page(h, NULL, 0); if (!page) { /* * We were not able to allocate enough pages to @@ -694,7 +698,8 @@ retry: * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock(&hugetlb_lock); - needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); + needed = (h->resv_huge_pages + delta) - + (h->free_huge_pages + allocated); if (needed > 0) goto retry; @@ -707,7 +712,7 @@ retry: * before they are reserved. */ needed += allocated; - resv_huge_pages += delta; + h->resv_huge_pages += delta; ret = 0; free: /* Free the needed pages to the hugetlb pool */ @@ -715,7 +720,7 @@ free: if ((--needed) < 0) break; list_del(&page->lru); - enqueue_huge_page(page); + enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ @@ -743,7 +748,8 @@ free: * allocated to satisfy the reservation must be explicitly freed if they were * never used. */ -static void return_unused_surplus_pages(unsigned long unused_resv_pages) +static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) { static int nid = -1; struct page *page; @@ -758,27 +764,27 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) unsigned long remaining_iterations = num_online_nodes(); /* Uncommit the reservation */ - resv_huge_pages -= unused_resv_pages; + h->resv_huge_pages -= unused_resv_pages; - nr_pages = min(unused_resv_pages, surplus_huge_pages); + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { nid = next_node(nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); - if (!surplus_huge_pages_node[nid]) + if (!h->surplus_huge_pages_node[nid]) continue; - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[nid]--; - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; nr_pages--; remaining_iterations = num_online_nodes(); } @@ -794,13 +800,14 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. */ -static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) +static int vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -809,7 +816,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) } else { int err; - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); err = region_chg(&reservations->regions, idx, idx + 1); @@ -818,18 +825,18 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) return 0; } } -static void vma_commit_reservation(struct vm_area_struct *vma, - unsigned long addr) +static void vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); /* Mark this page used in the map. */ @@ -840,6 +847,7 @@ static void vma_commit_reservation(struct vm_area_struct *vma, static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { + struct hstate *h = hstate_vma(vma); struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -852,7 +860,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, * MAP_NORESERVE mappings may also need pages and quota allocated * if no reserve mapping overlaps. */ - chg = vma_needs_reservation(vma, addr); + chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(chg); if (chg) @@ -860,11 +868,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(vma, addr, avoid_reserve); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(vma, addr); + page = alloc_buddy_huge_page(h, vma, addr); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_OOM); @@ -874,7 +882,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); - vma_commit_reservation(vma, addr); + vma_commit_reservation(h, vma, addr); return page; } @@ -882,21 +890,28 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, static int __init hugetlb_init(void) { unsigned long i; + struct hstate *h = &default_hstate; if (HPAGE_SHIFT == 0) return 0; + if (!h->order) { + h->order = HPAGE_SHIFT - PAGE_SHIFT; + h->mask = HPAGE_MASK; + } + for (i = 0; i < MAX_NUMNODES; ++i) - INIT_LIST_HEAD(&hugepage_freelists[i]); + INIT_LIST_HEAD(&h->hugepage_freelists[i]); - hugetlb_next_nid = first_node(node_online_map); + h->hugetlb_next_nid = first_node(node_online_map); for (i = 0; i < max_huge_pages; ++i) { - if (!alloc_fresh_huge_page()) + if (!alloc_fresh_huge_page(h)) break; } - max_huge_pages = free_huge_pages = nr_huge_pages = i; - printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); + max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; + printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n", + h->free_huge_pages); return 0; } module_init(hugetlb_init); @@ -922,34 +937,36 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM -static void try_to_free_low(unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count) { int i; for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; - list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { - if (count >= nr_huge_pages) + struct list_head *freel = &h->hugepage_freelists[i]; + list_for_each_entry_safe(page, next, freel, lru) { + if (count >= h->nr_huge_pages) return; if (PageHighMem(page)) continue; list_del(&page->lru); update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[page_to_nid(page)]--; + h->free_huge_pages--; + h->free_huge_pages_node[page_to_nid(page)]--; } } } #else -static inline void try_to_free_low(unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count) { } #endif -#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) +#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(unsigned long count) { unsigned long min_count, ret; + struct hstate *h = &default_hstate; /* * Increase the pool size @@ -963,19 +980,19 @@ static unsigned long set_max_huge_pages(unsigned long count) * within all the constraints specified by the sysctls. */ spin_lock(&hugetlb_lock); - while (surplus_huge_pages && count > persistent_huge_pages) { - if (!adjust_pool_surplus(-1)) + while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, -1)) break; } - while (count > persistent_huge_pages) { + while (count > persistent_huge_pages(h)) { /* * If this allocation races such that we no longer need the * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(); + ret = alloc_fresh_huge_page(h); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -997,21 +1014,21 @@ static unsigned long set_max_huge_pages(unsigned long count) * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. */ - min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; + min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); - try_to_free_low(min_count); - while (min_count < persistent_huge_pages) { - struct page *page = dequeue_huge_page(); + try_to_free_low(h, min_count); + while (min_count < persistent_huge_pages(h)) { + struct page *page = dequeue_huge_page(h); if (!page) break; - update_and_free_page(page); + update_and_free_page(h, page); } - while (count < persistent_huge_pages) { - if (!adjust_pool_surplus(1)) + while (count < persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, 1)) break; } out: - ret = persistent_huge_pages; + ret = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); return ret; } @@ -1041,9 +1058,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { + struct hstate *h = &default_hstate; proc_doulongvec_minmax(table, write, file, buffer, length, ppos); spin_lock(&hugetlb_lock); - nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; + h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; spin_unlock(&hugetlb_lock); return 0; } @@ -1052,37 +1070,40 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, int hugetlb_report_meminfo(char *buf) { + struct hstate *h = &default_hstate; return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" "HugePages_Surp: %5lu\n" "Hugepagesize: %5lu kB\n", - nr_huge_pages, - free_huge_pages, - resv_huge_pages, - surplus_huge_pages, - HPAGE_SIZE/1024); + h->nr_huge_pages, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } int hugetlb_report_node_meminfo(int nid, char *buf) { + struct hstate *h = &default_hstate; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", - nid, nr_huge_pages_node[nid], - nid, free_huge_pages_node[nid], - nid, surplus_huge_pages_node[nid]); + nid, h->nr_huge_pages_node[nid], + nid, h->free_huge_pages_node[nid], + nid, h->surplus_huge_pages_node[nid]); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { - return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); + struct hstate *h = &default_hstate; + return h->nr_huge_pages * pages_per_huge_page(h); } -static int hugetlb_acct_memory(long delta) +static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; @@ -1105,18 +1126,18 @@ static int hugetlb_acct_memory(long delta) * semantics that cpuset has. */ if (delta > 0) { - if (gather_surplus_pages(delta) < 0) + if (gather_surplus_pages(h, delta) < 0) goto out; - if (delta > cpuset_mems_nr(free_huge_pages_node)) { - return_unused_surplus_pages(delta); + if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { + return_unused_surplus_pages(h, delta); goto out; } } ret = 0; if (delta < 0) - return_unused_surplus_pages((unsigned long) -delta); + return_unused_surplus_pages(h, (unsigned long) -delta); out: spin_unlock(&hugetlb_lock); @@ -1141,14 +1162,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) static void hugetlb_vm_op_close(struct vm_area_struct *vma) { + struct hstate *h = hstate_vma(vma); struct resv_map *reservations = vma_resv_map(vma); unsigned long reserve; unsigned long start; unsigned long end; if (reservations) { - start = vma_hugecache_offset(vma, vma->vm_start); - end = vma_hugecache_offset(vma, vma->vm_end); + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(&reservations->regions, start, end); @@ -1156,7 +1178,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); if (reserve) - hugetlb_acct_memory(-reserve); + hugetlb_acct_memory(h, -reserve); } } @@ -1214,14 +1236,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct page *ptepage; unsigned long addr; int cow; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr); + dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) goto nomem; @@ -1257,6 +1281,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, pte_t pte; struct page *page; struct page *tmp; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + /* * A page gathering list, protected by per file i_mmap_lock. The * lock is used to avoid list corruption from multiple unmapping @@ -1265,11 +1292,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, LIST_HEAD(page_list); WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON(start & ~HPAGE_MASK); - BUG_ON(end & ~HPAGE_MASK); + BUG_ON(start & ~huge_page_mask(h)); + BUG_ON(end & ~huge_page_mask(h)); spin_lock(&mm->page_table_lock); - for (address = start; address < end; address += HPAGE_SIZE) { + for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -1383,6 +1410,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) { + struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int avoidcopy; int outside_reserve = 0; @@ -1443,7 +1471,7 @@ retry_avoidcopy: __SetPageUptodate(new_page); spin_lock(&mm->page_table_lock); - ptep = huge_pte_offset(mm, address & HPAGE_MASK); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); @@ -1458,14 +1486,14 @@ retry_avoidcopy: } /* Return the pagecache page at a given address within a VMA */ -static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *hugetlbfs_pagecache_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping; pgoff_t idx; mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(vma, address); + idx = vma_hugecache_offset(h, vma, address); return find_lock_page(mapping, idx); } @@ -1473,6 +1501,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { + struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; pgoff_t idx; unsigned long size; @@ -1493,7 +1522,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, } mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(vma, address); + idx = vma_hugecache_offset(h, vma, address); /* * Use page lock to guard against racing truncation @@ -1502,7 +1531,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, retry: page = find_lock_page(mapping, idx); if (!page) { - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; page = alloc_huge_page(vma, address, 0); @@ -1510,7 +1539,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address); + clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_SHARED) { @@ -1526,14 +1555,14 @@ retry: } spin_lock(&inode->i_lock); - inode->i_blocks += BLOCKS_PER_HUGEPAGE; + inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); } else lock_page(page); } spin_lock(&mm->page_table_lock); - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -1569,8 +1598,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int ret; static DEFINE_MUTEX(hugetlb_instantiation_mutex); + struct hstate *h = hstate_vma(vma); - ptep = huge_pte_alloc(mm, address); + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; @@ -1594,7 +1624,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(pte_same(entry, huge_ptep_get(ptep)))) if (write_access && !pte_write(entry)) { struct page *page; - page = hugetlbfs_pagecache_page(vma, address); + page = hugetlbfs_pagecache_page(h, vma, address); ret = hugetlb_cow(mm, vma, address, ptep, entry, page); if (page) { unlock_page(page); @@ -1615,6 +1645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; + struct hstate *h = hstate_vma(vma); spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { @@ -1626,7 +1657,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * each hugepage. We have to make * sure we get the * first, for the page indexing below to work. */ - pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); if (!pte || huge_pte_none(huge_ptep_get(pte)) || (write && !pte_write(huge_ptep_get(pte)))) { @@ -1644,7 +1675,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { @@ -1660,7 +1691,7 @@ same_page: --remainder; ++i; if (vaddr < vma->vm_end && remainder && - pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + pfn_offset < pages_per_huge_page(h)) { /* * We use pfn_offset to avoid touching the pageframes * of this compound page. @@ -1682,13 +1713,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long start = address; pte_t *ptep; pte_t pte; + struct hstate *h = hstate_vma(vma); BUG_ON(address >= end); flush_cache_range(vma, address, end); spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); spin_lock(&mm->page_table_lock); - for (; address < end; address += HPAGE_SIZE) { + for (; address < end; address += huge_page_size(h)) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -1711,6 +1743,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma) { long ret, chg; + struct hstate *h = hstate_inode(inode); if (vma && vma->vm_flags & VM_NORESERVE) return 0; @@ -1739,7 +1772,7 @@ int hugetlb_reserve_pages(struct inode *inode, if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; - ret = hugetlb_acct_memory(chg); + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; @@ -1751,12 +1784,13 @@ int hugetlb_reserve_pages(struct inode *inode, void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { + struct hstate *h = hstate_inode(inode); long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; + inode->i_blocks -= blocks_per_huge_page(h); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); - hugetlb_acct_memory(-(chg - freed)); + hugetlb_acct_memory(h, -(chg - freed)); } -- cgit v1.2.2 From e5ff215941d59f8ae6bf58f6428dc5c26745a612 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:42 -0700 Subject: hugetlb: multiple hstates for multiple page sizes Add basic support for more than one hstate in hugetlbfs. This is the key to supporting multiple hugetlbfs page sizes at once. - Rather than a single hstate, we now have an array, with an iterator - default_hstate continues to be the struct hstate which we use by default - Add functions for architectures to register new hstates [akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 121 insertions(+), 27 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0d8153e25f09..82378d44a0c5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,12 +22,19 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -unsigned long max_huge_pages; -unsigned long sysctl_overcommit_huge_pages; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -struct hstate default_hstate; +static int max_hstate; +unsigned int default_hstate_idx; +struct hstate hstates[HUGE_MAX_HSTATE]; + +/* for command line parsing */ +static struct hstate * __initdata parsed_hstate; +static unsigned long __initdata default_hstate_max_huge_pages; + +#define for_each_hstate(h) \ + for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages @@ -454,13 +461,24 @@ static void update_and_free_page(struct hstate *h, struct page *page) __free_pages(page, huge_page_order(h)); } +struct hstate *size_to_hstate(unsigned long size) +{ + struct hstate *h; + + for_each_hstate(h) { + if (huge_page_size(h) == size) + return h; + } + return NULL; +} + static void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the * compound page destructor. */ - struct hstate *h = &default_hstate; + struct hstate *h = page_hstate(page); int nid = page_to_nid(page); struct address_space *mapping; @@ -887,39 +905,94 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } -static int __init hugetlb_init(void) +static void __init hugetlb_init_one_hstate(struct hstate *h) { unsigned long i; - struct hstate *h = &default_hstate; - - if (HPAGE_SHIFT == 0) - return 0; - - if (!h->order) { - h->order = HPAGE_SHIFT - PAGE_SHIFT; - h->mask = HPAGE_MASK; - } for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); h->hugetlb_next_nid = first_node(node_online_map); - for (i = 0; i < max_huge_pages; ++i) { + for (i = 0; i < h->max_huge_pages; ++i) { if (!alloc_fresh_huge_page(h)) break; } - max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; - printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n", - h->free_huge_pages); + h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; +} + +static void __init hugetlb_init_hstates(void) +{ + struct hstate *h; + + for_each_hstate(h) { + hugetlb_init_one_hstate(h); + } +} + +static void __init report_hugepages(void) +{ + struct hstate *h; + + for_each_hstate(h) { + printk(KERN_INFO "Total HugeTLB memory allocated, " + "%ld %dMB pages\n", + h->free_huge_pages, + 1 << (h->order + PAGE_SHIFT - 20)); + } +} + +static int __init hugetlb_init(void) +{ + BUILD_BUG_ON(HPAGE_SHIFT == 0); + + if (!size_to_hstate(HPAGE_SIZE)) { + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; + } + default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; + + hugetlb_init_hstates(); + + report_hugepages(); + return 0; } module_init(hugetlb_init); +/* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_add_hstate(unsigned order) +{ + struct hstate *h; + if (size_to_hstate(PAGE_SIZE << order)) { + printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + hugetlb_init_one_hstate(h); + parsed_hstate = h; +} + static int __init hugetlb_setup(char *s) { - if (sscanf(s, "%lu", &max_huge_pages) <= 0) - max_huge_pages = 0; + unsigned long *mhp; + + /* + * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * so this hugepages= parameter goes to the "default hstate". + */ + if (!max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + return 1; } __setup("hugepages=", hugetlb_setup); @@ -950,7 +1023,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count) if (PageHighMem(page)) continue; list_del(&page->lru); - update_and_free_page(page); + update_and_free_page(h, page); h->free_huge_pages--; h->free_huge_pages_node[page_to_nid(page)]--; } @@ -963,10 +1036,9 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) #endif #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(unsigned long count) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { unsigned long min_count, ret; - struct hstate *h = &default_hstate; /* * Increase the pool size @@ -1037,8 +1109,19 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { + struct hstate *h = &default_hstate; + unsigned long tmp; + + if (!write) + tmp = h->max_huge_pages; + + table->data = &tmp; + table->maxlen = sizeof(unsigned long); proc_doulongvec_minmax(table, write, file, buffer, length, ppos); - max_huge_pages = set_max_huge_pages(max_huge_pages); + + if (write) + h->max_huge_pages = set_max_huge_pages(h, tmp); + return 0; } @@ -1059,10 +1142,21 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; + unsigned long tmp; + + if (!write) + tmp = h->nr_overcommit_huge_pages; + + table->data = &tmp; + table->maxlen = sizeof(unsigned long); proc_doulongvec_minmax(table, write, file, buffer, length, ppos); - spin_lock(&hugetlb_lock); - h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; - spin_unlock(&hugetlb_lock); + + if (write) { + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = tmp; + spin_unlock(&hugetlb_lock); + } + return 0; } -- cgit v1.2.2 From a137e1cc6d6e7d315fef03962a2a5a113348b13b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:43 -0700 Subject: hugetlbfs: per mount huge page sizes Add the ability to configure the hugetlb hstate used on a per mount basis. - Add a new pagesize= option to the hugetlbfs mount that allows setting the page size - This option causes the mount code to find the hstate corresponding to the specified size, and sets up a pointer to the hstate in the mount's superblock. - Change the hstate accessors to use this information rather than the global_hstate they were using (requires a slight change in mm/memory.c so we don't NULL deref in the error-unmap path -- see comments). [np: take hstate out of hugetlbfs inode and vma->vm_private_data] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 82378d44a0c5..4cf7a90e9140 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1439,19 +1439,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - /* - * It is undesirable to test vma->vm_file as it should be non-null - * for valid hugetlb area. However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file before calling this function - * to clean up. Since no pte has actually been setup, it is safe to - * do nothing in this case. - */ - if (vma->vm_file) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); - __unmap_hugepage_range(vma, start, end, ref_page); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); - } + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + __unmap_hugepage_range(vma, start, end, ref_page); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); } /* -- cgit v1.2.2 From a3437870160cf2caaac6bdd76c7377a5a4145a8c Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Wed, 23 Jul 2008 21:27:44 -0700 Subject: hugetlb: new sysfs interface Provide new hugepages user APIs that are more suited to multiple hstates in sysfs. There is a new directory, /sys/kernel/hugepages. Underneath that directory there will be a directory per-supported hugepage size, e.g.: /sys/kernel/hugepages/hugepages-64kB /sys/kernel/hugepages/hugepages-16384kB /sys/kernel/hugepages/hugepages-16777216kB corresponding to 64k, 16m and 16g respectively. Within each hugepages-size directory there are a number of files, corresponding to the tracked counters in the hstate, e.g.: /sys/kernel/hugepages/hugepages-64/nr_hugepages /sys/kernel/hugepages/hugepages-64/nr_overcommit_hugepages /sys/kernel/hugepages/hugepages-64/free_hugepages /sys/kernel/hugepages/hugepages-64/resv_hugepages /sys/kernel/hugepages/hugepages-64/surplus_hugepages Of these files, the first two are read-write and the latter three are read-only. The size of the hugepage being manipulated is trivially deducible from the enclosing directory and is always expressed in kB (to match meminfo). [dave@linux.vnet.ibm.com: fix build] [nacc@us.ibm.com: hugetlb: hang off of /sys/kernel/mm rather than /sys/kernel] [nacc@us.ibm.com: hugetlb: remove CONFIG_SYSFS dependency] Acked-by: Greg Kroah-Hartman Signed-off-by: Nishanth Aravamudan Signed-off-by: Nick Piggin Cc: Dave Hansen Signed-off-by: Nishanth Aravamudan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 288 +++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 222 insertions(+), 66 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4cf7a90e9140..bb49ce5d0067 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -942,72 +943,6 @@ static void __init report_hugepages(void) } } -static int __init hugetlb_init(void) -{ - BUILD_BUG_ON(HPAGE_SHIFT == 0); - - if (!size_to_hstate(HPAGE_SIZE)) { - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; - } - default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; - - hugetlb_init_hstates(); - - report_hugepages(); - - return 0; -} -module_init(hugetlb_init); - -/* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_add_hstate(unsigned order) -{ - struct hstate *h; - if (size_to_hstate(PAGE_SIZE << order)) { - printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); - return; - } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order == 0); - h = &hstates[max_hstate++]; - h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - hugetlb_init_one_hstate(h); - parsed_hstate = h; -} - -static int __init hugetlb_setup(char *s) -{ - unsigned long *mhp; - - /* - * !max_hstate means we haven't parsed a hugepagesz= parameter yet, - * so this hugepages= parameter goes to the "default hstate". - */ - if (!max_hstate) - mhp = &default_hstate_max_huge_pages; - else - mhp = &parsed_hstate->max_huge_pages; - - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; - - return 1; -} -__setup("hugepages=", hugetlb_setup); - -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) @@ -1105,6 +1040,227 @@ out: return ret; } +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_hstate(struct kobject *kobj) +{ + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) + return &hstates[i]; + BUG(); + return NULL; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_huge_pages); +} +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + h->max_huge_pages = set_max_huge_pages(h, input); + + return count; +} +HSTATE_ATTR(nr_hugepages); + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int retval; + + hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, + hugepages_kobj); + if (!hstate_kobjs[h - hstates]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[h - hstates], + &hstate_attr_group); + if (retval) + kobject_put(hstate_kobjs[h - hstates]); + + return retval; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", + h->name); + } +} + +static void __exit hugetlb_exit(void) +{ + struct hstate *h; + + for_each_hstate(h) { + kobject_put(hstate_kobjs[h - hstates]); + } + + kobject_put(hugepages_kobj); +} +module_exit(hugetlb_exit); + +static int __init hugetlb_init(void) +{ + BUILD_BUG_ON(HPAGE_SHIFT == 0); + + if (!size_to_hstate(HPAGE_SIZE)) { + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; + } + default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; + + hugetlb_init_hstates(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_add_hstate(unsigned order) +{ + struct hstate *h; + if (size_to_hstate(PAGE_SIZE << order)) { + printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", + huge_page_size(h)/1024); + hugetlb_init_one_hstate(h); + parsed_hstate = h; +} + +static int __init hugetlb_setup(char *s) +{ + unsigned long *mhp; + + /* + * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * so this hugepages= parameter goes to the "default hstate". + */ + if (!max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + return 1; +} +__setup("hugepages=", hugetlb_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.2.2 From 5ced66c901f1cf0b684feb15c2cd8b126e263d07 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:45 -0700 Subject: hugetlb: abstract numa round robin selection Need this as a separate function for a future patch. No behaviour change. Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb49ce5d0067..5e620e25cf08 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,6 +565,27 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return page; } +/* + * Use a helper variable to find the next node and then + * copy it back to hugetlb_next_nid afterwards: + * otherwise there's a window in which a racer might + * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really + * doesn't matter if occasionally a racer chooses the + * same nid as we do. Move nid forward in the mask even + * if we just successfully allocated a hugepage so that + * the next caller gets hugepages on the next node. + */ +static int hstate_next_node(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->hugetlb_next_nid, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->hugetlb_next_nid = next_nid; + return next_nid; +} + static int alloc_fresh_huge_page(struct hstate *h) { struct page *page; @@ -578,21 +599,7 @@ static int alloc_fresh_huge_page(struct hstate *h) page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); if (page) ret = 1; - /* - * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. - */ - next_nid = next_node(h->hugetlb_next_nid, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->hugetlb_next_nid = next_nid; + next_nid = hstate_next_node(h); } while (!page && h->hugetlb_next_nid != start_nid); if (ret) -- cgit v1.2.2 From aa888a74977a8f2120ae9332376e179c39a6b07d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:47 -0700 Subject: hugetlb: support larger than MAX_ORDER This is needed on x86-64 to handle GB pages in hugetlbfs, because it is not practical to enlarge MAX_ORDER to 1GB. Instead the 1GB pages are only allocated at boot using the bootmem allocator using the hugepages=... option. These 1G bootmem pages are never freed. In theory it would be possible to implement that with some complications, but since it would be a one-way street (>= MAX_ORDER pages cannot be allocated later) I decided not to currently. The >= MAX_ORDER code is not ifdef'ed per architecture. It is not very big and the ifdef uglyness seemed not be worth it. Known problems: /proc/meminfo and "free" do not display the memory allocated for gb pages in "Total". This is a little confusing for the user. Acked-by: Andrew Hastings Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5e620e25cf08..1a6fe87555b2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -489,7 +490,7 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (h->surplus_huge_pages_node[nid]) { + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { update_and_free_page(h, page); h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; @@ -550,6 +551,9 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; + if (h->order >= MAX_ORDER) + return NULL; + page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, @@ -616,6 +620,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, struct page *page; unsigned int nid; + if (h->order >= MAX_ORDER) + return NULL; + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed @@ -792,6 +799,10 @@ static void return_unused_surplus_pages(struct hstate *h, /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; + /* Cannot return gigantic pages currently */ + if (h->order >= MAX_ORDER) + return; + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { @@ -913,6 +924,63 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } +static __initdata LIST_HEAD(huge_boot_pages); + +struct huge_bootmem_page { + struct list_head list; + struct hstate *hstate; +}; + +static int __init alloc_bootmem_huge_page(struct hstate *h) +{ + struct huge_bootmem_page *m; + int nr_nodes = nodes_weight(node_online_map); + + while (nr_nodes) { + void *addr; + + addr = __alloc_bootmem_node_nopanic( + NODE_DATA(h->hugetlb_next_nid), + huge_page_size(h), huge_page_size(h), 0); + + if (addr) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + */ + m = addr; + if (m) + goto found; + } + hstate_next_node(h); + nr_nodes--; + } + return 0; + +found: + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); + /* Put them into a private list first because mem_map is not up yet */ + list_add(&m->list, &huge_boot_pages); + m->hstate = h; + return 1; +} + +/* Put bootmem huge pages into the standard lists after mem_map is up */ +static void __init gather_bootmem_prealloc(void) +{ + struct huge_bootmem_page *m; + + list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); + struct hstate *h = m->hstate; + __ClearPageReserved(page); + WARN_ON(page_count(page) != 1); + prep_compound_page(page, h->order); + prep_new_huge_page(h, page, page_to_nid(page)); + } +} + static void __init hugetlb_init_one_hstate(struct hstate *h) { unsigned long i; @@ -923,7 +991,10 @@ static void __init hugetlb_init_one_hstate(struct hstate *h) h->hugetlb_next_nid = first_node(node_online_map); for (i = 0; i < h->max_huge_pages; ++i) { - if (!alloc_fresh_huge_page(h)) + if (h->order >= MAX_ORDER) { + if (!alloc_bootmem_huge_page(h)) + break; + } else if (!alloc_fresh_huge_page(h)) break; } h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; @@ -956,6 +1027,9 @@ static void try_to_free_low(struct hstate *h, unsigned long count) { int i; + if (h->order >= MAX_ORDER) + return; + for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; @@ -982,6 +1056,9 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { unsigned long min_count, ret; + if (h->order >= MAX_ORDER) + return h->max_huge_pages; + /* * Increase the pool size * First take pages out of surplus state. Then make up the @@ -1210,6 +1287,8 @@ static int __init hugetlb_init(void) hugetlb_init_hstates(); + gather_bootmem_prealloc(); + report_hugepages(); hugetlb_sysfs_init(); -- cgit v1.2.2 From 8faa8b077b2cdc4e4646842fe50b07840955a013 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:48 -0700 Subject: hugetlb: support boot allocate different sizes Make some infrastructure changes to allow boot-time allocation of different hugepage page sizes. - move all basic hstate initialisation into hugetlb_add_hstate - create a new function hugetlb_hstate_alloc_pages() to do the actual initial page allocations. Call this function early in order to allocate giant pages from bootmem. - Check for multiple hugepages= parameters Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Acked-by: Andrew Hastings Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1a6fe87555b2..243a8684d180 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -981,15 +981,10 @@ static void __init gather_bootmem_prealloc(void) } } -static void __init hugetlb_init_one_hstate(struct hstate *h) +static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; - for (i = 0; i < MAX_NUMNODES; ++i) - INIT_LIST_HEAD(&h->hugepage_freelists[i]); - - h->hugetlb_next_nid = first_node(node_online_map); - for (i = 0; i < h->max_huge_pages; ++i) { if (h->order >= MAX_ORDER) { if (!alloc_bootmem_huge_page(h)) @@ -997,7 +992,7 @@ static void __init hugetlb_init_one_hstate(struct hstate *h) } else if (!alloc_fresh_huge_page(h)) break; } - h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; + h->max_huge_pages = i; } static void __init hugetlb_init_hstates(void) @@ -1005,7 +1000,9 @@ static void __init hugetlb_init_hstates(void) struct hstate *h; for_each_hstate(h) { - hugetlb_init_one_hstate(h); + /* oversize hugepages were init'ed in early boot */ + if (h->order < MAX_ORDER) + hugetlb_hstate_alloc_pages(h); } } @@ -1301,6 +1298,8 @@ module_init(hugetlb_init); void __init hugetlb_add_hstate(unsigned order) { struct hstate *h; + unsigned long i; + if (size_to_hstate(PAGE_SIZE << order)) { printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); return; @@ -1310,15 +1309,21 @@ void __init hugetlb_add_hstate(unsigned order) h = &hstates[max_hstate++]; h->order = order; h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + h->nr_huge_pages = 0; + h->free_huge_pages = 0; + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&h->hugepage_freelists[i]); + h->hugetlb_next_nid = first_node(node_online_map); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - hugetlb_init_one_hstate(h); + parsed_hstate = h; } static int __init hugetlb_setup(char *s) { unsigned long *mhp; + static unsigned long *last_mhp; /* * !max_hstate means we haven't parsed a hugepagesz= parameter yet, @@ -1329,9 +1334,25 @@ static int __init hugetlb_setup(char *s) else mhp = &parsed_hstate->max_huge_pages; + if (mhp == last_mhp) { + printk(KERN_WARNING "hugepages= specified twice without " + "interleaving hugepagesz=, ignoring\n"); + return 1; + } + if (sscanf(s, "%lu", mhp) <= 0) *mhp = 0; + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. + */ + if (max_hstate && parsed_hstate->order >= MAX_ORDER) + hugetlb_hstate_alloc_pages(parsed_hstate); + + last_mhp = mhp; + return 1; } __setup("hugepages=", hugetlb_setup); -- cgit v1.2.2 From 4abd32dbab201c3ced0b0af12accea77cd9eeffc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:49 -0700 Subject: hugetlb: printk cleanup - Reword sentence to clarify meaning with multiple options - Add support for using GB prefixes for the page size - Add extra printk to delayed > MAX_ORDER allocation code Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 243a8684d180..0c74c14dd2f7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1006,15 +1006,27 @@ static void __init hugetlb_init_hstates(void) } } +static char * __init memfmt(char *buf, unsigned long n) +{ + if (n >= (1UL << 30)) + sprintf(buf, "%lu GB", n >> 30); + else if (n >= (1UL << 20)) + sprintf(buf, "%lu MB", n >> 20); + else + sprintf(buf, "%lu KB", n >> 10); + return buf; +} + static void __init report_hugepages(void) { struct hstate *h; for_each_hstate(h) { - printk(KERN_INFO "Total HugeTLB memory allocated, " - "%ld %dMB pages\n", - h->free_huge_pages, - 1 << (h->order + PAGE_SHIFT - 20)); + char buf[32]; + printk(KERN_INFO "HugeTLB registered %s page size, " + "pre-allocated %ld pages\n", + memfmt(buf, huge_page_size(h)), + h->free_huge_pages); } } -- cgit v1.2.2 From ceb868796181dc95ea01a110e123afd391639873 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:50 -0700 Subject: hugetlb: introduce pud_huge Straight forward extensions for huge pages located in the PUD instead of PMDs. Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0c74c14dd2f7..107c1ce223cb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1996,6 +1996,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +/* Can be overriden by architectures */ +__attribute__((weak)) struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, -- cgit v1.2.2 From e11bfbfcb08ef4223b863799897c19cdf7c5bc00 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 23 Jul 2008 21:27:52 -0700 Subject: hugetlb: override default huge page size Allow configurations with the default huge page size which is different to the traditional HPAGE_SIZE size. The default huge page size is the one represented in the legacy /proc ABIs, SHM, and which is defaulted to when mounting hugetlbfs filesystems. This is implemented with a new kernel option default_hugepagesz=, which defaults to HPAGE_SIZE if not specified. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 107c1ce223cb..2a2f6e869401 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,6 +34,7 @@ struct hstate hstates[HUGE_MAX_HSTATE]; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; +static unsigned long __initdata default_hstate_size; #define for_each_hstate(h) \ for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) @@ -1288,11 +1289,14 @@ static int __init hugetlb_init(void) { BUILD_BUG_ON(HPAGE_SHIFT == 0); - if (!size_to_hstate(HPAGE_SIZE)) { - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; + if (!size_to_hstate(default_hstate_size)) { + default_hstate_size = HPAGE_SIZE; + if (!size_to_hstate(default_hstate_size)) + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); } - default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; + default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; + if (default_hstate_max_huge_pages) + default_hstate.max_huge_pages = default_hstate_max_huge_pages; hugetlb_init_hstates(); @@ -1332,7 +1336,7 @@ void __init hugetlb_add_hstate(unsigned order) parsed_hstate = h; } -static int __init hugetlb_setup(char *s) +static int __init hugetlb_nrpages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; @@ -1367,7 +1371,14 @@ static int __init hugetlb_setup(char *s) return 1; } -__setup("hugepages=", hugetlb_setup); +__setup("hugepages=", hugetlb_nrpages_setup); + +static int __init hugetlb_default_setup(char *s) +{ + default_hstate_size = memparse(s, &s); + return 1; +} +__setup("default_hugepagesz=", hugetlb_default_setup); static unsigned int cpuset_mems_nr(unsigned int *array) { -- cgit v1.2.2 From 53ba51d21d6e048424ab8aadfebdb1f25ae07b60 Mon Sep 17 00:00:00 2001 From: Jon Tollefson Date: Wed, 23 Jul 2008 21:27:52 -0700 Subject: hugetlb: allow arch overridden hugepage allocation Allow alloc_bootmem_huge_page() to be overridden by architectures that can't always use bootmem. This requires huge_boot_pages to be available for use by this function. This is required for powerpc 16G pages, which have to be reserved prior to boot-time. The location of these pages are indicated in the device tree. Acked-by: Adam Litke Signed-off-by: Jon Tollefson Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2a2f6e869401..3e1506b808a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -31,6 +31,8 @@ static int max_hstate; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +__initdata LIST_HEAD(huge_boot_pages); + /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; @@ -925,14 +927,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } -static __initdata LIST_HEAD(huge_boot_pages); - -struct huge_bootmem_page { - struct list_head list; - struct hstate *hstate; -}; - -static int __init alloc_bootmem_huge_page(struct hstate *h) +__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; int nr_nodes = nodes_weight(node_online_map); -- cgit v1.2.2 From 7f09ca51e925ba62e9ebfd4979f093e97e38adeb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 23 Jul 2008 21:27:58 -0700 Subject: hugetlb: fix a hugepage reservation check for MAP_SHARED When removing a huge page from the hugepage pool for a fault the system checks to see if the mapping requires additional pages to be reserved, and if it does whether there are any unreserved pages remaining. If not, the allocation fails without even attempting to get a page. In order to determine whether to apply this check we call vma_has_private_reserves() which tells us if this vma is MAP_PRIVATE and is the owner. This incorrectly triggers the remaining reservation test for MAP_SHARED mappings which prevents allocation of the final page in the pool even though it is reserved for this mapping. In reality we only want to check this for MAP_PRIVATE mappings where the process is not the original mapper. Replace vma_has_private_reserves() with vma_has_reserves() which indicates whether further reserves are required, and update the caller. Signed-off-by: Mel Gorman Acked-by: Adam Litke Acked-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e1506b808a3..8c20aed62b9c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -342,13 +342,13 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_private_reserves(struct vm_area_struct *vma) +static int vma_has_reserves(struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) - return 0; - if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 0; - return 1; + return 1; + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return 1; + return 0; } static void clear_huge_page(struct page *page, @@ -420,7 +420,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". The child may still get SIGKILLed */ - if (!vma_has_private_reserves(vma) && + if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) return NULL; -- cgit v1.2.2 From 7251ff78b94c2a68d267623d09b32672b20662c1 Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Wed, 23 Jul 2008 21:27:59 -0700 Subject: hugetlb: quota is not freed for unused reserved private huge pages With shared reservations (and now also with private reservations), we reserve huge pages at mmap time. We also account for the mapping against fs quota to prevent a reservation from being preempted by quota exhaustion. When testing with the libhugetlbfs test suite, I found a problem with quota accounting. FS quota for allocated pages is handled correctly but we are not releasing quota for private pages that were reserved but never allocated. Do this in hugetlb_vm_op_close() at the same time as unused page reservations are released. Signed-off-by: Adam Litke Cc: Mel Gorman Cc: Johannes Weiner Cc: William Lee Irwin III Cc: Hugh Dickins Acked-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8c20aed62b9c..41341c414194 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1552,8 +1552,10 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); - if (reserve) + if (reserve) { hugetlb_acct_memory(h, -reserve); + hugetlb_put_quota(vma->vm_file->f_mapping, reserve); + } } } -- cgit v1.2.2 From e44d1b2998d62a1f2f4d7eb17b56ba396535509f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jul 2008 12:57:41 +0200 Subject: mm/hugetlb.c: fix build failure with !CONFIG_SYSCTL on !CONFIG_SYSCTL on x86 with latest -git i get: mm/hugetlb.c: In function 'decrement_hugepage_resv_vma': mm/hugetlb.c:83: error: 'reserve' undeclared (first use in this function) mm/hugetlb.c:83: error: (Each undeclared identifier is reported only once mm/hugetlb.c:83: error: for each function it appears in.) Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41341c414194..a8bf4ab01f86 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1026,6 +1026,17 @@ static void __init report_hugepages(void) } } +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) @@ -1375,17 +1386,6 @@ static int __init hugetlb_default_setup(char *s) } __setup("default_hugepagesz=", hugetlb_default_setup); -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.2.2 From 8a21346058ad946134b6ddfeb5de975c3cfcf5da Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Fri, 25 Jul 2008 19:44:37 -0700 Subject: hugetlb: fix CONFIG_SYSCTL=n build Fixes a build failure reported by Alan Cox: mm/hugetlb.c: In function `hugetlb_acct_memory': mm/hugetlb.c:1507: error: implicit declaration of function `cpuset_mems_nr' Also reverts Ingo's commit e44d1b2998d62a1f2f4d7eb17b56ba396535509f Author: Ingo Molnar Date: Fri Jul 25 12:57:41 2008 +0200 mm/hugetlb.c: fix build failure with !CONFIG_SYSCTL which fixed the build error but added some unused-static-function warnings. Signed-off-by: Nishanth Aravamudan Cc: Alan Cox Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8bf4ab01f86..3be79dc18c5c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1026,18 +1026,6 @@ static void __init report_hugepages(void) } } -static unsigned int cpuset_mems_nr(unsigned int *array) -{ - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; - - return nr; -} - -#ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) { @@ -1386,6 +1374,18 @@ static int __init hugetlb_default_setup(char *s) } __setup("default_hugepagesz=", hugetlb_default_setup); +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + +#ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.2.2 From cddb8a5c14aa89810b40495d94d3d2a0faee6619 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:29 -0700 Subject: mmu-notifiers: core With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages. There are secondary MMUs (with secondary sptes and secondary tlbs) too. sptes in the kvm case are shadow pagetables, but when I say spte in mmu-notifier context, I mean "secondary pte". In GRU case there's no actual secondary pte and there's only a secondary tlb because the GRU secondary MMU has no knowledge about sptes and every secondary tlb miss event in the MMU always generates a page fault that has to be resolved by the CPU (this is not the case of KVM where the a secondary tlb miss will walk sptes in hardware and it will refill the secondary tlb transparently to software if the corresponding spte is present). The same way zap_page_range has to invalidate the pte before freeing the page, the spte (and secondary tlb) must also be invalidated before any page is freed and reused. Currently we take a page_count pin on every page mapped by sptes, but that means the pages can't be swapped whenever they're mapped by any spte because they're part of the guest working set. Furthermore a spte unmap event can immediately lead to a page to be freed when the pin is released (so requiring the same complex and relatively slow tlb_gather smp safe logic we have in zap_page_range and that can be avoided completely if the spte unmap event doesn't require an unpin of the page previously mapped in the secondary MMU). The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know when the VM is swapping or freeing or doing anything on the primary MMU so that the secondary MMU code can drop sptes before the pages are freed, avoiding all page pinning and allowing 100% reliable swapping of guest physical address space. Furthermore it avoids the code that teardown the mappings of the secondary MMU, to implement a logic like tlb_gather in zap_page_range that would require many IPI to flush other cpu tlbs, for each fixed number of spte unmapped. To make an example: if what happens on the primary MMU is a protection downgrade (from writeable to wrprotect) the secondary MMU mappings will be invalidated, and the next secondary-mmu-page-fault will call get_user_pages and trigger a do_wp_page through get_user_pages if it called get_user_pages with write=1, and it'll re-establishing an updated spte or secondary-tlb-mapping on the copied page. Or it will setup a readonly spte or readonly tlb mapping if it's a guest-read, if it calls get_user_pages with write=0. This is just an example. This allows to map any page pointed by any pte (and in turn visible in the primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an full MMU with both sptes and secondary-tlb like the shadow-pagetable layer with kvm), or a remote DMA in software like XPMEM (hence needing of schedule in XPMEM code to send the invalidate to the remote node, while no need to schedule in kvm/gru as it's an immediate event like invalidating primary-mmu pte). At least for KVM without this patch it's impossible to swap guests reliably. And having this feature and removing the page pin allows several other optimizations that simplify life considerably. Dependencies: 1) mm_take_all_locks() to register the mmu notifier when the whole VM isn't doing anything with "mm". This allows mmu notifier users to keep track if the VM is in the middle of the invalidate_range_begin/end critical section with an atomic counter incraese in range_begin and decreased in range_end. No secondary MMU page fault is allowed to map any spte or secondary tlb reference, while the VM is in the middle of range_begin/end as any page returned by get_user_pages in that critical section could later immediately be freed without any further ->invalidate_page notification (invalidate_range_begin/end works on ranges and ->invalidate_page isn't called immediately before freeing the page). To stop all page freeing and pagetable overwrites the mmap_sem must be taken in write mode and all other anon_vma/i_mmap locks must be taken too. 2) It'd be a waste to add branches in the VM if nobody could possibly run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of mmu notifiers, but this already allows to compile a KVM external module against a kernel with mmu notifiers enabled and from the next pull from kvm.git we'll start using them. And GRU/XPMEM will also be able to continue the development by enabling KVM=m in their config, until they submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n). This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM are all =n. The mmu_notifier_register call can fail because mm_take_all_locks may be interrupted by a signal and return -EINTR. Because mmu_notifier_reigster is used when a driver startup, a failure can be gracefully handled. Here an example of the change applied to kvm to register the mmu notifiers. Usually when a driver startups other allocations are required anyway and -ENOMEM failure paths exists already. struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + int err; if (!kvm) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops; + err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm); + if (err) { + kfree(kvm); + return ERR_PTR(err); + } + return kvm; } mmu_notifier_unregister returns void and it's reliable. The patch also adds a few needed but missing includes that would prevent kernel to compile after these changes on non-x86 archs (x86 didn't need them by luck). [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix mm/filemap_xip.c build] [akpm@linux-foundation.org: fix mm/mmu_notifier.c build] Signed-off-by: Andrea Arcangeli Signed-off-by: Nick Piggin Signed-off-by: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3be79dc18c5c..80eb0d31d0d3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -1672,6 +1673,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + mmu_notifier_invalidate_range_start(mm, start, end); spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); @@ -1713,6 +1715,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { list_del(&page->lru); put_page(page); -- cgit v1.2.2 From 78a34ae29bf1c9df62a5bd0f0798b6c62a54d520 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 28 Jul 2008 15:46:30 -0700 Subject: mm/hugetlb.c must #include This patch fixes the following build error on sh caused by commit aa888a74977a8f2120ae9332376e179c39a6b07d ("hugetlb: support larger than MAX_ORDER"): mm/hugetlb.c: In function 'alloc_bootmem_huge_page': mm/hugetlb.c:958: error: implicit declaration of function 'virt_to_phys' Signed-off-by: Adrian Bunk Cc: Hirokazu Takata Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 80eb0d31d0d3..254ce2b90158 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -20,6 +20,7 @@ #include #include +#include #include #include "internal.h" -- cgit v1.2.2 From 7cb93181629c613ee2b8f4ffe3446f8003074842 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 30 Jul 2008 02:18:26 +0900 Subject: mm/hugetlb.c must #include This patch fixes the following build error on sh caused by commit aa888a74977a8f2120ae9332376e179c39a6b07d (hugetlb: support larger than MAX_ORDER): <-- snip --> ... CC mm/hugetlb.o /home/bunk/linux/kernel-2.6/git/linux-2.6/mm/hugetlb.c: In function 'alloc_bootmem_huge_page': /home/bunk/linux/kernel-2.6/git/linux-2.6/mm/hugetlb.c:958: error: implicit declaration of function 'virt_to_phys' make[2]: *** [mm/hugetlb.o] Error 1 <-- snip --> Reported-by: Adrian Bunk Signed-off-by: Adrian Bunk Signed-off-by: Paul Mundt --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3be79dc18c5c..b3c78640b629 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -16,7 +16,7 @@ #include #include #include - +#include #include #include -- cgit v1.2.2 From 0ef89d25d3e390dfa7c46772907951744a4067dc Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 31 Jul 2008 00:07:30 -0700 Subject: mm/hugetlb: don't crash when HPAGE_SHIFT is 0 Some platform decide whether they support huge pages at boot time. On these, such as powerpc, HPAGE_SHIFT is a variable, not a constant, and is set to 0 when there is no such support. The patches to introduce multiple huge pages support broke that causing the kernel to crash at boot time on machines such as POWER3 which lack support for multiple page sizes. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d237a02eb228..28a2980ee435 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1283,7 +1283,12 @@ module_exit(hugetlb_exit); static int __init hugetlb_init(void) { - BUILD_BUG_ON(HPAGE_SHIFT == 0); + /* Some platform decide whether they support huge pages at boot + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when + * there is no such support + */ + if (HPAGE_SHIFT == 0) + return 0; if (!size_to_hstate(default_hstate_size)) { default_hstate_size = HPAGE_SIZE; -- cgit v1.2.2 From d6606683a5e3dac35cb979c7195f54ed827567bd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 6 Aug 2008 12:04:54 -0700 Subject: Revert duplicate "mm/hugetlb.c must #include " This reverts commit 7cb93181629c613ee2b8f4ffe3446f8003074842, since we did that patch twice, and the problem was already fixed earlier by 78a34ae29bf1c9df62a5bd0f0798b6c62a54d520. Reported-by: Andi Kleen Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 28a2980ee435..757ca983fd99 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -17,7 +17,7 @@ #include #include #include -#include + #include #include #include -- cgit v1.2.2 From caff3a2c333e11a794308bd9a875a09b94fee24a Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 12 Aug 2008 15:08:38 -0700 Subject: hugetlb: call arch_prepare_hugepage() for surplus pages The s390 software large page emulation implements shared page tables by using page->index of the first tail page from a compound large page to store page table information. This is set up in arch_prepare_hugepage(), which is called from alloc_fresh_huge_page_node(). A similar call to arch_prepare_hugepage() is missing for surplus large pages that are allocated in alloc_buddy_huge_page(), which breaks the software emulation mode for (surplus) large pages on s390. This patch adds the missing call to arch_prepare_hugepage(). It will have no effect on other architectures where arch_prepare_hugepage() is a nop. Also, use the correct order in the error path in alloc_fresh_huge_page_node(). Acked-by: Martin Schwidefsky Signed-off-by: Gerald Schaefer Acked-by: Nick Piggin Acked-by: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 757ca983fd99..92155db888b9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); return NULL; } prep_new_huge_page(h, page, nid); @@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + spin_lock(&hugetlb_lock); if (page) { /* -- cgit v1.2.2 From 57303d80175e10056bf51206f9961d586f02f967 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 12 Aug 2008 15:08:47 -0700 Subject: hugetlbfs: allocate structures for reservation tracking outside of spinlocks In the normal case, hugetlbfs reserves hugepages at map time so that the pages exist for future faults. A struct file_region is used to track when reservations have been consumed and where. These file_regions are allocated as necessary with kmalloc() which can sleep with the mm->page_table_lock held. This is wrong and triggers may-sleep warning when PREEMPT is enabled. Updates to the underlying file_region are done in two phases. The first phase prepares the region for the change, allocating any necessary memory, without actually making the change. The second phase actually commits the change. This patch makes use of this by checking the reservations before the page_table_lock is taken; triggering any necessary allocations. This may then be safely repeated within the locks without any allocations being required. Credit to Mel Gorman for diagnosing this failure and initial versions of the patch. Signed-off-by: Andy Whitcroft Tested-by: Gerald Schaefer Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92155db888b9..4c97c174e2e1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1942,6 +1942,15 @@ retry: lock_page(page); } + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) + vma_needs_reservation(h, vma, address); + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -1978,6 +1987,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -2000,19 +2010,35 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. + */ + if (write_access && !pte_write(entry)) { + vma_needs_reservation(h, vma, address); + + if (!(vma->vm_flags & VM_SHARED)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) { - struct page *page; - page = hugetlbfs_pagecache_page(h, vma, address); - ret = hugetlb_cow(mm, vma, address, ptep, entry, page); - if (page) { - unlock_page(page); - put_page(page); - } - } + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page); spin_unlock(&mm->page_table_lock); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + mutex_unlock(&hugetlb_instantiation_mutex); return ret; -- cgit v1.2.2 From 2b26736c88db85c038e04c2306d0745553e69602 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 12 Aug 2008 15:08:49 -0700 Subject: allocate structures for reservation tracking in hugetlbfs outside of spinlocks v2 [Andrew this should replace the previous version which did not check the returns from the region prepare for errors. This has been tested by us and Gerald and it looks good. Bah, while reviewing the locking based on your previous email I spotted that we need to check the return from the vma_needs_reservation call for allocation errors. Here is an updated patch to correct this. This passes testing here.] Signed-off-by: Andy Whitcroft Tested-by: Gerald Schaefer Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c97c174e2e1..67a71191136e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1949,7 +1949,10 @@ retry: * the spinlock. */ if (write_access && !(vma->vm_flags & VM_SHARED)) - vma_needs_reservation(h, vma, address); + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); @@ -1976,6 +1979,7 @@ out: backout: spin_unlock(&mm->page_table_lock); +backout_unlocked: unlock_page(page); put_page(page); goto out; @@ -2004,8 +2008,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - mutex_unlock(&hugetlb_instantiation_mutex); - return ret; + goto out_unlock; } ret = 0; @@ -2019,7 +2022,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * consumed. */ if (write_access && !pte_write(entry)) { - vma_needs_reservation(h, vma, address); + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_unlock; + } if (!(vma->vm_flags & VM_SHARED)) pagecache_page = hugetlbfs_pagecache_page(h, @@ -2039,6 +2045,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(pagecache_page); } +out_unlock: mutex_unlock(&hugetlb_instantiation_mutex); return ret; -- cgit v1.2.2