author     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>    2016-01-15 19:54:17 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>          2016-01-15 20:56:32 -0500
commit     9a982250f773cc8c76f1eee68a770b7cbf2faf78 (patch)
tree       de5a99423acf031b98510369d4dc2cf4b6e496ac /mm/huge_memory.c
parent     248db92da13f25073e7ebbd5fb95615aafd771d1 (diff)
thp: introduce deferred_split_huge_page()
Currently we don't split a huge page on partial unmap. This is not ideal and can lead to memory overhead. Fortunately, we can detect partial unmap in page_remove_rmap(), but we cannot call split_huge_page() from there due to the locking context. It is also counterproductive to split directly from the munmap() codepath: in many cases we get there from exit(2), and splitting the huge page just to free it up in small pages is not what we really want.

This patch introduces deferred_split_huge_page(), which puts the huge page onto a queue for splitting. The splitting itself happens when we get memory pressure, via the shrinker interface. The page is dropped from the list on freeing, through the compound page destructor.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Jerome Marchand <jmarchan@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
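The mm/rmap.c side of the change is outside the diffstat below, which is limited to mm/huge_memory.c. Conceptually, the hook in page_remove_rmap() is small: on partial unmap of an anonymous THP it only queues the compound head and lets the shrinker do the splitting later. A minimal sketch of that idea follows; the helper name is hypothetical and this is not the committed mm/rmap.c hunk:

/*
 * Sketch only, not the committed mm/rmap.c hunk: splitting here is not
 * possible because of the locking context, so the partial-unmap path
 * just queues the head page and leaves the work to deferred_split_scan().
 */
static void queue_partially_unmapped_thp(struct page *page)
{
        struct page *head = compound_head(page);

        /* Still-mapped anonymous THP that just lost one mapping */
        if (PageTransCompound(page) && PageAnon(head) && page_mapped(head))
                deferred_split_huge_page(head);
}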
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--    mm/huge_memory.c    139
1 file changed, 135 insertions, 4 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b6ac6c43d6a4..4acf55b31f7c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -135,6 +135,10 @@ static struct khugepaged_scan khugepaged_scan = {
         .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;
 
 static void set_recommended_min_free_kbytes(void)
 {
@@ -667,6 +671,9 @@ static int __init hugepage_init(void)
         err = register_shrinker(&huge_zero_page_shrinker);
         if (err)
                 goto err_hzp_shrinker;
+        err = register_shrinker(&deferred_split_shrinker);
+        if (err)
+                goto err_split_shrinker;
 
         /*
          * By default disable transparent hugepages on smaller systems,
@@ -684,6 +691,8 @@ static int __init hugepage_init(void)
 
         return 0;
 err_khugepaged:
+        unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
         unregister_shrinker(&huge_zero_page_shrinker);
 err_hzp_shrinker:
         khugepaged_slab_exit();
@@ -740,6 +749,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
         return entry;
 }
 
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+        /*
+         * ->lru in the tail pages is occupied by compound_head.
+         * Let's use ->mapping + ->index in the second tail page as list_head.
+         */
+        return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+        /*
+         * we use page->mapping and page->indexlru in second tail page
+         * as list_head: assuming THP order >= 2
+         */
+        BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+        INIT_LIST_HEAD(page_deferred_list(page));
+        set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                         struct vm_area_struct *vma,
                                         unsigned long address, pmd_t *pmd,
@@ -896,6 +926,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
         }
+        prep_transhuge_page(page);
         return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
                                             flags);
 }
@@ -1192,7 +1223,9 @@ alloc:
         } else
                 new_page = NULL;
 
-        if (unlikely(!new_page)) {
+        if (likely(new_page)) {
+                prep_transhuge_page(new_page);
+        } else {
                 if (!page) {
                         split_huge_pmd(vma, pmd, address);
                         ret |= VM_FAULT_FALLBACK;
@@ -2109,6 +2142,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
                 return NULL;
         }
 
+        prep_transhuge_page(*hpage);
         count_vm_event(THP_COLLAPSE_ALLOC);
         return *hpage;
 }
@@ -2120,8 +2154,12 @@ static int khugepaged_find_target_node(void)
 
 static inline struct page *alloc_hugepage(int defrag)
 {
-        return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-                           HPAGE_PMD_ORDER);
+        struct page *page;
+
+        page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+        if (page)
+                prep_transhuge_page(page);
+        return page;
 }
 
 static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -3098,7 +3136,7 @@ static int __split_huge_page_tail(struct page *head, int tail,
         set_page_idle(page_tail);
 
         /* ->mapping in first tail page is compound_mapcount */
-        VM_BUG_ON_PAGE(tail != 1 && page_tail->mapping != TAIL_MAPPING,
+        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
                         page_tail);
         page_tail->mapping = head->mapping;
 
@@ -3207,12 +3245,20 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
         freeze_page(anon_vma, head);
         VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
+        /* Prevent deferred_split_scan() touching ->_count */
+        spin_lock(&split_queue_lock);
         count = page_count(head);
         mapcount = total_mapcount(head);
         if (mapcount == count - 1) {
+                if (!list_empty(page_deferred_list(head))) {
+                        split_queue_len--;
+                        list_del(page_deferred_list(head));
+                }
+                spin_unlock(&split_queue_lock);
                 __split_huge_page(page, list);
                 ret = 0;
         } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+                spin_unlock(&split_queue_lock);
                 pr_alert("total_mapcount: %u, page_count(): %u\n",
                          mapcount, count);
                 if (PageTail(page))
@@ -3220,6 +3266,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                 dump_page(page, "total_mapcount(head) > page_count(head) - 1");
                 BUG();
         } else {
+                spin_unlock(&split_queue_lock);
                 unfreeze_page(anon_vma, head);
                 ret = -EBUSY;
         }
@@ -3231,3 +3278,87 @@ out:
         count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
         return ret;
 }
+
+void free_transhuge_page(struct page *page)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&split_queue_lock, flags);
+        if (!list_empty(page_deferred_list(page))) {
+                split_queue_len--;
+                list_del(page_deferred_list(page));
+        }
+        spin_unlock_irqrestore(&split_queue_lock, flags);
+        free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+        unsigned long flags;
+
+        VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+        spin_lock_irqsave(&split_queue_lock, flags);
+        if (list_empty(page_deferred_list(page))) {
+                list_add_tail(page_deferred_list(page), &split_queue);
+                split_queue_len++;
+        }
+        spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+                struct shrink_control *sc)
+{
+        /*
+         * Split a page from split_queue will free up at least one page,
+         * at most HPAGE_PMD_NR - 1. We don't track exact number.
+         * Let's use HPAGE_PMD_NR / 2 as ballpark.
+         */
+        return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+                struct shrink_control *sc)
+{
+        unsigned long flags;
+        LIST_HEAD(list), *pos, *next;
+        struct page *page;
+        int split = 0;
+
+        spin_lock_irqsave(&split_queue_lock, flags);
+        list_splice_init(&split_queue, &list);
+
+        /* Take pin on all head pages to avoid freeing them under us */
+        list_for_each_safe(pos, next, &list) {
+                page = list_entry((void *)pos, struct page, mapping);
+                page = compound_head(page);
+                /* race with put_compound_page() */
+                if (!get_page_unless_zero(page)) {
+                        list_del_init(page_deferred_list(page));
+                        split_queue_len--;
+                }
+        }
+        spin_unlock_irqrestore(&split_queue_lock, flags);
+
+        list_for_each_safe(pos, next, &list) {
+                page = list_entry((void *)pos, struct page, mapping);
+                lock_page(page);
+                /* split_huge_page() removes page from list on success */
+                if (!split_huge_page(page))
+                        split++;
+                unlock_page(page);
+                put_page(page);
+        }
+
+        spin_lock_irqsave(&split_queue_lock, flags);
+        list_splice_tail(&list, &split_queue);
+        spin_unlock_irqrestore(&split_queue_lock, flags);
+
+        return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+        .count_objects = deferred_split_count,
+        .scan_objects = deferred_split_scan,
+        .seeks = DEFAULT_SEEKS,
+};
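
For orientation, not part of this diff: set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR) in prep_transhuge_page() only records an index in the head page; the table that turns that index back into a call to free_transhuge_page() when the last reference is dropped lives in mm/page_alloc.c, outside this diffstat. The assumed shape of that table in a tree of this era, shown purely as a sketch:

/*
 * Assumed shape of the compound destructor table (mm/page_alloc.c, not
 * shown in this diffstat): the compound_dtor index stored by
 * set_compound_page_dtor() selects the entry, so TRANSHUGE_PAGE_DTOR
 * dispatches to free_transhuge_page() introduced above.
 */
compound_page_dtor * const compound_page_dtors[] = {
        NULL,                   /* NULL_COMPOUND_DTOR */
        free_compound_page,     /* COMPOUND_PAGE_DTOR */
#ifdef CONFIG_HUGETLB_PAGE
        free_huge_page,         /* HUGETLB_PAGE_DTOR */
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        free_transhuge_page,    /* TRANSHUGE_PAGE_DTOR */
#endif
};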