author	Kirill A. Shutemov <kirill.shutemov@linux.intel.com>	2016-07-26 18:25:51 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-07-26 19:19:19 -0400
commit	baa355fd331424526e742d41d9b90d5f9d10f716
tree	762004078f781ba9fd053186dcb7d9925d3c41bd
parent	37f9f5595c26d3cb644ca2fab83dc4c4db119f9f
thp: file pages support for split_huge_page()
Basic scheme is the same as for anon THP.

Main differences:

- File pages are on the radix tree, so head->_count is offset by
  HPAGE_PMD_NR. The count gets distributed to the small pages during
  split.
- mapping->tree_lock prevents non-lockless access to pages under split
  over the radix tree;
- lockless access is prevented by setting head->_count to 0 during
  split;
- after split, some pages can be beyond i_size; we drop them from the
  radix tree;
- we don't set up migration entries, we just unmap the pages. This helps
  to handle the case when i_size falls in the middle of a page: there is
  no need to handle unmapping of pages beyond i_size manually.

Link: http://lkml.kernel.org/r/1466021202-61880-20-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
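For readers following the refcount arithmetic: a split can only proceed when every reference to the compound page is accounted for by its mappings, the extra radix-tree pins and the caller's own pin. Below is a minimal illustrative sketch of that accounting (not kernel code; the helper name is made up), mirroring the "total_mapcount(head) != page_count(head) - extra_pins - 1" check added to split_huge_page_to_list() in this patch:

#include <stdbool.h>

/*
 * Illustrative only -- not kernel code. One reference is expected per
 * mapping of the huge page, extra_pins references from the radix tree
 * (HPAGE_PMD_NR for file pages, 0 for anon THP), plus one reference
 * held by the caller. Any reference beyond that means someone else has
 * the page pinned and the split has to fail with -EBUSY.
 */
static inline bool thp_split_refcount_balanced(int total_mapcount,
					       int page_count, int extra_pins)
{
	return total_mapcount == page_count - extra_pins - 1;
}

The per-tail counterpart of the same accounting is visible in __split_huge_page_tail() below: anon tails get page_ref_inc(), while file tails get page_ref_add(..., 2), the extra reference being the pin held by the tail's radix-tree slot.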
-rw-r--r--	mm/gup.c	2
-rw-r--r--	mm/huge_memory.c	160
2 files changed, 117 insertions, 45 deletions
diff --git a/mm/gup.c b/mm/gup.c
index 9671e29f8ffd..547741f5f7a7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -288,6 +288,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 			ret = split_huge_page(page);
 			unlock_page(page);
 			put_page(page);
+			if (pmd_none(*pmd))
+				return no_page_table(vma, flags);
 		}
 
 		return ret ? ERR_PTR(ret) :
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3a20f11248a1..486077742650 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -30,6 +30,7 @@
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -3187,12 +3188,15 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 
 static void freeze_page(struct page *page)
 {
-	enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
-		TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+		TTU_RMAP_LOCKED;
 	int i, ret;
 
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
+	if (PageAnon(page))
+		ttu_flags |= TTU_MIGRATION;
+
 	/* We only need TTU_SPLIT_HUGE_PMD once */
 	ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
 	for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
@@ -3202,7 +3206,7 @@ static void freeze_page(struct page *page)
 
 		ret = try_to_unmap(page + i, ttu_flags);
 	}
-	VM_BUG_ON(ret);
+	VM_BUG_ON_PAGE(ret, page + i - 1);
 }
 
 static void unfreeze_page(struct page *page)
@@ -3224,15 +3228,20 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	/*
 	 * tail_page->_refcount is zero and not changing from under us. But
 	 * get_page_unless_zero() may be running from under us on the
-	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
-	 * would then run atomic_set() concurrently with
+	 * tail_page. If we used atomic_set() below instead of atomic_inc() or
+	 * atomic_add(), we would then run atomic_set() concurrently with
 	 * get_page_unless_zero(), and atomic_set() is implemented in C not
 	 * using locked ops. spin_unlock on x86 sometime uses locked ops
 	 * because of PPro errata 66, 92, so unless somebody can guarantee
 	 * atomic_set() here would be safe on all archs (and not only on x86),
-	 * it's safer to use atomic_inc().
+	 * it's safer to use atomic_inc()/atomic_add().
 	 */
-	page_ref_inc(page_tail);
+	if (PageAnon(head)) {
+		page_ref_inc(page_tail);
+	} else {
+		/* Additional pin to radix tree */
+		page_ref_add(page_tail, 2);
+	}
 
 	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	page_tail->flags |= (head->flags &
@@ -3268,25 +3277,44 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	lru_add_page_tail(head, page_tail, lruvec, list);
 }
 
-static void __split_huge_page(struct page *page, struct list_head *list)
+static void __split_huge_page(struct page *page, struct list_head *list,
+		unsigned long flags)
 {
 	struct page *head = compound_head(page);
 	struct zone *zone = page_zone(head);
 	struct lruvec *lruvec;
+	pgoff_t end = -1;
 	int i;
 
-	/* prevent PageLRU to go away from under us, and freeze lru stats */
-	spin_lock_irq(&zone->lru_lock);
 	lruvec = mem_cgroup_page_lruvec(head, zone);
 
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
-	for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+	if (!PageAnon(page))
+		end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
+
+	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
+		/* Some pages can be beyond i_size: drop them from page cache */
+		if (head[i].index >= end) {
+			__ClearPageDirty(head + i);
+			__delete_from_page_cache(head + i, NULL);
+			put_page(head + i);
+		}
+	}
 
 	ClearPageCompound(head);
-	spin_unlock_irq(&zone->lru_lock);
+	/* See comment in __split_huge_page_tail() */
+	if (PageAnon(head)) {
+		page_ref_inc(head);
+	} else {
+		/* Additional pin to radix tree */
+		page_ref_add(head, 2);
+		spin_unlock(&head->mapping->tree_lock);
+	}
+
+	spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
 
 	unfreeze_page(head);
 
@@ -3411,36 +3439,54 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
 	struct page *head = compound_head(page);
 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
-	struct anon_vma *anon_vma;
-	int count, mapcount, ret;
+	struct anon_vma *anon_vma = NULL;
+	struct address_space *mapping = NULL;
+	int count, mapcount, extra_pins, ret;
 	bool mlocked;
 	unsigned long flags;
 
 	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
-	VM_BUG_ON_PAGE(!PageAnon(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	/*
-	 * The caller does not necessarily hold an mmap_sem that would prevent
-	 * the anon_vma disappearing so we first we take a reference to it
-	 * and then lock the anon_vma for write. This is similar to
-	 * page_lock_anon_vma_read except the write lock is taken to serialise
-	 * against parallel split or collapse operations.
-	 */
-	anon_vma = page_get_anon_vma(head);
-	if (!anon_vma) {
-		ret = -EBUSY;
-		goto out;
+	if (PageAnon(head)) {
+		/*
+		 * The caller does not necessarily hold an mmap_sem that would
+		 * prevent the anon_vma disappearing so we first we take a
+		 * reference to it and then lock the anon_vma for write. This
+		 * is similar to page_lock_anon_vma_read except the write lock
+		 * is taken to serialise against parallel split or collapse
+		 * operations.
+		 */
+		anon_vma = page_get_anon_vma(head);
+		if (!anon_vma) {
+			ret = -EBUSY;
+			goto out;
+		}
+		extra_pins = 0;
+		mapping = NULL;
+		anon_vma_lock_write(anon_vma);
+	} else {
+		mapping = head->mapping;
+
+		/* Truncated ? */
+		if (!mapping) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		/* Addidional pins from radix tree */
+		extra_pins = HPAGE_PMD_NR;
+		anon_vma = NULL;
+		i_mmap_lock_read(mapping);
 	}
-	anon_vma_lock_write(anon_vma);
 
 	/*
 	 * Racy check if we can split the page, before freeze_page() will
 	 * split PMDs
 	 */
-	if (total_mapcount(head) != page_count(head) - 1) {
+	if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
@@ -3453,35 +3499,60 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mlocked)
 		lru_add_drain();
 
+	/* prevent PageLRU to go away from under us, and freeze lru stats */
+	spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
+
+	if (mapping) {
+		void **pslot;
+
+		spin_lock(&mapping->tree_lock);
+		pslot = radix_tree_lookup_slot(&mapping->page_tree,
+				page_index(head));
+		/*
+		 * Check if the head page is present in radix tree.
+		 * We assume all tail are present too, if head is there.
+		 */
+		if (radix_tree_deref_slot_protected(pslot,
+					&mapping->tree_lock) != head)
+			goto fail;
+	}
+
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+	spin_lock(&pgdata->split_queue_lock);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
-	if (!mapcount && count == 1) {
+	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
 		if (!list_empty(page_deferred_list(head))) {
 			pgdata->split_queue_len--;
 			list_del(page_deferred_list(head));
 		}
-		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
-		__split_huge_page(page, list);
+		spin_unlock(&pgdata->split_queue_lock);
+		__split_huge_page(page, list, flags);
 		ret = 0;
-	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
-		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
-		pr_alert("total_mapcount: %u, page_count(): %u\n",
-				mapcount, count);
-		if (PageTail(page))
-			dump_page(head, NULL);
-		dump_page(page, "total_mapcount(head) > 0");
-		BUG();
 	} else {
-		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+			pr_alert("total_mapcount: %u, page_count(): %u\n",
+					mapcount, count);
+			if (PageTail(page))
+				dump_page(head, NULL);
+			dump_page(page, "total_mapcount(head) > 0");
+			BUG();
+		}
+		spin_unlock(&pgdata->split_queue_lock);
+fail:		if (mapping)
+			spin_unlock(&mapping->tree_lock);
+		spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
 		unfreeze_page(head);
 		ret = -EBUSY;
 	}
 
 out_unlock:
-	anon_vma_unlock_write(anon_vma);
-	put_anon_vma(anon_vma);
+	if (anon_vma) {
+		anon_vma_unlock_write(anon_vma);
+		put_anon_vma(anon_vma);
+	}
+	if (mapping)
+		i_mmap_unlock_read(mapping);
 out:
 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
 	return ret;
@@ -3604,8 +3675,7 @@ static int split_huge_pages_set(void *data, u64 val)
 		if (zone != page_zone(page))
 			goto next;
 
-		if (!PageHead(page) || !PageAnon(page) ||
-				PageHuge(page))
+		if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
 			goto next;
 
 		total++;