author     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>    2016-07-26 18:25:51 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>         2016-07-26 19:19:19 -0400
commit     baa355fd331424526e742d41d9b90d5f9d10f716
tree       762004078f781ba9fd053186dcb7d9925d3c41bd
parent     37f9f5595c26d3cb644ca2fab83dc4c4db119f9f
thp: file pages support for split_huge_page()
The basic scheme is the same as for anon THP.
Main differences:
- File pages are in the radix tree, so head->_count is offset by
  HPAGE_PMD_NR. The count gets distributed to the small pages during
  split (see the sketch below).
- Holding mapping->tree_lock prevents locked (non-lockless) radix-tree
  access to pages under split;
- Lockless access is prevented by freezing head->_count to 0 during
  split;
- After split, some pages can be beyond i_size. We drop them from the
  radix tree.
- We don't set up migration entries; we just unmap the pages. That
  helps handle the case where i_size falls in the middle of the huge
  page: there is no need to unmap pages beyond i_size manually.
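
To make the first bullet concrete, here is a scaled-down userspace model
of the refcount redistribution. The refcount[] array and the shrunken
HPAGE_PMD_NR are invented for illustration; the steps mirror the
page_ref_freeze()/page_ref_add() calls in the diff below, they are not
the kernel implementation.

/*
 * Scaled-down model of the split-time refcount redistribution.
 * HPAGE_PMD_NR is 512 on x86-64; shrunk here so the output is short.
 * refcount[] stands in for page->_refcount; index 0 is the head page.
 */
#include <assert.h>
#include <stdio.h>

#define HPAGE_PMD_NR 8

static int refcount[HPAGE_PMD_NR];

int main(void)
{
        int i;

        /* File THP: the radix tree holds HPAGE_PMD_NR pins, all accounted
         * on the head, plus one pin from the caller about to split. */
        refcount[0] = 1 + HPAGE_PMD_NR;

        /* page_ref_freeze(head, 1 + extra_pins): collapse the count to 0,
         * which is what blocks lockless lookups during the split. */
        assert(refcount[0] == 1 + HPAGE_PMD_NR);
        refcount[0] = 0;

        /* Split: each tail takes over its own radix-tree pin on top of a
         * base reference, hence page_ref_add(page_tail, 2) in the diff. */
        for (i = 1; i < HPAGE_PMD_NR; i++)
                refcount[i] += 2;

        /* The head keeps the caller's pin plus its own radix-tree pin. */
        refcount[0] += 2;

        for (i = 0; i < HPAGE_PMD_NR; i++)
                printf("page[%d] refcount = %d\n", i, refcount[i]);
        return 0;
}

Running it prints refcount = 2 for every subpage: the head holds the
caller's pin plus its page-cache pin, and each tail holds a base
reference plus the page-cache pin it took over from the head.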
Link: http://lkml.kernel.org/r/1466021202-61880-20-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  mm/gup.c          |   2
-rw-r--r--  mm/huge_memory.c  | 160
2 files changed, 117 insertions(+), 45 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -288,6 +288,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 			ret = split_huge_page(page);
 			unlock_page(page);
 			put_page(page);
+			if (pmd_none(*pmd))
+				return no_page_table(vma, flags);
 		}
 
 		return ret ? ERR_PTR(ret) :
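
The new pmd_none() re-check follows from the last bullet of the commit
message: for file pages, freeze_page() only unmaps the huge page and
installs no migration entries, so a successful split can leave the PMD
empty rather than pointing at a PTE table. Schematically, the FOLL_SPLIT
path now looks like this (an editorial condensation of the surrounding
follow_page_mask() code, not compilable as-is):

	get_page(page);
	spin_unlock(ptl);
	lock_page(page);
	ret = split_huge_page(page);
	unlock_page(page);
	put_page(page);
	if (pmd_none(*pmd))		/* file THP: split left no PTEs behind */
		return no_page_table(vma, flags);	/* caller must fault it back in */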
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3a20f11248a1..486077742650 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -30,6 +30,7 @@
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -3187,12 +3188,15 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 
 static void freeze_page(struct page *page)
 {
-	enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
-		TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+	enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+		TTU_RMAP_LOCKED;
 	int i, ret;
 
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 
+	if (PageAnon(page))
+		ttu_flags |= TTU_MIGRATION;
+
 	/* We only need TTU_SPLIT_HUGE_PMD once */
 	ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
 	for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
@@ -3202,7 +3206,7 @@ static void freeze_page(struct page *page)
 
 		ret = try_to_unmap(page + i, ttu_flags);
 	}
-	VM_BUG_ON(ret);
+	VM_BUG_ON_PAGE(ret, page + i - 1);
 }
 
 static void unfreeze_page(struct page *page)
@@ -3224,15 +3228,20 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	/*
 	 * tail_page->_refcount is zero and not changing from under us. But
 	 * get_page_unless_zero() may be running from under us on the
-	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
-	 * would then run atomic_set() concurrently with
+	 * tail_page. If we used atomic_set() below instead of atomic_inc() or
+	 * atomic_add(), we would then run atomic_set() concurrently with
 	 * get_page_unless_zero(), and atomic_set() is implemented in C not
 	 * using locked ops. spin_unlock on x86 sometime uses locked ops
 	 * because of PPro errata 66, 92, so unless somebody can guarantee
 	 * atomic_set() here would be safe on all archs (and not only on x86),
-	 * it's safer to use atomic_inc().
+	 * it's safer to use atomic_inc()/atomic_add().
 	 */
-	page_ref_inc(page_tail);
+	if (PageAnon(head)) {
+		page_ref_inc(page_tail);
+	} else {
+		/* Additional pin to radix tree */
+		page_ref_add(page_tail, 2);
+	}
 
 	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	page_tail->flags |= (head->flags &
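
The comment in this hunk is dense; the underlying hazard class is that
atomic_set() is a plain store while get_page_unless_zero() is a
compare-and-swap loop, and a plain store can silently overwrite a
concurrent locked increment. Below is a self-contained C11 illustration
of that lost-update pattern, with made-up values; it models the general
store-vs-RMW problem, not the exact kernel interleaving or the PPro
errata the comment cites.

/*
 * Why refcounts are published with a locked RMW rather than a store.
 */
#include <assert.h>
#include <stdatomic.h>

int main(void)
{
        atomic_int ref = 1;

        /* Publisher reads 1, decides the count should become 1 + 2 = 3.
         * Before it stores, an observer takes a pin. */
        atomic_fetch_add(&ref, 1);      /* observer: ref == 2 */
        atomic_store(&ref, 3);          /* publisher's stale value wins */
        assert(atomic_load(&ref) == 3); /* observer's pin was lost: should be 4 */

        /* With a locked read-modify-write the pin survives. */
        ref = 1;
        atomic_fetch_add(&ref, 1);      /* observer */
        atomic_fetch_add(&ref, 2);      /* publisher */
        assert(atomic_load(&ref) == 4);
        return 0;
}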
@@ -3268,25 +3277,44 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	lru_add_page_tail(head, page_tail, lruvec, list);
 }
 
-static void __split_huge_page(struct page *page, struct list_head *list)
+static void __split_huge_page(struct page *page, struct list_head *list,
+		unsigned long flags)
 {
 	struct page *head = compound_head(page);
 	struct zone *zone = page_zone(head);
 	struct lruvec *lruvec;
+	pgoff_t end = -1;
 	int i;
 
-	/* prevent PageLRU to go away from under us, and freeze lru stats */
-	spin_lock_irq(&zone->lru_lock);
 	lruvec = mem_cgroup_page_lruvec(head, zone);
 
 	/* complete memcg works before add pages to LRU */
 	mem_cgroup_split_huge_fixup(head);
 
-	for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+	if (!PageAnon(page))
+		end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
+
+	for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
 		__split_huge_page_tail(head, i, lruvec, list);
+		/* Some pages can be beyond i_size: drop them from page cache */
+		if (head[i].index >= end) {
+			__ClearPageDirty(head + i);
+			__delete_from_page_cache(head + i, NULL);
+			put_page(head + i);
+		}
+	}
 
 	ClearPageCompound(head);
-	spin_unlock_irq(&zone->lru_lock);
+	/* See comment in __split_huge_page_tail() */
+	if (PageAnon(head)) {
+		page_ref_inc(head);
+	} else {
+		/* Additional pin to radix tree */
+		page_ref_add(head, 2);
+		spin_unlock(&head->mapping->tree_lock);
+	}
+
+	spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
 
 	unfreeze_page(head);
 
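A worked example of the end arithmetic above, with concrete numbers
invented for illustration: with 4 KiB pages and an i_size of 2 MiB plus
one byte, DIV_ROUND_UP gives end = 513, so when a THP covering page
indices 512-1023 is split, the head at index 512 stays in the page cache
while the 511 tails at indices 513-1023 satisfy head[i].index >= end and
get dropped.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long i_size = (2UL << 20) + 1; /* 2 MiB + 1 byte */
        unsigned long end = DIV_ROUND_UP(i_size, PAGE_SIZE);

        printf("end = %lu\n", end);     /* 513: first index beyond EOF */
        return 0;
}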
@@ -3411,36 +3439,54 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
 	struct page *head = compound_head(page);
 	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
-	struct anon_vma *anon_vma;
-	int count, mapcount, ret;
+	struct anon_vma *anon_vma = NULL;
+	struct address_space *mapping = NULL;
+	int count, mapcount, extra_pins, ret;
 	bool mlocked;
 	unsigned long flags;
 
 	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
-	VM_BUG_ON_PAGE(!PageAnon(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	/*
-	 * The caller does not necessarily hold an mmap_sem that would prevent
-	 * the anon_vma disappearing so we first we take a reference to it
-	 * and then lock the anon_vma for write. This is similar to
-	 * page_lock_anon_vma_read except the write lock is taken to serialise
-	 * against parallel split or collapse operations.
-	 */
-	anon_vma = page_get_anon_vma(head);
-	if (!anon_vma) {
-		ret = -EBUSY;
-		goto out;
+	if (PageAnon(head)) {
+		/*
+		 * The caller does not necessarily hold an mmap_sem that would
+		 * prevent the anon_vma disappearing so we first take a
+		 * reference to it and then lock the anon_vma for write. This
+		 * is similar to page_lock_anon_vma_read except the write lock
+		 * is taken to serialise against parallel split or collapse
+		 * operations.
+		 */
+		anon_vma = page_get_anon_vma(head);
+		if (!anon_vma) {
+			ret = -EBUSY;
+			goto out;
+		}
+		extra_pins = 0;
+		mapping = NULL;
+		anon_vma_lock_write(anon_vma);
+	} else {
+		mapping = head->mapping;
+
+		/* Truncated? */
+		if (!mapping) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		/* Additional pins from radix tree */
+		extra_pins = HPAGE_PMD_NR;
+		anon_vma = NULL;
+		i_mmap_lock_read(mapping);
 	}
-	anon_vma_lock_write(anon_vma);
 
 	/*
 	 * Racy check if we can split the page, before freeze_page() will
 	 * split PMDs
 	 */
-	if (total_mapcount(head) != page_count(head) - 1) {
+	if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
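The racy check now has to subtract the radix-tree pins. A sketch of the
arithmetic it performs for a shmem THP, with invented example numbers
(each mapping and each page-cache slot contributes one reference to
page_count()):

#include <assert.h>

#define HPAGE_PMD_NR 512

int main(void)
{
        int extra_pins = HPAGE_PMD_NR;  /* file page: radix-tree pins */
        int total_mapcount = 1;         /* one PMD mapping */
        int page_count = total_mapcount + 1 /* caller */ + extra_pins;

        /* Split may proceed: no unexpected reference holders. */
        assert(total_mapcount == page_count - extra_pins - 1);

        page_count++;                   /* e.g. a transient gup pin */
        assert(total_mapcount != page_count - extra_pins - 1); /* -EBUSY */
        return 0;
}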
@@ -3453,35 +3499,60 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mlocked)
 		lru_add_drain();
 
+	/* prevent PageLRU to go away from under us, and freeze lru stats */
+	spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
+
+	if (mapping) {
+		void **pslot;
+
+		spin_lock(&mapping->tree_lock);
+		pslot = radix_tree_lookup_slot(&mapping->page_tree,
+				page_index(head));
+		/*
+		 * Check if the head page is present in radix tree.
+		 * We assume all tails are present too, if head is there.
+		 */
+		if (radix_tree_deref_slot_protected(pslot,
+					&mapping->tree_lock) != head)
+			goto fail;
+	}
+
 	/* Prevent deferred_split_scan() touching ->_refcount */
-	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+	spin_lock(&pgdata->split_queue_lock);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
-	if (!mapcount && count == 1) {
+	if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
 		if (!list_empty(page_deferred_list(head))) {
 			pgdata->split_queue_len--;
 			list_del(page_deferred_list(head));
 		}
-		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
-		__split_huge_page(page, list);
+		spin_unlock(&pgdata->split_queue_lock);
+		__split_huge_page(page, list, flags);
 		ret = 0;
-	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
-		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
-		pr_alert("total_mapcount: %u, page_count(): %u\n",
-				mapcount, count);
-		if (PageTail(page))
-			dump_page(head, NULL);
-		dump_page(page, "total_mapcount(head) > 0");
-		BUG();
 	} else {
-		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+			pr_alert("total_mapcount: %u, page_count(): %u\n",
+					mapcount, count);
+			if (PageTail(page))
+				dump_page(head, NULL);
+			dump_page(page, "total_mapcount(head) > 0");
+			BUG();
+		}
+		spin_unlock(&pgdata->split_queue_lock);
+fail:		if (mapping)
+			spin_unlock(&mapping->tree_lock);
+		spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
 		unfreeze_page(head);
 		ret = -EBUSY;
 	}
 
 out_unlock:
-	anon_vma_unlock_write(anon_vma);
-	put_anon_vma(anon_vma);
+	if (anon_vma) {
+		anon_vma_unlock_write(anon_vma);
+		put_anon_vma(anon_vma);
+	}
+	if (mapping)
+		i_mmap_unlock_read(mapping);
 out:
 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
 	return ret;
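
The page_ref_freeze(head, 1 + extra_pins) call that replaced the old
count == 1 test is the real serialisation point here: modulo tracepoint
hooks, its core in include/linux/page_ref.h boils down to a single
cmpxchg, so the count collapses to zero only if the caller's pin and the
extra_pins radix-tree pins are the sole remaining references:

static inline int page_ref_freeze(struct page *page, int count)
{
	/* Succeeds, and makes every get_page_unless_zero() fail, only
	 * if _refcount is exactly the expected number of pins. */
	return likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
}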
@@ -3604,8 +3675,7 @@ static int split_huge_pages_set(void *data, u64 val)
 		if (zone != page_zone(page))
 			goto next;
 
-		if (!PageHead(page) || !PageAnon(page) ||
-				PageHuge(page))
+		if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
 			goto next;
 
 		total++;