| field | value | date |
|---|---|---|
| author | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500 |
| committer | Glenn Elliott <gelliott@cs.unc.edu> | 2012-03-04 19:47:13 -0500 |
| commit | c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch) | |
| tree | ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/hugetlb.c | |
| parent | ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff) | |
| parent | 6a00f206debf8a5c8899055726ad127dbeeed098 (diff) | |
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 338 |
1 file changed, 201 insertions(+), 137 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..bfcf153bc829 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t)
                if (rg->from > t)
                        return chg;
 
-               /* We overlap with this area, if it extends futher than
+               /* We overlap with this area, if it extends further than
                 * us then we must extend ourselves.  Account for its
                 * existing reservation. */
                if (rg->to > t) {
@@ -394,67 +394,37 @@ static int vma_has_reserves(struct vm_area_struct *vma)
        return 0;
 }
 
-static void clear_gigantic_page(struct page *page,
-                       unsigned long addr, unsigned long sz)
-{
-       int i;
-       struct page *p = page;
-
-       might_sleep();
-       for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
-               cond_resched();
-               clear_user_highpage(p, addr + i * PAGE_SIZE);
-       }
-}
-static void clear_huge_page(struct page *page,
-                       unsigned long addr, unsigned long sz)
-{
-       int i;
-
-       if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
-               clear_gigantic_page(page, addr, sz);
-               return;
-       }
-
-       might_sleep();
-       for (i = 0; i < sz/PAGE_SIZE; i++) {
-               cond_resched();
-               clear_user_highpage(page + i, addr + i * PAGE_SIZE);
-       }
-}
-
-static void copy_gigantic_page(struct page *dst, struct page *src,
-                          unsigned long addr, struct vm_area_struct *vma)
-{
-       int i;
-       struct hstate *h = hstate_vma(vma);
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+       int i;
+       struct hstate *h = page_hstate(src);
        struct page *dst_base = dst;
        struct page *src_base = src;
-       might_sleep();
+
        for (i = 0; i < pages_per_huge_page(h); ) {
                cond_resched();
-               copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+               copy_highpage(dst, src);
 
                i++;
                dst = mem_map_next(dst, dst_base, i);
                src = mem_map_next(src, src_base, i);
        }
 }
-static void copy_huge_page(struct page *dst, struct page *src,
-                          unsigned long addr, struct vm_area_struct *vma)
+
+void copy_huge_page(struct page *dst, struct page *src)
 {
        int i;
-       struct hstate *h = hstate_vma(vma);
+       struct hstate *h = page_hstate(src);
 
        if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-               copy_gigantic_page(dst, src, addr, vma);
+               copy_gigantic_page(dst, src);
                return;
        }
 
        might_sleep();
        for (i = 0; i < pages_per_huge_page(h); i++) {
                cond_resched();
-               copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+               copy_highpage(dst + i, src + i);
        }
 }
 
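With this change copy_huge_page() takes only a destination and a source page and derives the hstate from the source via page_hstate(), while user-triggered copies in the COW path go through copy_user_huge_page() (see the hunk at old line 2386 further down). A minimal sketch of a vma-less caller such as hugepage migration, assuming the declarations from <linux/hugetlb.h>; the wrapper name below is illustrative and not part of this commit:

    /*
     * Illustrative wrapper only: shows the new two-argument interface.
     * The hstate (and thus the number of base pages to copy) is taken
     * from the source page, so no user VMA or faulting address is needed.
     */
    static void migrate_copy_huge(struct page *newpage, struct page *oldpage)
    {
            copy_huge_page(newpage, oldpage);       /* may sleep */
    }
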
@@ -466,11 +436,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
        h->free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+       struct page *page;
+
+       if (list_empty(&h->hugepage_freelists[nid]))
+               return NULL;
+       page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+       list_del(&page->lru);
+       set_page_refcounted(page);
+       h->free_huge_pages--;
+       h->free_huge_pages_node[nid]--;
+       return page;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
                                struct vm_area_struct *vma,
                                unsigned long address, int avoid_reserve)
 {
-       int nid;
        struct page *page = NULL;
        struct mempolicy *mpol;
        nodemask_t *nodemask;
@@ -492,23 +475,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 
        /* If reserves cannot be used, ensure enough pages are in the pool */
        if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
-               goto err;;
+               goto err;
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                MAX_NR_ZONES - 1, nodemask) {
-               nid = zone_to_nid(zone);
-               if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-                   !list_empty(&h->hugepage_freelists[nid])) {
-                       page = list_entry(h->hugepage_freelists[nid].next,
-                                         struct page, lru);
-                       list_del(&page->lru);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[nid]--;
-
-                       if (!avoid_reserve)
-                               decrement_hugepage_resv_vma(h, vma);
-
-                       break;
+               if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+                       page = dequeue_huge_page_node(h, zone_to_nid(zone));
+                       if (page) {
+                               if (!avoid_reserve)
+                                       decrement_hugepage_resv_vma(h, vma);
+                               break;
+                       }
                }
        }
 err:
@@ -770,11 +747,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
        return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h,
-                       struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 {
        struct page *page;
-       unsigned int nid;
+       unsigned int r_nid;
 
        if (h->order >= MAX_ORDER)
                return NULL;
@@ -812,9 +788,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
        }
        spin_unlock(&hugetlb_lock);
 
-       page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
-                                       __GFP_REPEAT|__GFP_NOWARN,
-                                       huge_page_order(h));
+       if (nid == NUMA_NO_NODE)
+               page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+                                  __GFP_REPEAT|__GFP_NOWARN,
+                                  huge_page_order(h));
+       else
+               page = alloc_pages_exact_node(nid,
+                       htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+                       __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
        if (page && arch_prepare_hugepage(page)) {
                __free_pages(page, huge_page_order(h));
@@ -823,19 +804,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 
        spin_lock(&hugetlb_lock);
        if (page) {
-               /*
-                * This page is now managed by the hugetlb allocator and has
-                * no users -- drop the buddy allocator's reference.
-                */
-               put_page_testzero(page);
-               VM_BUG_ON(page_count(page));
-               nid = page_to_nid(page);
+               r_nid = page_to_nid(page);
                set_compound_page_dtor(page, free_huge_page);
                /*
                 * We incremented the global counters already
                 */
-               h->nr_huge_pages_node[nid]++;
-               h->surplus_huge_pages_node[nid]++;
+               h->nr_huge_pages_node[r_nid]++;
+               h->surplus_huge_pages_node[r_nid]++;
                __count_vm_event(HTLB_BUDDY_PGALLOC);
        } else {
                h->nr_huge_pages--;
@@ -848,7 +823,26 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 }
 
 /*
- * Increase the hugetlb pool such that it can accomodate a reservation
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+       struct page *page;
+
+       spin_lock(&hugetlb_lock);
+       page = dequeue_huge_page_node(h, nid);
+       spin_unlock(&hugetlb_lock);
+
+       if (!page)
+               page = alloc_buddy_huge_page(h, nid);
+
+       return page;
+}
+
+/*
+ * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
  */
 static int gather_surplus_pages(struct hstate *h, int delta)
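The comment above spells out the motivation: soft offlining only cares about the physical location of the bad page, so it needs a hugepage on a given node without any vma. A minimal sketch of such a caller, assuming the declarations from <linux/hugetlb.h>; the function name is illustrative, not the actual memory-failure helper:

    /*
     * Illustrative only: allocate a replacement hugepage on the same node
     * as a poisoned one. alloc_huge_page_node() first tries that node's
     * free list and then falls back to the buddy allocator.
     */
    static struct page *new_hugepage_same_node(struct page *hpage)
    {
            return alloc_huge_page_node(page_hstate(hpage), page_to_nid(hpage));
    }
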
@@ -871,17 +865,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
        spin_unlock(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
-               page = alloc_buddy_huge_page(h, NULL, 0);
-               if (!page) {
+               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+               if (!page)
                        /*
                         * We were not able to allocate enough pages to
                         * satisfy the entire reservation so we free what
                         * we've allocated so far.
                         */
-                       spin_lock(&hugetlb_lock);
-                       needed = 0;
                        goto free;
-               }
 
                list_add(&page->lru, &surplus_list);
        }
@@ -899,7 +890,7 @@ retry:
 
        /*
         * The surplus_list now contains _at_least_ the number of extra pages
-        * needed to accomodate the reservation.  Add the appropriate number
+        * needed to accommodate the reservation.  Add the appropriate number
         * of pages to the hugetlb pool and free the extras back to the buddy
         * allocator.  Commit the entire reservation here to prevent another
         * process from stealing the pages as they are added to the pool but
@@ -908,31 +899,31 @@ retry:
        needed += allocated;
        h->resv_huge_pages += delta;
        ret = 0;
-free:
+
+       spin_unlock(&hugetlb_lock);
        /* Free the needed pages to the hugetlb pool */
        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                if ((--needed) < 0)
                        break;
                list_del(&page->lru);
+               /*
+                * This page is now managed by the hugetlb allocator and has
+                * no users -- drop the buddy allocator's reference.
+                */
+               put_page_testzero(page);
+               VM_BUG_ON(page_count(page));
                enqueue_huge_page(h, page);
        }
 
        /* Free unnecessary surplus pages to the buddy allocator */
+free:
        if (!list_empty(&surplus_list)) {
-               spin_unlock(&hugetlb_lock);
                list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
                        list_del(&page->lru);
-                       /*
-                        * The page has a reference count of zero already, so
-                        * call free_huge_page directly instead of using
-                        * put_page.  This must be done with hugetlb_lock
-                        * unlocked which is safe because free_huge_page takes
-                        * hugetlb_lock before deciding how to free the page.
-                        */
-                       free_huge_page(page);
+                       put_page(page);
                }
-               spin_lock(&hugetlb_lock);
        }
+       spin_lock(&hugetlb_lock);
 
        return ret;
 }
@@ -1042,24 +1033,23 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         */
        chg = vma_needs_reservation(h, vma, addr);
        if (chg < 0)
-               return ERR_PTR(chg);
+               return ERR_PTR(-VM_FAULT_OOM);
        if (chg)
                if (hugetlb_get_quota(inode->i_mapping, chg))
-                       return ERR_PTR(-ENOSPC);
+                       return ERR_PTR(-VM_FAULT_SIGBUS);
 
        spin_lock(&hugetlb_lock);
        page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
        spin_unlock(&hugetlb_lock);
 
        if (!page) {
-               page = alloc_buddy_huge_page(h, vma, addr);
+               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
                if (!page) {
                        hugetlb_put_quota(inode->i_mapping, chg);
                        return ERR_PTR(-VM_FAULT_SIGBUS);
                }
        }
 
-       set_page_refcounted(page);
        set_page_private(page, (unsigned long) mapping);
 
        vma_commit_reservation(h, vma, addr);
@@ -1121,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
                WARN_ON(page_count(page) != 1);
                prep_compound_huge_page(page, h->order);
                prep_new_huge_page(h, page, page_to_nid(page));
+               /*
+                * If we had gigantic hugepages allocated at boot time, we need
+                * to restore the 'stolen' pages to totalram_pages in order to
+                * fix confusing memory reports from free(1) and another
+                * side-effects, like CommitLimit going negative.
+                */
+               if (h->order > (MAX_ORDER - 1))
+                       totalram_pages += 1 << h->order;
        }
 }
 
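To make the accounting concrete (the numbers assume x86-64 defaults, i.e. 4 KiB base pages and MAX_ORDER = 11, so only gigantic pages take this branch):

    /*
     * Illustration only, not part of the diff: a 1 GiB gigantic page has
     * order 18 with 4 KiB base pages, so each boot-allocated gigantic page
     * restores 1 << 18 = 262144 base pages (1 GiB) to totalram_pages.
     * A 2 MiB hugepage (order 9) is below MAX_ORDER and never reaches
     * this branch.
     */
    unsigned long restored_pages = 1UL << 18;   /* 262144 == 1 GiB / 4 KiB */
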
@@ -1373,6 +1371,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
 
        return sprintf(buf, "%lu\n", nr_huge_pages);
 }
+
 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
                        struct kobject *kobj, struct kobj_attribute *attr,
                        const char *buf, size_t len)
@@ -1385,9 +1384,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
 
        err = strict_strtoul(buf, 10, &count);
        if (err)
-               return 0;
+               goto out;
 
        h = kobj_to_hstate(kobj, &nid);
+       if (h->order >= MAX_ORDER) {
+               err = -EINVAL;
+               goto out;
+       }
+
        if (nid == NUMA_NO_NODE) {
                /*
                 * global hstate attribute
@@ -1413,6 +1417,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
        NODEMASK_FREE(nodes_allowed);
 
        return len;
+out:
+       NODEMASK_FREE(nodes_allowed);
+       return err;
 }
 
 static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1455,6 +1462,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
        struct hstate *h = kobj_to_hstate(kobj, NULL);
        return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
 }
+
 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
 {
@@ -1462,9 +1470,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
        unsigned long input;
        struct hstate *h = kobj_to_hstate(kobj, NULL);
 
+       if (h->order >= MAX_ORDER)
+               return -EINVAL;
+
        err = strict_strtoul(buf, 10, &input);
        if (err)
-               return 0;
+               return err;
 
        spin_lock(&hugetlb_lock);
        h->nr_overcommit_huge_pages = input;
@@ -1867,13 +1878,18 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
 {
        struct hstate *h = &default_hstate;
        unsigned long tmp;
+       int ret;
+
+       tmp = h->max_huge_pages;
 
-       if (!write)
-               tmp = h->max_huge_pages;
+       if (write && h->order >= MAX_ORDER)
+               return -EINVAL;
 
        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
-       proc_doulongvec_minmax(table, write, buffer, length, ppos);
+       ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+       if (ret)
+               goto out;
 
        if (write) {
                NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1888,8 +1904,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
                if (nodes_allowed != &node_states[N_HIGH_MEMORY])
                        NODEMASK_FREE(nodes_allowed);
        }
-
-       return 0;
+out:
+       return ret;
 }
 
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1927,21 +1943,26 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 {
        struct hstate *h = &default_hstate;
        unsigned long tmp;
+       int ret;
 
-       if (!write)
-               tmp = h->nr_overcommit_huge_pages;
+       tmp = h->nr_overcommit_huge_pages;
+
+       if (write && h->order >= MAX_ORDER)
+               return -EINVAL;
 
        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
-       proc_doulongvec_minmax(table, write, buffer, length, ppos);
+       ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+       if (ret)
+               goto out;
 
        if (write) {
                spin_lock(&hugetlb_lock);
                h->nr_overcommit_huge_pages = tmp;
                spin_unlock(&hugetlb_lock);
        }
-
-       return 0;
+out:
+       return ret;
 }
 
 #endif /* CONFIG_SYSCTL */
@@ -2030,7 +2051,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
         * This new VMA should share its siblings reservation map if present.
         * The VMA will only ever have a valid reservation map pointer where
         * it is being copied for another still existing VMA.  As that VMA
-        * has a reference to the reservation map it cannot dissappear until
+        * has a reference to the reservation map it cannot disappear until
         * after this open call completes.  It is therefore safe to take a
         * new reference here without additional locking.
         */
@@ -2153,6 +2174,19 @@ nomem:
        return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+       swp_entry_t swp;
+
+       if (huge_pte_none(pte) || pte_present(pte))
+               return 0;
+       swp = pte_to_swp_entry(pte);
+       if (non_swap_entry(swp) && is_migration_entry(swp)) {
+               return 1;
+       } else
+               return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
        swp_entry_t swp;
@@ -2179,7 +2213,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
        unsigned long sz = huge_page_size(h);
 
        /*
-        * A page gathering list, protected by per file i_mmap_lock. The
+        * A page gathering list, protected by per file i_mmap_mutex. The
         * lock is used to avoid list corruption from multiple unmapping
         * of the same page since we are using page->lru.
         */
@@ -2248,9 +2282,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end, struct page *ref_page)
 {
-       spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+       mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
        __unmap_hugepage_range(vma, start, end, ref_page);
-       spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+       mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 /*
@@ -2282,7 +2316,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
         * this mapping should be shared between all the VMAs,
         * __unmap_hugepage_range() is called as the lock is already held
         */
-       spin_lock(&mapping->i_mmap_lock);
+       mutex_lock(&mapping->i_mmap_mutex);
        vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                /* Do not unmap the current VMA */
                if (iter_vma == vma)
@@ -2300,7 +2334,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                                address, address + huge_page_size(h),
                                page);
        }
-       spin_unlock(&mapping->i_mmap_lock);
+       mutex_unlock(&mapping->i_mmap_mutex);
 
        return 1;
 }
@@ -2380,10 +2414,14 @@ retry_avoidcopy:
         * When the original hugepage is shared one, it does not have
         * anon_vma prepared.
         */
-       if (unlikely(anon_vma_prepare(vma)))
+       if (unlikely(anon_vma_prepare(vma))) {
+               /* Caller expects lock to be held */
+               spin_lock(&mm->page_table_lock);
                return VM_FAULT_OOM;
+       }
 
-       copy_huge_page(new_page, old_page, address, vma);
+       copy_user_huge_page(new_page, old_page, address, vma,
+                           pages_per_huge_page(h));
        __SetPageUptodate(new_page);
 
        /*
@@ -2460,7 +2498,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
        /*
         * Currently, we are forced to kill the process in the event the
         * original mapper has unmapped pages from the child due to a failed
-        * COW. Warn that such a situation has occured as it may not be obvious
+        * COW. Warn that such a situation has occurred as it may not be obvious
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                printk(KERN_WARNING
@@ -2487,7 +2525,7 @@ retry:
                        ret = -PTR_ERR(page);
                        goto out;
                }
-               clear_huge_page(page, address, huge_page_size(h));
+               clear_huge_page(page, address, pages_per_huge_page(h));
                __SetPageUptodate(page);
 
                if (vma->vm_flags & VM_MAYSHARE) {
@@ -2515,22 +2553,20 @@ retry:
                        hugepage_add_new_anon_rmap(page, vma, address);
                }
        } else {
+               /*
+                * If memory error occurs between mmap() and fault, some process
+                * don't have hwpoisoned swap entry for errored virtual address.
+                * So we need to block hugepage fault by PG_hwpoison bit check.
+                */
+               if (unlikely(PageHWPoison(page))) {
+                       ret = VM_FAULT_HWPOISON |
+                             VM_FAULT_SET_HINDEX(h - hstates);
+                       goto backout_unlocked;
+               }
                page_dup_rmap(page);
        }
 
        /*
-        * Since memory error handler replaces pte into hwpoison swap entry
-        * at the time of error handling, a process which reserved but not have
-        * the mapping to the error hugepage does not have hwpoison swap entry.
-        * So we need to block accesses from such a process by checking
-        * PG_hwpoison bit here.
-        */
-       if (unlikely(PageHWPoison(page))) {
-               ret = VM_FAULT_HWPOISON;
-               goto backout_unlocked;
-       }
-
-       /*
         * If we are going to COW a private mapping later, we examine the
         * pending reservations for this page now. This will ensure that
         * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2623,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        ptep = huge_pte_offset(mm, address);
        if (ptep) {
                entry = huge_ptep_get(ptep);
-               if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-                       return VM_FAULT_HWPOISON;
+               if (unlikely(is_hugetlb_entry_migration(entry))) {
+                       migration_entry_wait(mm, (pmd_t *)ptep, address);
+                       return 0;
+               } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+                       return VM_FAULT_HWPOISON_LARGE |
+                              VM_FAULT_SET_HINDEX(h - hstates);
        }
 
        ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2665,7 +2705,8 @@ out_page_table_lock:
                unlock_page(pagecache_page);
                put_page(pagecache_page);
        }
-       unlock_page(page);
+       if (page != pagecache_page)
+               unlock_page(page);
 
 out_mutex:
        mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2777,7 +2818,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
        BUG_ON(address >= end);
        flush_cache_range(vma, address, end);
 
-       spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+       mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
        spin_lock(&mm->page_table_lock);
        for (; address < end; address += huge_page_size(h)) {
                ptep = huge_pte_offset(mm, address);
@@ -2792,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
                }
        }
        spin_unlock(&mm->page_table_lock);
-       spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+       mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 
        flush_tlb_range(vma, start, end);
 }
@@ -2800,7 +2841,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 int hugetlb_reserve_pages(struct inode *inode,
                                        long from, long to,
                                        struct vm_area_struct *vma,
-                                       int acctflag)
+                                       vm_flags_t vm_flags)
 {
        long ret, chg;
        struct hstate *h = hstate_inode(inode);
@@ -2810,7 +2851,7 @@ int hugetlb_reserve_pages(struct inode *inode,
         * attempt will be made for VM_NORESERVE to allocate a page
         * and filesystem quota without using reserves
         */
-       if (acctflag & VM_NORESERVE)
+       if (vm_flags & VM_NORESERVE)
                return 0;
 
        /*
@@ -2878,18 +2919,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
        hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+       struct page *page;
+       struct page *tmp;
+       struct hstate *h = page_hstate(hpage);
+       int nid = page_to_nid(hpage);
+
+       list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+               if (page == hpage)
+                       return 1;
+       return 0;
+}
+
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
  */
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
 {
        struct hstate *h = page_hstate(hpage);
        int nid = page_to_nid(hpage);
+       int ret = -EBUSY;
 
        spin_lock(&hugetlb_lock);
-       list_del(&hpage->lru);
-       h->free_huge_pages--;
-       h->free_huge_pages_node[nid]--;
+       if (is_hugepage_on_freelist(hpage)) {
+               list_del(&hpage->lru);
+               set_page_refcounted(hpage);
+               h->free_huge_pages--;
+               h->free_huge_pages_node[nid]--;
+               ret = 0;
+       }
        spin_unlock(&hugetlb_lock);
+       return ret;
 }
+#endif
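dequeue_hwpoisoned_huge_page() now reports whether the poisoned hugepage was actually sitting on a free list: 0 means it was removed (and given a reference via set_page_refcounted()), -EBUSY means it is in use and must be handled by another path. A minimal sketch of a caller on the memory-failure side, assuming the declarations from <linux/hugetlb.h> and <linux/pagemap.h>; the function name is illustrative, not the actual memory-failure code:

    /*
     * Illustrative only. Per the comment above, the head page must be
     * locked by the caller of dequeue_hwpoisoned_huge_page().
     */
    static int try_isolate_free_huge_page(struct page *hpage)
    {
            int ret;

            lock_page(hpage);
            ret = dequeue_hwpoisoned_huge_page(hpage);
            unlock_page(hpage);

            return ret;     /* 0: isolated from free list, -EBUSY: in use */
    }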