Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  338
1 file changed, 201 insertions(+), 137 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..bfcf153bc829 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t)
 	if (rg->from > t)
 		return chg;
 
-	/* We overlap with this area, if it extends futher than
+	/* We overlap with this area, if it extends further than
 	 * us then we must extend ourselves. Account for its
 	 * existing reservation. */
 	if (rg->to > t) {
@@ -394,67 +394,37 @@ static int vma_has_reserves(struct vm_area_struct *vma)
 	return 0;
 }
 
-static void clear_gigantic_page(struct page *page,
-			unsigned long addr, unsigned long sz)
-{
-	int i;
-	struct page *p = page;
-
-	might_sleep();
-	for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
-		cond_resched();
-		clear_user_highpage(p, addr + i * PAGE_SIZE);
-	}
-}
-static void clear_huge_page(struct page *page,
-			unsigned long addr, unsigned long sz)
-{
-	int i;
-
-	if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
-		clear_gigantic_page(page, addr, sz);
-		return;
-	}
-
-	might_sleep();
-	for (i = 0; i < sz/PAGE_SIZE; i++) {
-		cond_resched();
-		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
-	}
-}
-
-static void copy_gigantic_page(struct page *dst, struct page *src,
-			   unsigned long addr, struct vm_area_struct *vma)
+static void copy_gigantic_page(struct page *dst, struct page *src)
 {
 	int i;
-	struct hstate *h = hstate_vma(vma);
+	struct hstate *h = page_hstate(src);
 	struct page *dst_base = dst;
 	struct page *src_base = src;
-	might_sleep();
+
 	for (i = 0; i < pages_per_huge_page(h); ) {
 		cond_resched();
-		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+		copy_highpage(dst, src);
 
 		i++;
 		dst = mem_map_next(dst, dst_base, i);
 		src = mem_map_next(src, src_base, i);
 	}
 }
-static void copy_huge_page(struct page *dst, struct page *src,
-		       unsigned long addr, struct vm_area_struct *vma)
+
+void copy_huge_page(struct page *dst, struct page *src)
 {
 	int i;
-	struct hstate *h = hstate_vma(vma);
+	struct hstate *h = page_hstate(src);
 
 	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-		copy_gigantic_page(dst, src, addr, vma);
+		copy_gigantic_page(dst, src);
 		return;
 	}
 
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		cond_resched();
-		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+		copy_highpage(dst + i, src + i);
 	}
 }
 
@@ -466,11 +436,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	h->free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	if (list_empty(&h->hugepage_freelists[nid]))
+		return NULL;
+	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+	list_del(&page->lru);
+	set_page_refcounted(page);
+	h->free_huge_pages--;
+	h->free_huge_pages_node[nid]--;
+	return page;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
-	int nid;
 	struct page *page = NULL;
 	struct mempolicy *mpol;
 	nodemask_t *nodemask;
@@ -492,23 +475,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 
 	/* If reserves cannot be used, ensure enough pages are in the pool */
 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
-		goto err;;
+		goto err;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
-		nid = zone_to_nid(zone);
-		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-		    !list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-
-			if (!avoid_reserve)
-				decrement_hugepage_resv_vma(h, vma);
-
-			break;
+		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+			page = dequeue_huge_page_node(h, zone_to_nid(zone));
+			if (page) {
+				if (!avoid_reserve)
+					decrement_hugepage_resv_vma(h, vma);
+				break;
+			}
 		}
 	}
 err:
@@ -770,11 +747,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 	return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 {
 	struct page *page;
-	unsigned int nid;
+	unsigned int r_nid;
 
 	if (h->order >= MAX_ORDER)
 		return NULL;
@@ -812,9 +788,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 	}
 	spin_unlock(&hugetlb_lock);
 
-	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
-					__GFP_REPEAT|__GFP_NOWARN,
-					huge_page_order(h));
+	if (nid == NUMA_NO_NODE)
+		page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+				   __GFP_REPEAT|__GFP_NOWARN,
+				   huge_page_order(h));
+	else
+		page = alloc_pages_exact_node(nid,
+			htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
 	if (page && arch_prepare_hugepage(page)) {
 		__free_pages(page, huge_page_order(h));
@@ -823,19 +804,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
-		/*
-		 * This page is now managed by the hugetlb allocator and has
-		 * no users -- drop the buddy allocator's reference.
-		 */
-		put_page_testzero(page);
-		VM_BUG_ON(page_count(page));
-		nid = page_to_nid(page);
+		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
 		/*
 		 * We incremented the global counters already
 		 */
-		h->nr_huge_pages_node[nid]++;
-		h->surplus_huge_pages_node[nid]++;
+		h->nr_huge_pages_node[r_nid]++;
+		h->surplus_huge_pages_node[r_nid]++;
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	} else {
 		h->nr_huge_pages--;
@@ -848,7 +823,26 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 }
 
 /*
- * Increase the hugetlb pool such that it can accomodate a reservation
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	page = dequeue_huge_page_node(h, nid);
+	spin_unlock(&hugetlb_lock);
+
+	if (!page)
+		page = alloc_buddy_huge_page(h, nid);
+
+	return page;
+}
+
+/*
+ * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
  */
 static int gather_surplus_pages(struct hstate *h, int delta)
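A minimal sketch of how the new alloc_huge_page_node() above might be used by a caller that only cares about physical placement (the soft-offline case mentioned in the added comment). The surrounding function name, soft_offline_grab_replacement(), is hypothetical and only illustrates the calling convention; it is not part of this patch.

#include <linux/hugetlb.h>
#include <linux/mm.h>

static struct page *soft_offline_grab_replacement(struct page *hpage)
{
	struct hstate *h = page_hstate(hpage);
	int nid = page_to_nid(hpage);

	/*
	 * Try the free list of the node that the errored page sits on;
	 * alloc_huge_page_node() falls back to alloc_buddy_huge_page()
	 * for that node when the free list is empty.
	 */
	return alloc_huge_page_node(h, nid);
}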
@@ -871,17 +865,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = alloc_buddy_huge_page(h, NULL, 0);
-		if (!page) {
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+		if (!page)
 			/*
 			 * We were not able to allocate enough pages to
 			 * satisfy the entire reservation so we free what
 			 * we've allocated so far.
 			 */
-			spin_lock(&hugetlb_lock);
-			needed = 0;
 			goto free;
-		}
 
 		list_add(&page->lru, &surplus_list);
 	}
@@ -899,7 +890,7 @@ retry:
 
 	/*
 	 * The surplus_list now contains _at_least_ the number of extra pages
-	 * needed to accomodate the reservation. Add the appropriate number
+	 * needed to accommodate the reservation. Add the appropriate number
 	 * of pages to the hugetlb pool and free the extras back to the buddy
 	 * allocator. Commit the entire reservation here to prevent another
 	 * process from stealing the pages as they are added to the pool but
@@ -908,31 +899,31 @@ retry:
 	needed += allocated;
 	h->resv_huge_pages += delta;
 	ret = 0;
-free:
+
+	spin_unlock(&hugetlb_lock);
 	/* Free the needed pages to the hugetlb pool */
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
 		list_del(&page->lru);
+		/*
+		 * This page is now managed by the hugetlb allocator and has
+		 * no users -- drop the buddy allocator's reference.
+		 */
+		put_page_testzero(page);
+		VM_BUG_ON(page_count(page));
 		enqueue_huge_page(h, page);
 	}
 
 	/* Free unnecessary surplus pages to the buddy allocator */
+free:
 	if (!list_empty(&surplus_list)) {
-		spin_unlock(&hugetlb_lock);
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 			list_del(&page->lru);
-			/*
-			 * The page has a reference count of zero already, so
-			 * call free_huge_page directly instead of using
-			 * put_page. This must be done with hugetlb_lock
-			 * unlocked which is safe because free_huge_page takes
-			 * hugetlb_lock before deciding how to free the page.
-			 */
-			free_huge_page(page);
+			put_page(page);
 		}
-		spin_lock(&hugetlb_lock);
 	}
+	spin_lock(&hugetlb_lock);
 
 	return ret;
 }
@@ -1042,24 +1033,23 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
-		return ERR_PTR(chg);
+		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
 		if (hugetlb_get_quota(inode->i_mapping, chg))
-			return ERR_PTR(-ENOSPC);
+			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
-		page = alloc_buddy_huge_page(h, vma, addr);
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
 			hugetlb_put_quota(inode->i_mapping, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_refcounted(page);
 	set_page_private(page, (unsigned long) mapping);
 
 	vma_commit_reservation(h, vma, addr);
@@ -1121,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
 		WARN_ON(page_count(page) != 1);
 		prep_compound_huge_page(page, h->order);
 		prep_new_huge_page(h, page, page_to_nid(page));
+		/*
+		 * If we had gigantic hugepages allocated at boot time, we need
+		 * to restore the 'stolen' pages to totalram_pages in order to
+		 * fix confusing memory reports from free(1) and another
+		 * side-effects, like CommitLimit going negative.
+		 */
+		if (h->order > (MAX_ORDER - 1))
+			totalram_pages += 1 << h->order;
 	}
 }
 
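As a worked example of the totalram_pages adjustment added above (assuming 4 KiB base pages and MAX_ORDER = 11, as on x86-64): a 1 GiB boot-time gigantic page has order 18, so the check h->order > (MAX_ORDER - 1) is true and each such page credits 1 << 18 = 262144 base pages back to totalram_pages.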
@@ -1373,6 +1371,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
 
 	return sprintf(buf, "%lu\n", nr_huge_pages);
 }
+
 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
 			struct kobject *kobj, struct kobj_attribute *attr,
 			const char *buf, size_t len)
@@ -1385,9 +1384,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
 
 	err = strict_strtoul(buf, 10, &count);
 	if (err)
-		return 0;
+		goto out;
 
 	h = kobj_to_hstate(kobj, &nid);
+	if (h->order >= MAX_ORDER) {
+		err = -EINVAL;
+		goto out;
+	}
+
 	if (nid == NUMA_NO_NODE) {
 		/*
 		 * global hstate attribute
@@ -1413,6 +1417,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
 	NODEMASK_FREE(nodes_allowed);
 
 	return len;
+out:
+	NODEMASK_FREE(nodes_allowed);
+	return err;
 }
 
 static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1455,6 +1462,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
 	struct hstate *h = kobj_to_hstate(kobj, NULL);
 	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
 }
+
 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
 		struct kobj_attribute *attr, const char *buf, size_t count)
 {
@@ -1462,9 +1470,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
 	unsigned long input;
 	struct hstate *h = kobj_to_hstate(kobj, NULL);
 
+	if (h->order >= MAX_ORDER)
+		return -EINVAL;
+
 	err = strict_strtoul(buf, 10, &input);
 	if (err)
-		return 0;
+		return err;
 
 	spin_lock(&hugetlb_lock);
 	h->nr_overcommit_huge_pages = input;
@@ -1867,13 +1878,18 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
 {
 	struct hstate *h = &default_hstate;
 	unsigned long tmp;
+	int ret;
+
+	tmp = h->max_huge_pages;
 
-	if (!write)
-		tmp = h->max_huge_pages;
+	if (write && h->order >= MAX_ORDER)
+		return -EINVAL;
 
 	table->data = &tmp;
 	table->maxlen = sizeof(unsigned long);
-	proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		goto out;
 
 	if (write) {
 		NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1888,8 +1904,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
 		if (nodes_allowed != &node_states[N_HIGH_MEMORY])
 			NODEMASK_FREE(nodes_allowed);
 	}
-
-	return 0;
+out:
+	return ret;
 }
 
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1927,21 +1943,26 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 {
 	struct hstate *h = &default_hstate;
 	unsigned long tmp;
+	int ret;
 
-	if (!write)
-		tmp = h->nr_overcommit_huge_pages;
+	tmp = h->nr_overcommit_huge_pages;
+
+	if (write && h->order >= MAX_ORDER)
+		return -EINVAL;
 
 	table->data = &tmp;
 	table->maxlen = sizeof(unsigned long);
-	proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		goto out;
 
 	if (write) {
 		spin_lock(&hugetlb_lock);
 		h->nr_overcommit_huge_pages = tmp;
 		spin_unlock(&hugetlb_lock);
 	}
-
-	return 0;
+out:
+	return ret;
 }
 
 #endif /* CONFIG_SYSCTL */
@@ -2030,7 +2051,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 	 * This new VMA should share its siblings reservation map if present.
 	 * The VMA will only ever have a valid reservation map pointer where
 	 * it is being copied for another still existing VMA. As that VMA
-	 * has a reference to the reservation map it cannot dissappear until
+	 * has a reference to the reservation map it cannot disappear until
 	 * after this open call completes. It is therefore safe to take a
 	 * new reference here without additional locking.
 	 */
@@ -2153,6 +2174,19 @@ nomem:
 	return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp)) {
+		return 1;
+	} else
+		return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
@@ -2179,7 +2213,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	unsigned long sz = huge_page_size(h);
 
 	/*
-	 * A page gathering list, protected by per file i_mmap_lock. The
+	 * A page gathering list, protected by per file i_mmap_mutex. The
 	 * lock is used to avoid list corruption from multiple unmapping
 	 * of the same page since we are using page->lru.
 	 */
@@ -2248,9 +2282,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
-	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
 	__unmap_hugepage_range(vma, start, end, ref_page);
-	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 /*
@@ -2282,7 +2316,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * this mapping should be shared between all the VMAs,
 	 * __unmap_hugepage_range() is called as the lock is already held
 	 */
-	spin_lock(&mapping->i_mmap_lock);
+	mutex_lock(&mapping->i_mmap_mutex);
 	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
@@ -2300,7 +2334,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 				address, address + huge_page_size(h),
 				page);
 	}
-	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->i_mmap_mutex);
 
 	return 1;
 }
@@ -2380,10 +2414,14 @@ retry_avoidcopy:
 	 * When the original hugepage is shared one, it does not have
 	 * anon_vma prepared.
 	 */
-	if (unlikely(anon_vma_prepare(vma)))
+	if (unlikely(anon_vma_prepare(vma))) {
+		/* Caller expects lock to be held */
+		spin_lock(&mm->page_table_lock);
 		return VM_FAULT_OOM;
+	}
 
-	copy_huge_page(new_page, old_page, address, vma);
+	copy_user_huge_page(new_page, old_page, address, vma,
+			    pages_per_huge_page(h));
 	__SetPageUptodate(new_page);
 
 	/*
@@ -2460,7 +2498,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * Currently, we are forced to kill the process in the event the
 	 * original mapper has unmapped pages from the child due to a failed
-	 * COW. Warn that such a situation has occured as it may not be obvious
+	 * COW. Warn that such a situation has occurred as it may not be obvious
 	 */
 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
 		printk(KERN_WARNING
@@ -2487,7 +2525,7 @@ retry:
 			ret = -PTR_ERR(page);
 			goto out;
 		}
-		clear_huge_page(page, address, huge_page_size(h));
+		clear_huge_page(page, address, pages_per_huge_page(h));
 		__SetPageUptodate(page);
 
 		if (vma->vm_flags & VM_MAYSHARE) {
@@ -2515,22 +2553,20 @@ retry:
 			hugepage_add_new_anon_rmap(page, vma, address);
 		}
 	} else {
+		/*
+		 * If memory error occurs between mmap() and fault, some process
+		 * don't have hwpoisoned swap entry for errored virtual address.
+		 * So we need to block hugepage fault by PG_hwpoison bit check.
+		 */
+		if (unlikely(PageHWPoison(page))) {
+			ret = VM_FAULT_HWPOISON |
+				VM_FAULT_SET_HINDEX(h - hstates);
+			goto backout_unlocked;
+		}
 		page_dup_rmap(page);
 	}
 
-	/*
-	 * Since memory error handler replaces pte into hwpoison swap entry
-	 * at the time of error handling, a process which reserved but not have
-	 * the mapping to the error hugepage does not have hwpoison swap entry.
-	 * So we need to block accesses from such a process by checking
-	 * PG_hwpoison bit here.
-	 */
-	if (unlikely(PageHWPoison(page))) {
-		ret = VM_FAULT_HWPOISON;
-		goto backout_unlocked;
-	}
-
 	/*
 	 * If we are going to COW a private mapping later, we examine the
 	 * pending reservations for this page now. This will ensure that
 	 * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2623,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-			return VM_FAULT_HWPOISON;
+		if (unlikely(is_hugetlb_entry_migration(entry))) {
+			migration_entry_wait(mm, (pmd_t *)ptep, address);
+			return 0;
+		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+			return VM_FAULT_HWPOISON_LARGE |
+				VM_FAULT_SET_HINDEX(h - hstates);
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2665,7 +2705,8 @@ out_page_table_lock:
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
 	}
-	unlock_page(page);
+	if (page != pagecache_page)
+		unlock_page(page);
 
 out_mutex:
 	mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2777,7 +2818,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
-	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
+	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
 	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += huge_page_size(h)) {
 		ptep = huge_pte_offset(mm, address);
@@ -2792,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 
 	flush_tlb_range(vma, start, end);
 }
@@ -2800,7 +2841,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 int hugetlb_reserve_pages(struct inode *inode,
 					long from, long to,
 					struct vm_area_struct *vma,
-					int acctflag)
+					vm_flags_t vm_flags)
 {
 	long ret, chg;
 	struct hstate *h = hstate_inode(inode);
@@ -2810,7 +2851,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 * attempt will be made for VM_NORESERVE to allocate a page
 	 * and filesystem quota without using reserves
 	 */
-	if (acctflag & VM_NORESERVE)
+	if (vm_flags & VM_NORESERVE)
 		return 0;
 
 	/*
@@ -2878,18 +2919,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+	struct page *page;
+	struct page *tmp;
+	struct hstate *h = page_hstate(hpage);
+	int nid = page_to_nid(hpage);
+
+	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+		if (page == hpage)
+			return 1;
+	return 0;
+}
+
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
  */
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
 {
 	struct hstate *h = page_hstate(hpage);
 	int nid = page_to_nid(hpage);
+	int ret = -EBUSY;
 
 	spin_lock(&hugetlb_lock);
-	list_del(&hpage->lru);
-	h->free_huge_pages--;
-	h->free_huge_pages_node[nid]--;
+	if (is_hugepage_on_freelist(hpage)) {
+		list_del(&hpage->lru);
+		set_page_refcounted(hpage);
+		h->free_huge_pages--;
+		h->free_huge_pages_node[nid]--;
+		ret = 0;
+	}
 	spin_unlock(&hugetlb_lock);
+	return ret;
 }
+#endif