aboutsummaryrefslogtreecommitdiffstats
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- mm/hugetlb.c 201
1 files changed, 183 insertions, 18 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0af500db3632..a2d29b84501f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,9 @@ static int hugetlb_next_nid;
40 */ 40 */
41static DEFINE_SPINLOCK(hugetlb_lock); 41static DEFINE_SPINLOCK(hugetlb_lock);
42 42
43#define HPAGE_RESV_OWNER (1UL << (BITS_PER_LONG - 1))
44#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
45#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
43/* 46/*
44 * These helpers are used to track how many pages are reserved for 47 * These helpers are used to track how many pages are reserved for
45 * faults in a MAP_PRIVATE mapping. Only the process that called mmap() 48 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
@@ -54,17 +57,32 @@ static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
54{ 57{
55 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 58 VM_BUG_ON(!is_vm_hugetlb_page(vma));
56 if (!(vma->vm_flags & VM_SHARED)) 59 if (!(vma->vm_flags & VM_SHARED))
57 return (unsigned long)vma->vm_private_data; 60 return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK;
58 return 0; 61 return 0;
59} 62}
60 63
61static void set_vma_resv_huge_pages(struct vm_area_struct *vma, 64static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
62 unsigned long reserve) 65 unsigned long reserve)
63{ 66{
67 unsigned long flags;
64 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 68 VM_BUG_ON(!is_vm_hugetlb_page(vma));
65 VM_BUG_ON(vma->vm_flags & VM_SHARED); 69 VM_BUG_ON(vma->vm_flags & VM_SHARED);
66 70
67 vma->vm_private_data = (void *)reserve; 71 flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK;
72 vma->vm_private_data = (void *)(reserve | flags);
73}
74
75static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
76{
77 unsigned long reserveflags = (unsigned long)vma->vm_private_data;
78 VM_BUG_ON(!is_vm_hugetlb_page(vma));
79 vma->vm_private_data = (void *)(reserveflags | flags);
80}
81
82static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
83{
84 VM_BUG_ON(!is_vm_hugetlb_page(vma));
85 return ((unsigned long)vma->vm_private_data & flag) != 0;
68} 86}
69 87
70/* Decrement the reserved pages in the hugepage pool by one */ 88/* Decrement the reserved pages in the hugepage pool by one */
@@ -78,14 +96,18 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
78 * Only the process that called mmap() has reserves for 96 * Only the process that called mmap() has reserves for
79 * private mappings. 97 * private mappings.
80 */ 98 */
81 if (vma_resv_huge_pages(vma)) { 99 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
100 unsigned long flags, reserve;
82 resv_huge_pages--; 101 resv_huge_pages--;
102 flags = (unsigned long)vma->vm_private_data &
103 HPAGE_RESV_MASK;
83 reserve = (unsigned long)vma->vm_private_data - 1; 104 reserve = (unsigned long)vma->vm_private_data - 1;
84 vma->vm_private_data = (void *)reserve; 105 vma->vm_private_data = (void *)(reserve | flags);
85 } 106 }
86 } 107 }
87} 108}
88 109
110/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
89void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 111void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
90{ 112{
91 VM_BUG_ON(!is_vm_hugetlb_page(vma)); 113 VM_BUG_ON(!is_vm_hugetlb_page(vma));
@@ -153,7 +175,7 @@ static struct page *dequeue_huge_page(void)
153} 175}
154 176
155static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 177static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
156 unsigned long address) 178 unsigned long address, int avoid_reserve)
157{ 179{
158 int nid; 180 int nid;
159 struct page *page = NULL; 181 struct page *page = NULL;
@@ -173,6 +195,10 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
173 free_huge_pages - resv_huge_pages == 0) 195 free_huge_pages - resv_huge_pages == 0)
174 return NULL; 196 return NULL;
175 197
198 /* If reserves cannot be used, ensure enough pages are in the pool */
199 if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
200 return NULL;
201
176 for_each_zone_zonelist_nodemask(zone, z, zonelist, 202 for_each_zone_zonelist_nodemask(zone, z, zonelist,
177 MAX_NR_ZONES - 1, nodemask) { 203 MAX_NR_ZONES - 1, nodemask) {
178 nid = zone_to_nid(zone); 204 nid = zone_to_nid(zone);
@@ -183,7 +209,9 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
183 list_del(&page->lru); 209 list_del(&page->lru);
184 free_huge_pages--; 210 free_huge_pages--;
185 free_huge_pages_node[nid]--; 211 free_huge_pages_node[nid]--;
186 decrement_hugepage_resv_vma(vma); 212
213 if (!avoid_reserve)
214 decrement_hugepage_resv_vma(vma);
187 215
188 break; 216 break;
189 } 217 }
@@ -534,7 +562,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
534} 562}
535 563
536static struct page *alloc_huge_page(struct vm_area_struct *vma, 564static struct page *alloc_huge_page(struct vm_area_struct *vma,
537 unsigned long addr) 565 unsigned long addr, int avoid_reserve)
538{ 566{
539 struct page *page; 567 struct page *page;
540 struct address_space *mapping = vma->vm_file->f_mapping; 568 struct address_space *mapping = vma->vm_file->f_mapping;
@@ -546,14 +574,15 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
546 * will not have accounted against quota. Check that the quota can be 574 * will not have accounted against quota. Check that the quota can be
547 * made before satisfying the allocation 575 * made before satisfying the allocation
548 */ 576 */
549 if (!vma_has_private_reserves(vma)) { 577 if (!(vma->vm_flags & VM_SHARED) &&
578 !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
550 chg = 1; 579 chg = 1;
551 if (hugetlb_get_quota(inode->i_mapping, chg)) 580 if (hugetlb_get_quota(inode->i_mapping, chg))
552 return ERR_PTR(-ENOSPC); 581 return ERR_PTR(-ENOSPC);
553 } 582 }
554 583
555 spin_lock(&hugetlb_lock); 584 spin_lock(&hugetlb_lock);
556 page = dequeue_huge_page_vma(vma, addr); 585 page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
557 spin_unlock(&hugetlb_lock); 586 spin_unlock(&hugetlb_lock);
558 587
559 if (!page) { 588 if (!page) {
@@ -909,7 +938,7 @@ nomem:
909} 938}
910 939
911void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 940void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
912 unsigned long end) 941 unsigned long end, struct page *ref_page)
913{ 942{
914 struct mm_struct *mm = vma->vm_mm; 943 struct mm_struct *mm = vma->vm_mm;
915 unsigned long address; 944 unsigned long address;
@@ -937,6 +966,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
937 if (huge_pmd_unshare(mm, &address, ptep)) 966 if (huge_pmd_unshare(mm, &address, ptep))
938 continue; 967 continue;
939 968
969 /*
970 * If a reference page is supplied, it is because a specific
971 * page is being unmapped, not a range. Ensure the page we
972 * are about to unmap is the actual page of interest.
973 */
974 if (ref_page) {
975 pte = huge_ptep_get(ptep);
976 if (huge_pte_none(pte))
977 continue;
978 page = pte_page(pte);
979 if (page != ref_page)
980 continue;
981
982 /*
983 * Mark the VMA as having unmapped its page so that
984 * future faults in this VMA will fail rather than
985 * looking like data was lost
986 */
987 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
988 }
989
940 pte = huge_ptep_get_and_clear(mm, address, ptep); 990 pte = huge_ptep_get_and_clear(mm, address, ptep);
941 if (huge_pte_none(pte)) 991 if (huge_pte_none(pte))
942 continue; 992 continue;
@@ -955,7 +1005,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
955} 1005}
956 1006
957void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1007void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
958 unsigned long end) 1008 unsigned long end, struct page *ref_page)
959{ 1009{
960 /* 1010 /*
961 * It is undesirable to test vma->vm_file as it should be non-null 1011 * It is undesirable to test vma->vm_file as it should be non-null
@@ -967,19 +1017,68 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
967 */ 1017 */
968 if (vma->vm_file) { 1018 if (vma->vm_file) {
969 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1019 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
970 __unmap_hugepage_range(vma, start, end); 1020 __unmap_hugepage_range(vma, start, end, ref_page);
971 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1021 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
972 } 1022 }
973} 1023}
974 1024
1025/*
1026 * This is called when the original mapper is failing to COW a MAP_PRIVATE
1027 * mappping it owns the reserve page for. The intention is to unmap the page
1028 * from other VMAs and let the children be SIGKILLed if they are faulting the
1029 * same region.
1030 */
1031int unmap_ref_private(struct mm_struct *mm,
1032 struct vm_area_struct *vma,
1033 struct page *page,
1034 unsigned long address)
1035{
1036 struct vm_area_struct *iter_vma;
1037 struct address_space *mapping;
1038 struct prio_tree_iter iter;
1039 pgoff_t pgoff;
1040
1041 /*
1042 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
1043 * from page cache lookup which is in HPAGE_SIZE units.
1044 */
1045 address = address & huge_page_mask(hstate_vma(vma));
1046 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
1047 + (vma->vm_pgoff >> PAGE_SHIFT);
1048 mapping = (struct address_space *)page_private(page);
1049
1050 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1051 /* Do not unmap the current VMA */
1052 if (iter_vma == vma)
1053 continue;
1054
1055 /*
1056 * Unmap the page from other VMAs without their own reserves.
1057 * They get marked to be SIGKILLed if they fault in these
1058 * areas. This is because a future no-page fault on this VMA
1059 * could insert a zeroed page instead of the data existing
1060 * from the time of fork. This would look like data corruption
1061 */
1062 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1063 unmap_hugepage_range(iter_vma,
1064 address, address + HPAGE_SIZE,
1065 page);
1066 }
1067
1068 return 1;
1069}
1070
975static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 1071static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
976 unsigned long address, pte_t *ptep, pte_t pte) 1072 unsigned long address, pte_t *ptep, pte_t pte,
1073 struct page *pagecache_page)
977{ 1074{
978 struct page *old_page, *new_page; 1075 struct page *old_page, *new_page;
979 int avoidcopy; 1076 int avoidcopy;
1077 int outside_reserve = 0;
980 1078
981 old_page = pte_page(pte); 1079 old_page = pte_page(pte);
982 1080
1081retry_avoidcopy:
983 /* If no-one else is actually using this page, avoid the copy 1082 /* If no-one else is actually using this page, avoid the copy
984 * and just make the page writable */ 1083 * and just make the page writable */
985 avoidcopy = (page_count(old_page) == 1); 1084 avoidcopy = (page_count(old_page) == 1);
@@ -988,11 +1087,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
988 return 0; 1087 return 0;
989 } 1088 }
990 1089
1090 /*
1091 * If the process that created a MAP_PRIVATE mapping is about to
1092 * perform a COW due to a shared page count, attempt to satisfy
1093 * the allocation without using the existing reserves. The pagecache
1094 * page is used to determine if the reserve at this address was
1095 * consumed or not. If reserves were used, a partial faulted mapping
1096 * at the time of fork() could consume its reserves on COW instead
1097 * of the full address range.
1098 */
1099 if (!(vma->vm_flags & VM_SHARED) &&
1100 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1101 old_page != pagecache_page)
1102 outside_reserve = 1;
1103
991 page_cache_get(old_page); 1104 page_cache_get(old_page);
992 new_page = alloc_huge_page(vma, address); 1105 new_page = alloc_huge_page(vma, address, outside_reserve);
993 1106
994 if (IS_ERR(new_page)) { 1107 if (IS_ERR(new_page)) {
995 page_cache_release(old_page); 1108 page_cache_release(old_page);
1109
1110 /*
1111 * If a process owning a MAP_PRIVATE mapping fails to COW,
1112 * it is due to references held by a child and an insufficient
1113 * huge page pool. To guarantee the original mappers
1114 * reliability, unmap the page from child processes. The child
1115 * may get SIGKILLed if it later faults.
1116 */
1117 if (outside_reserve) {
1118 BUG_ON(huge_pte_none(pte));
1119 if (unmap_ref_private(mm, vma, old_page, address)) {
1120 BUG_ON(page_count(old_page) != 1);
1121 BUG_ON(huge_pte_none(pte));
1122 goto retry_avoidcopy;
1123 }
1124 WARN_ON_ONCE(1);
1125 }
1126
996 return -PTR_ERR(new_page); 1127 return -PTR_ERR(new_page);
997 } 1128 }
998 1129
@@ -1015,6 +1146,20 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
1015 return 0; 1146 return 0;
1016} 1147}
1017 1148
1149/* Return the pagecache page at a given address within a VMA */
1150static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
1151 unsigned long address)
1152{
1153 struct address_space *mapping;
1154 unsigned long idx;
1155
1156 mapping = vma->vm_file->f_mapping;
1157 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
1158 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
1159
1160 return find_lock_page(mapping, idx);
1161}
1162
1018static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1163static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1019 unsigned long address, pte_t *ptep, int write_access) 1164 unsigned long address, pte_t *ptep, int write_access)
1020{ 1165{
@@ -1025,6 +1170,18 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1025 struct address_space *mapping; 1170 struct address_space *mapping;
1026 pte_t new_pte; 1171 pte_t new_pte;
1027 1172
1173 /*
1174 * Currently, we are forced to kill the process in the event the
1175 * original mapper has unmapped pages from the child due to a failed
1176 * COW. Warn that such a situation has occurred as it may not be obvious
1177 */
1178 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1179 printk(KERN_WARNING
1180 "PID %d killed due to inadequate hugepage pool\n",
1181 current->pid);
1182 return ret;
1183 }
1184
1028 mapping = vma->vm_file->f_mapping; 1185 mapping = vma->vm_file->f_mapping;
1029 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 1186 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
1030 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); 1187 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
@@ -1039,7 +1196,7 @@ retry:
1039 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1196 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
1040 if (idx >= size) 1197 if (idx >= size)
1041 goto out; 1198 goto out;
1042 page = alloc_huge_page(vma, address); 1199 page = alloc_huge_page(vma, address, 0);
1043 if (IS_ERR(page)) { 1200 if (IS_ERR(page)) {
1044 ret = -PTR_ERR(page); 1201 ret = -PTR_ERR(page);
1045 goto out; 1202 goto out;
@@ -1081,7 +1238,7 @@ retry:
1081 1238
1082 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1239 if (write_access && !(vma->vm_flags & VM_SHARED)) {
1083 /* Optimization, do the COW without a second fault */ 1240 /* Optimization, do the COW without a second fault */
1084 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 1241 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
1085 } 1242 }
1086 1243
1087 spin_unlock(&mm->page_table_lock); 1244 spin_unlock(&mm->page_table_lock);
@@ -1126,8 +1283,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1126 spin_lock(&mm->page_table_lock); 1283 spin_lock(&mm->page_table_lock);
1127 /* Check for a racing update before calling hugetlb_cow */ 1284 /* Check for a racing update before calling hugetlb_cow */
1128 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 1285 if (likely(pte_same(entry, huge_ptep_get(ptep))))
1129 if (write_access && !pte_write(entry)) 1286 if (write_access && !pte_write(entry)) {
1130 ret = hugetlb_cow(mm, vma, address, ptep, entry); 1287 struct page *page;
1288 page = hugetlbfs_pagecache_page(vma, address);
1289 ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
1290 if (page) {
1291 unlock_page(page);
1292 put_page(page);
1293 }
1294 }
1131 spin_unlock(&mm->page_table_lock); 1295 spin_unlock(&mm->page_table_lock);
1132 mutex_unlock(&hugetlb_instantiation_mutex); 1296 mutex_unlock(&hugetlb_instantiation_mutex);
1133 1297
@@ -1371,6 +1535,7 @@ int hugetlb_reserve_pages(struct inode *inode,
1371 else { 1535 else {
1372 chg = to - from; 1536 chg = to - from;
1373 set_vma_resv_huge_pages(vma, chg); 1537 set_vma_resv_huge_pages(vma, chg);
1538 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1374 } 1539 }
1375 1540
1376 if (chg < 0) 1541 if (chg < 0)