Diffstat (limited to 'mm/hugetlb.c')
 -rw-r--r--  mm/hugetlb.c | 201
 1 file changed, 183 insertions, 18 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0af500db3632..a2d29b84501f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,9 @@ static int hugetlb_next_nid;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+#define HPAGE_RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
+#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
 /*
  * These helpers are used to track how many pages are reserved for
  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
@@ -54,17 +57,32 @@ static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
 	if (!(vma->vm_flags & VM_SHARED))
-		return (unsigned long)vma->vm_private_data;
+		return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK;
 	return 0;
 }
 
 static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
 					unsigned long reserve)
 {
+	unsigned long flags;
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
 	VM_BUG_ON(vma->vm_flags & VM_SHARED);
 
-	vma->vm_private_data = (void *)reserve;
+	flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK;
+	vma->vm_private_data = (void *)(reserve | flags);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+{
+	unsigned long reserveflags = (unsigned long)vma->vm_private_data;
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	vma->vm_private_data = (void *)(reserveflags | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+{
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	return ((unsigned long)vma->vm_private_data & flag) != 0;
 }
 
 /* Decrement the reserved pages in the hugepage pool by one */
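The helpers above overload the pointer-sized vma->vm_private_data word: the low bits carry the private reservation count while the top two bits carry HPAGE_RESV_OWNER and HPAGE_RESV_UNMAPPED. A minimal user-space sketch of the same packing scheme follows; it is illustrative only, the standalone global and helper names are made up and are not kernel code:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)
#define RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
#define RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
#define RESV_MASK     (RESV_OWNER | RESV_UNMAPPED)

/* One word plays the role of vma->vm_private_data. */
static unsigned long private_data;

static unsigned long get_reserve(void)      { return private_data & ~RESV_MASK; }
static void set_reserve(unsigned long n)    { private_data = n | (private_data & RESV_MASK); }
static void set_flag(unsigned long flag)    { private_data |= flag; }
static int  flag_is_set(unsigned long flag) { return (private_data & flag) != 0; }

int main(void)
{
	set_reserve(42);                 /* reserve 42 huge pages */
	set_flag(RESV_OWNER);            /* this "process" owns the reserve */
	set_reserve(get_reserve() - 1);  /* consume one page; the flag bits survive */

	printf("reserve=%lu owner=%d unmapped=%d\n",
	       get_reserve(), flag_is_set(RESV_OWNER), flag_is_set(RESV_UNMAPPED));
	return 0;
}

Because the flag bits sit at the top of the word, updating the count has to preserve them, which is what the masking in set_vma_resv_huge_pages() does; the count itself is limited to BITS_PER_LONG - 2 bits, which is ample for a per-VMA huge page count.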
@@ -78,14 +96,18 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
 		 * Only the process that called mmap() has reserves for
 		 * private mappings.
 		 */
-		if (vma_resv_huge_pages(vma)) {
+		if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+			unsigned long flags, reserve;
 			resv_huge_pages--;
+			flags = (unsigned long)vma->vm_private_data &
+							HPAGE_RESV_MASK;
 			reserve = (unsigned long)vma->vm_private_data - 1;
-			vma->vm_private_data = (void *)reserve;
+			vma->vm_private_data = (void *)(reserve | flags);
 		}
 	}
 }
 
+/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
@@ -153,7 +175,7 @@ static struct page *dequeue_huge_page(void)
 }
 
 static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
-				unsigned long address)
+				unsigned long address, int avoid_reserve)
 {
 	int nid;
 	struct page *page = NULL;
@@ -173,6 +195,10 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 			free_huge_pages - resv_huge_pages == 0)
 		return NULL;
 
+	/* If reserves cannot be used, ensure enough pages are in the pool */
+	if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
+		return NULL;
+
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
@@ -183,7 +209,9 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 			list_del(&page->lru);
 			free_huge_pages--;
 			free_huge_pages_node[nid]--;
-			decrement_hugepage_resv_vma(vma);
+
+			if (!avoid_reserve)
+				decrement_hugepage_resv_vma(vma);
 
 			break;
 		}
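The dequeue path above now refuses to hand out a page in two situations: when the VMA has no reserves backing it (the pre-existing check, whose leading condition is only partly visible in this hunk) and when the caller passes avoid_reserve. A rough, self-contained sketch of that accounting decision; the counter values are invented and has_reserve is a stand-in for the paraphrased first condition, not a kernel symbol:

#include <stdio.h>

/* Illustrative stand-ins for the global pool counters. */
static long free_huge_pages = 3;
static long resv_huge_pages = 3;

/*
 * A request that is not backed by a reserve, or that must not consume
 * reserves, only succeeds while unreserved pages remain in the pool.
 */
static int can_dequeue(int has_reserve, int avoid_reserve)
{
	if (!has_reserve && free_huge_pages - resv_huge_pages == 0)
		return 0;
	if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
		return 0;
	return 1;
}

int main(void)
{
	/* Every free page is reserved: only reserve-backed requests succeed. */
	printf("reserve-backed fault:  %d\n", can_dequeue(1, 0)); /* 1 */
	printf("unreserved fault:      %d\n", can_dequeue(0, 0)); /* 0 */
	printf("COW avoiding reserves: %d\n", can_dequeue(1, 1)); /* 0 */
	return 0;
}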
@@ -534,7 +562,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
-				    unsigned long addr)
+				    unsigned long addr, int avoid_reserve)
 {
 	struct page *page;
 	struct address_space *mapping = vma->vm_file->f_mapping;
@@ -546,14 +574,15 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 * will not have accounted against quota. Check that the quota can be
 	 * made before satisfying the allocation
 	 */
-	if (!vma_has_private_reserves(vma)) {
+	if (!(vma->vm_flags & VM_SHARED) &&
+			!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		chg = 1;
 		if (hugetlb_get_quota(inode->i_mapping, chg))
 			return ERR_PTR(-ENOSPC);
 	}
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page_vma(vma, addr);
+	page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
@@ -909,7 +938,7 @@ nomem:
 }
 
 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end)
+			    unsigned long end, struct page *ref_page)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -937,6 +966,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		if (huge_pmd_unshare(mm, &address, ptep))
 			continue;
 
+		/*
+		 * If a reference page is supplied, it is because a specific
+		 * page is being unmapped, not a range. Ensure the page we
+		 * are about to unmap is the actual page of interest.
+		 */
+		if (ref_page) {
+			pte = huge_ptep_get(ptep);
+			if (huge_pte_none(pte))
+				continue;
+			page = pte_page(pte);
+			if (page != ref_page)
+				continue;
+
+			/*
+			 * Mark the VMA as having unmapped its page so that
+			 * future faults in this VMA will fail rather than
+			 * looking like data was lost
+			 */
+			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+		}
+
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (huge_pte_none(pte))
 			continue;
@@ -955,7 +1005,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			  unsigned long end)
+			  unsigned long end, struct page *ref_page)
 {
 	/*
 	 * It is undesirable to test vma->vm_file as it should be non-null
@@ -967,19 +1017,68 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	 */
 	if (vma->vm_file) {
 		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
-		__unmap_hugepage_range(vma, start, end);
+		__unmap_hugepage_range(vma, start, end, ref_page);
 		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 	}
 }
 
+/*
+ * This is called when the original mapper is failing to COW a MAP_PRIVATE
+ * mapping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ */
+int unmap_ref_private(struct mm_struct *mm,
+					struct vm_area_struct *vma,
+					struct page *page,
+					unsigned long address)
+{
+	struct vm_area_struct *iter_vma;
+	struct address_space *mapping;
+	struct prio_tree_iter iter;
+	pgoff_t pgoff;
+
+	/*
+	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+	 * from page cache lookup which is in HPAGE_SIZE units.
+	 */
+	address = address & huge_page_mask(hstate_vma(vma));
+	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
+		+ (vma->vm_pgoff >> PAGE_SHIFT);
+	mapping = (struct address_space *)page_private(page);
+
+	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		/* Do not unmap the current VMA */
+		if (iter_vma == vma)
+			continue;
+
+		/*
+		 * Unmap the page from other VMAs without their own reserves.
+		 * They get marked to be SIGKILLed if they fault in these
+		 * areas. This is because a future no-page fault on this VMA
+		 * could insert a zeroed page instead of the data existing
+		 * from the time of fork. This would look like data corruption.
+		 */
+		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+			unmap_hugepage_range(iter_vma,
+					address, address + HPAGE_SIZE,
+					page);
+	}
+
+	return 1;
+}
+
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep, pte_t pte)
+			unsigned long address, pte_t *ptep, pte_t pte,
+			struct page *pagecache_page)
 {
 	struct page *old_page, *new_page;
 	int avoidcopy;
+	int outside_reserve = 0;
 
 	old_page = pte_page(pte);
 
+retry_avoidcopy:
 	/* If no-one else is actually using this page, avoid the copy
 	 * and just make the page writable */
 	avoidcopy = (page_count(old_page) == 1);
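unmap_ref_private() above walks the i_mmap prio tree using an offset in PAGE_SIZE units, because vm_pgoff is stored in base-page units, while hugetlbfs page cache lookups (see hugetlbfs_pagecache_page() later in the patch) index by huge page. A standalone sketch of the two unit conversions; it assumes 4 KiB base pages and 2 MiB huge pages instead of the kernel's macros, uses made-up addresses, and simplifies rather than copies the kernel expressions verbatim:

#include <stdio.h>

#define PAGE_SHIFT  12			/* 4 KiB base pages (assumed) */
#define HPAGE_SHIFT 21			/* 2 MiB huge pages (assumed) */
#define HPAGE_SIZE  (1UL << HPAGE_SHIFT)
#define HPAGE_MASK  (~(HPAGE_SIZE - 1))

int main(void)
{
	/* Hypothetical private hugetlb VMA that starts 6 MiB into its file. */
	unsigned long vm_start = 0x60000000UL;
	unsigned long vm_pgoff = (6UL << 20) >> PAGE_SHIFT;	/* base-page units */
	unsigned long address  = vm_start + (5UL << 20) + 0x1234;	/* faulting address */

	/* Align the address to the start of its huge page first. */
	address &= HPAGE_MASK;

	/* PAGE_SIZE-unit offset, the kind of index the i_mmap walk uses. */
	unsigned long pgoff = ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

	/* Huge-page index, the kind of index the hugetlbfs page cache uses. */
	unsigned long idx = ((address - vm_start) >> HPAGE_SHIFT)
			+ (vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/* 10 MiB into the file either way: pgoff = 2560, idx = 5. */
	printf("pgoff (4K units) = %lu, idx (huge pages) = %lu\n", pgoff, idx);
	return 0;
}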
@@ -988,11 +1087,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 	}
 
+	/*
+	 * If the process that created a MAP_PRIVATE mapping is about to
+	 * perform a COW due to a shared page count, attempt to satisfy
+	 * the allocation without using the existing reserves. The pagecache
+	 * page is used to determine if the reserve at this address was
+	 * consumed or not. If reserves were used, a partial faulted mapping
+	 * at the time of fork() could consume its reserves on COW instead
+	 * of the full address range.
+	 */
+	if (!(vma->vm_flags & VM_SHARED) &&
+			is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+			old_page != pagecache_page)
+		outside_reserve = 1;
+
 	page_cache_get(old_page);
-	new_page = alloc_huge_page(vma, address);
+	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
 		page_cache_release(old_page);
+
+		/*
+		 * If a process owning a MAP_PRIVATE mapping fails to COW,
+		 * it is due to references held by a child and an insufficient
+		 * huge page pool. To guarantee the original mapper's
+		 * reliability, unmap the page from child processes. The child
+		 * may get SIGKILLed if it later faults.
+		 */
+		if (outside_reserve) {
+			BUG_ON(huge_pte_none(pte));
+			if (unmap_ref_private(mm, vma, old_page, address)) {
+				BUG_ON(page_count(old_page) != 1);
+				BUG_ON(huge_pte_none(pte));
+				goto retry_avoidcopy;
+			}
+			WARN_ON_ONCE(1);
+		}
+
 		return -PTR_ERR(new_page);
 	}
 
@@ -1015,6 +1146,20 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	return 0;
 }
 
+/* Return the pagecache page at a given address within a VMA */
+static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
+			unsigned long address)
+{
+	struct address_space *mapping;
+	unsigned long idx;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+	return find_lock_page(mapping, idx);
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, int write_access)
 {
@@ -1025,6 +1170,18 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct address_space *mapping;
 	pte_t new_pte;
 
+	/*
+	 * Currently, we are forced to kill the process in the event the
+	 * original mapper has unmapped pages from the child due to a failed
+	 * COW. Warn that such a situation has occurred as it may not be obvious.
+	 */
+	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+		printk(KERN_WARNING
+			"PID %d killed due to inadequate hugepage pool\n",
+			current->pid);
+		return ret;
+	}
+
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
 		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
@@ -1039,7 +1196,7 @@ retry:
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 	if (idx >= size)
 		goto out;
-	page = alloc_huge_page(vma, address);
+	page = alloc_huge_page(vma, address, 0);
 	if (IS_ERR(page)) {
 		ret = -PTR_ERR(page);
 		goto out;
@@ -1081,7 +1238,7 @@ retry:
 
 	if (write_access && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
 	}
 
 	spin_unlock(&mm->page_table_lock);
@@ -1126,8 +1283,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (likely(pte_same(entry, huge_ptep_get(ptep))))
-		if (write_access && !pte_write(entry))
-			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+		if (write_access && !pte_write(entry)) {
+			struct page *page;
+			page = hugetlbfs_pagecache_page(vma, address);
+			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
+			if (page) {
+				unlock_page(page);
+				put_page(page);
+			}
+		}
 	spin_unlock(&mm->page_table_lock);
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
@@ -1371,6 +1535,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	else {
 		chg = to - from;
 		set_vma_resv_huge_pages(vma, chg);
+		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
 	if (chg < 0)
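Taken together, the patch aims to guarantee that the process which created a MAP_PRIVATE hugetlbfs mapping (now marked HPAGE_RESV_OWNER at reserve time in the final hunk) can complete its COW faults, at the cost of possibly killing child processes that share the pages. A hedged user-space sketch of the scenario being protected; it assumes a hugetlbfs mount at /mnt/huge, a 2 MiB huge page size, and a pool configured via nr_hugepages, none of which come from the patch itself:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <unistd.h>

#define LENGTH (2UL * 1024 * 1024)	/* one huge page, assuming 2 MiB */

int main(void)
{
	/* Assumed hugetlbfs mount point; adjust for the local system. */
	int fd = open("/mnt/huge/cowtest", O_CREAT | O_RDWR, 0600);
	if (fd < 0) { perror("open"); return 1; }

	char *p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) { perror("mmap"); return 1; }

	p[0] = 'p';		/* parent faults the page, consuming its private reserve */

	pid_t pid = fork();
	if (pid == 0) {
		p[0] = 'c';	/* child COWs; with an empty pool it may be killed */
		_exit(0);
	}

	p[1] = 'P';		/* parent COW: the case the patch is meant to keep working */

	int status;
	waitpid(pid, &status, 0);
	printf("parent sees '%c', child %s\n", p[0],
	       WIFSIGNALED(status) ? "was killed" : "exited normally");

	munmap(p, LENGTH);
	close(fd);
	unlink("/mnt/huge/cowtest");
	return 0;
}

With the pool exhausted, the parent's write after fork() is the path served by retry_avoidcopy and unmap_ref_private(); the child that loses the page is then killed on its next fault, with the warning added to hugetlb_no_page() explaining why.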