Diffstat (limited to 'mm/memory.c')
 mm/memory.c | 228
 1 file changed, 149 insertions(+), 79 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 0d14d1e58a5f..bbab1e37055e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,57 +371,93 @@ static inline int is_cow_mapping(unsigned int flags)
 }
 
 /*
- * This function gets the "struct page" associated with a pte.
+ * vm_normal_page -- This function gets the "struct page" associated with a pte.
  *
- * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
- * will have each page table entry just pointing to a raw page frame
- * number, and as far as the VM layer is concerned, those do not have
- * pages associated with them - even if the PFN might point to memory
- * that otherwise is perfectly fine and has a "struct page".
+ * "Special" mappings do not wish to be associated with a "struct page" (either
+ * it doesn't exist, or it exists but they don't want to touch it). In this
+ * case, NULL is returned here. "Normal" mappings do have a struct page.
  *
- * The way we recognize those mappings is through the rules set up
- * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
- * and the vm_pgoff will point to the first PFN mapped: thus every
- * page that is a raw mapping will always honor the rule
+ * There are 2 broad cases. Firstly, an architecture may define a pte_special()
+ * pte bit, in which case this function is trivial. Secondly, an architecture
+ * may not have a spare pte bit, which requires a more complicated scheme,
+ * described below.
+ *
+ * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
+ * special mapping (even if there are underlying and valid "struct pages").
+ * COWed pages of a VM_PFNMAP are always normal.
+ *
+ * The way we recognize COWed pages within VM_PFNMAP mappings is through the
+ * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every special
+ * mapping will always honor the rule
  *
  *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
  *
- * and if that isn't true, the page has been COW'ed (in which case it
- * _does_ have a "struct page" associated with it even if it is in a
- * VM_PFNMAP range).
+ * And for normal mappings this is false.
+ *
+ * This restricts such mappings to be a linear translation from virtual address
+ * to pfn. To get around this restriction, we allow arbitrary mappings so long
+ * as the vma is not a COW mapping; in that case, we know that all ptes are
+ * special (because none can have been COWed).
+ *
+ *
+ * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
+ *
+ * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
+ * page" backing, however the difference is that _all_ pages with a struct
+ * page (that is, those where pfn_valid is true) are refcounted and considered
+ * normal pages by the VM. The disadvantage is that pages are refcounted
+ * (which can be slower and simply not an option for some PFNMAP users). The
+ * advantage is that we don't have to follow the strict linearity rule of
+ * PFNMAP mappings in order to support COWable mappings.
+ *
  */
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+# define HAVE_PTE_SPECIAL 1
+#else
+# define HAVE_PTE_SPECIAL 0
+#endif
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+				pte_t pte)
 {
-	unsigned long pfn = pte_pfn(pte);
+	unsigned long pfn;
 
-	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
-		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
-		if (pfn == vma->vm_pgoff + off)
-			return NULL;
-		if (!is_cow_mapping(vma->vm_flags))
-			return NULL;
+	if (HAVE_PTE_SPECIAL) {
+		if (likely(!pte_special(pte))) {
+			VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+			return pte_page(pte);
+		}
+		VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+		return NULL;
 	}
 
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Add some anal sanity checks for now. Eventually,
-	 * we should just do "return pfn_to_page(pfn)", but
-	 * in the meantime we check that we get a valid pfn,
-	 * and that the resulting page looks ok.
-	 */
-	if (unlikely(!pfn_valid(pfn))) {
-		print_bad_pte(vma, pte, addr);
-		return NULL;
+	/* !HAVE_PTE_SPECIAL case follows: */
+
+	pfn = pte_pfn(pte);
+
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma->vm_flags & VM_MIXEDMAP) {
+			if (!pfn_valid(pfn))
+				return NULL;
+			goto out;
+		} else {
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
+			if (pfn == vma->vm_pgoff + off)
+				return NULL;
+			if (!is_cow_mapping(vma->vm_flags))
+				return NULL;
+		}
 	}
-#endif
+
+	VM_BUG_ON(!pfn_valid(pfn));
 
 	/*
-	 * NOTE! We still have PageReserved() pages in the page
-	 * tables.
+	 * NOTE! We still have PageReserved() pages in the page tables.
 	 *
-	 * The PAGE_ZERO() pages and various VDSO mappings can
-	 * cause them to exist.
+	 * eg. VDSO mappings can cause them to exist.
 	 */
+out:
 	return pfn_to_page(pfn);
 }
 
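The rewritten comment above keeps the remap_pfn_range() linearity rule as the fallback for architectures without pte_special(). As a standalone illustration of that rule (not part of the patch; plain userspace C, with every address and pfn value invented):

/* Illustration only: mirrors the linearity check in vm_normal_page()'s
 * !HAVE_PTE_SPECIAL path, not kernel code. All values below are made up. */
#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4K pages */

/* True if the pte still obeys the remap_pfn_range() rule:
 * pfn == vm_pgoff + ((addr - vm_start) >> PAGE_SHIFT) */
static int obeys_linearity_rule(unsigned long vm_start, unsigned long vm_pgoff,
				unsigned long addr, unsigned long pfn)
{
	return pfn == vm_pgoff + ((addr - vm_start) >> PAGE_SHIFT);
}

int main(void)
{
	unsigned long vm_start = 0x700000000000UL;	/* made-up vma start */
	unsigned long vm_pgoff = 0x80000UL;		/* first PFN mapped */
	unsigned long addr = vm_start + 3 * 4096;	/* third page in */

	/* Untouched PFNMAP pte: special, so vm_normal_page() returns NULL. */
	printf("%d\n", obeys_linearity_rule(vm_start, vm_pgoff, addr, 0x80003UL));
	/* COWed pte now points at an unrelated pfn: a normal page. */
	printf("%d\n", obeys_linearity_rule(vm_start, vm_pgoff, addr, 0x1234UL));
	return 0;
}

Compiled with any C compiler this prints 1 for the untouched PFNMAP pte and 0 for the COWed one, which is exactly the distinction the !HAVE_PTE_SPECIAL path draws.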
@@ -1057,8 +1093,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (pages)
 			foll_flags |= FOLL_GET;
 		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-		    (!vma->vm_ops || (!vma->vm_ops->nopage &&
-					!vma->vm_ops->fault)))
+		    (!vma->vm_ops || !vma->vm_ops->fault))
 			foll_flags |= FOLL_ANON;
 
 		do {
@@ -1141,8 +1176,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
  * old drivers should use this, and they needed to mark their
  * pages reserved for the old functions anyway.
  */
-static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+static int insert_page(struct vm_area_struct *vma, unsigned long addr,
+			struct page *page, pgprot_t prot)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	int retval;
 	pte_t *pte;
 	spinlock_t *ptl;
@@ -1202,40 +1239,26 @@ out:
  *
  * The page does not need to be reserved.
  */
-int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+			struct page *page)
 {
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return -EFAULT;
 	if (!page_count(page))
 		return -EINVAL;
 	vma->vm_flags |= VM_INSERTPAGE;
-	return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+	return insert_page(vma, addr, page, vma->vm_page_prot);
 }
 EXPORT_SYMBOL(vm_insert_page);
 
-/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_inert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn)
+static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn, pgprot_t prot)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	int retval;
 	pte_t *pte, entry;
 	spinlock_t *ptl;
 
-	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-	BUG_ON(is_cow_mapping(vma->vm_flags));
-
 	retval = -ENOMEM;
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
@@ -1245,19 +1268,74 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 		goto out_unlock;
 
 	/* Ok, finally just insert the thing.. */
-	entry = pfn_pte(pfn, vma->vm_page_prot);
+	entry = pte_mkspecial(pfn_pte(pfn, prot));
 	set_pte_at(mm, addr, pte, entry);
-	update_mmu_cache(vma, addr, entry);
+	update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
 
 	retval = 0;
 out_unlock:
 	pte_unmap_unlock(pte, ptl);
-
 out:
 	return retval;
 }
+
+/**
+ * vm_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_inert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return NULL.
+ */
+int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn)
+{
+	/*
+	 * Technically, architectures with pte_special can avoid all these
+	 * restrictions (same for remap_pfn_range). However we would like
+	 * consistency in testing and feature parity among all, so we should
+	 * try to keep these invariants in place for everybody.
+	 */
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+						(VM_PFNMAP|VM_MIXEDMAP));
+	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return -EFAULT;
+	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
 EXPORT_SYMBOL(vm_insert_pfn);
 
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn)
+{
+	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return -EFAULT;
+
+	/*
+	 * If we don't have pte special, then we have to use the pfn_valid()
+	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
+	 * refcount the page if pfn_valid is true (hence insert_page rather
+	 * than insert_pfn).
+	 */
+	if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+		struct page *page;
+
+		page = pfn_to_page(pfn);
+		return insert_page(vma, addr, page, vma->vm_page_prot);
+	}
+	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_mixed);
+
 /*
  * maps a range of physical memory into the requested pages. the old
  * mappings are removed. any references to nonexistent pages results
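A usage sketch for the two new entry points, not taken from this patch: a hypothetical driver that marks its vma VM_MIXEDMAP at mmap time and populates ptes from its ->fault handler with vm_insert_mixed(). The mydev_* names and the pgoff-to-pfn lookup are invented; the ->fault signature, the VM_FAULT_NOPAGE return and the flag checks are the ones visible elsewhere in this diff. A pure VM_PFNMAP mapping would call vm_insert_pfn() at the same spot instead.

/* Hypothetical driver code, for illustration only. */
#include <linux/fs.h>
#include <linux/mm.h>

/* Invented helper: map a page offset to a pfn. A real driver would consult
 * its own allocation tables; some results may be RAM (pfn_valid), others
 * device memory, which is exactly the mixed case vm_insert_mixed() covers. */
static unsigned long mydev_pgoff_to_pfn(void *priv, pgoff_t pgoff)
{
	return ((unsigned long *)priv)[pgoff];	/* placeholder lookup */
}

static int mydev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->virtual_address;
	unsigned long pfn = mydev_pgoff_to_pfn(vma->vm_private_data, vmf->pgoff);

	if (vm_insert_mixed(vma, addr, pfn))
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;		/* pte already installed */
}

static struct vm_operations_struct mydev_vm_ops = {
	.fault	= mydev_fault,
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_MIXEDMAP;	/* required by the BUG_ON in vm_insert_mixed() above */
	vma->vm_ops = &mydev_vm_ops;
	vma->vm_private_data = file->private_data;
	return 0;
}

Returning VM_FAULT_NOPAGE tells __do_fault() (see the hunk further down) that the pte has already been installed and there is no struct page to hand back.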
@@ -1276,7 +1354,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	arch_enter_lazy_mmu_mode();
 	do {
 		BUG_ON(!pte_none(*pte));
-		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
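For completeness, the classic remap_pfn_range() path that the hunk above touches, sketched as a hypothetical character-device mmap that maps a whole physical range in one go. The mydev_mmap name is invented; remap_pfn_range() and its arguments are existing kernel API, and after this patch every pte it writes is additionally marked pte_special().

/* Hypothetical character-device mmap, for illustration only. */
#include <linux/fs.h>
#include <linux/mm.h>

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* remap_pfn_range() marks the vma VM_PFNMAP; because the caller
	 * passes vma->vm_pgoff as the first pfn, the linearity rule quoted
	 * in the vm_normal_page() comment holds here by construction. */
	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
			    vma->vm_page_prot))
		return -EAGAIN;
	return 0;
}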
@@ -2199,20 +2277,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-	if (likely(vma->vm_ops->fault)) {
-		ret = vma->vm_ops->fault(vma, &vmf);
-		if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
-			return ret;
-	} else {
-		/* Legacy ->nopage path */
-		ret = 0;
-		vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
-		/* no page was available -- either SIGBUS or OOM */
-		if (unlikely(vmf.page == NOPAGE_SIGBUS))
-			return VM_FAULT_SIGBUS;
-		else if (unlikely(vmf.page == NOPAGE_OOM))
-			return VM_FAULT_OOM;
-	}
+	ret = vma->vm_ops->fault(vma, &vmf);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+		return ret;
 
 	/*
 	 * For consistency in subsequent calls, make the faulted page always
@@ -2377,10 +2444,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long pfn;
 
 	pte_unmap(page_table);
-	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-	BUG_ON(is_cow_mapping(vma->vm_flags));
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 
 	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+
+	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
 	if (unlikely(pfn == NOPFN_OOM))
 		return VM_FAULT_OOM;
 	else if (unlikely(pfn == NOPFN_SIGBUS))
@@ -2458,7 +2528,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
-				if (vma->vm_ops->fault || vma->vm_ops->nopage)
+				if (likely(vma->vm_ops->fault))
 					return do_linear_fault(mm, vma, address,
 						pte, pmd, write_access, entry);
 				if (unlikely(vma->vm_ops->nopfn))
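The get_user_pages(), __do_fault() and handle_pte_fault() hunks above drop the last ->nopage fallbacks from these paths, so a driver that still implements ->nopage has to move to ->fault. A conversion sketch for a hypothetical driver follows; it is not from this patch, the mydev_* names and the page-lookup helper are invented, and the two callback signatures are the ones this kernel series uses.

/* Hypothetical driver code, for illustration only. */
#include <linux/mm.h>

/* Invented helper: find the driver page backing a byte offset into the vma. */
static struct page *mydev_lookup_page(void *priv, unsigned long offset);

/* Before: legacy ->nopage callback (the *type out-parameter is ignored here). */
static struct page *mydev_nopage(struct vm_area_struct *vma,
				 unsigned long address, int *type)
{
	struct page *page;

	page = mydev_lookup_page(vma->vm_private_data, address - vma->vm_start);
	if (!page)
		return NOPAGE_SIGBUS;
	get_page(page);
	return page;
}

/* After: the equivalent ->fault callback expected by this series. */
static int mydev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page;

	page = mydev_lookup_page(vma->vm_private_data,
			(unsigned long)vmf->virtual_address - vma->vm_start);
	if (!page)
		return VM_FAULT_SIGBUS;
	get_page(page);
	vmf->page = page;	/* __do_fault() consumes this reference */
	return 0;
}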