Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	228
1 file changed, 149 insertions, 79 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 0d14d1e58a5f..bbab1e37055e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,57 +371,93 @@ static inline int is_cow_mapping(unsigned int flags)
 }
 
 /*
- * This function gets the "struct page" associated with a pte.
+ * vm_normal_page -- This function gets the "struct page" associated with a pte.
  *
- * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
- * will have each page table entry just pointing to a raw page frame
- * number, and as far as the VM layer is concerned, those do not have
- * pages associated with them - even if the PFN might point to memory
- * that otherwise is perfectly fine and has a "struct page".
+ * "Special" mappings do not wish to be associated with a "struct page" (either
+ * it doesn't exist, or it exists but they don't want to touch it). In this
+ * case, NULL is returned here. "Normal" mappings do have a struct page.
  *
- * The way we recognize those mappings is through the rules set up
- * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
- * and the vm_pgoff will point to the first PFN mapped: thus every
- * page that is a raw mapping will always honor the rule
+ * There are 2 broad cases. Firstly, an architecture may define a pte_special()
+ * pte bit, in which case this function is trivial. Secondly, an architecture
+ * may not have a spare pte bit, which requires a more complicated scheme,
+ * described below.
+ *
+ * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
+ * special mapping (even if there are underlying and valid "struct pages").
+ * COWed pages of a VM_PFNMAP are always normal.
+ *
+ * The way we recognize COWed pages within VM_PFNMAP mappings is through the
+ * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every special
+ * mapping will always honor the rule
  *
  *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
  *
- * and if that isn't true, the page has been COW'ed (in which case it
- * _does_ have a "struct page" associated with it even if it is in a
- * VM_PFNMAP range).
+ * And for normal mappings this is false.
+ *
+ * This restricts such mappings to be a linear translation from virtual address
+ * to pfn. To get around this restriction, we allow arbitrary mappings so long
+ * as the vma is not a COW mapping; in that case, we know that all ptes are
+ * special (because none can have been COWed).
+ *
+ *
+ * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
+ *
+ * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
+ * page" backing, however the difference is that _all_ pages with a struct
+ * page (that is, those where pfn_valid is true) are refcounted and considered
+ * normal pages by the VM. The disadvantage is that pages are refcounted
+ * (which can be slower and simply not an option for some PFNMAP users). The
+ * advantage is that we don't have to follow the strict linearity rule of
+ * PFNMAP mappings in order to support COWable mappings.
+ *
  */
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+# define HAVE_PTE_SPECIAL 1
+#else
+# define HAVE_PTE_SPECIAL 0
+#endif
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+				pte_t pte)
 {
-	unsigned long pfn = pte_pfn(pte);
+	unsigned long pfn;
 
-	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
-		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
-		if (pfn == vma->vm_pgoff + off)
-			return NULL;
-		if (!is_cow_mapping(vma->vm_flags))
-			return NULL;
+	if (HAVE_PTE_SPECIAL) {
+		if (likely(!pte_special(pte))) {
+			VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+			return pte_page(pte);
+		}
+		VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+		return NULL;
 	}
 
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Add some anal sanity checks for now. Eventually,
-	 * we should just do "return pfn_to_page(pfn)", but
-	 * in the meantime we check that we get a valid pfn,
-	 * and that the resulting page looks ok.
-	 */
-	if (unlikely(!pfn_valid(pfn))) {
-		print_bad_pte(vma, pte, addr);
-		return NULL;
+	/* !HAVE_PTE_SPECIAL case follows: */
+
+	pfn = pte_pfn(pte);
+
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma->vm_flags & VM_MIXEDMAP) {
+			if (!pfn_valid(pfn))
+				return NULL;
+			goto out;
+		} else {
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
+			if (pfn == vma->vm_pgoff + off)
+				return NULL;
+			if (!is_cow_mapping(vma->vm_flags))
+				return NULL;
+		}
 	}
-#endif
+
+	VM_BUG_ON(!pfn_valid(pfn));
 
 	/*
-	 * NOTE! We still have PageReserved() pages in the page
-	 * tables.
+	 * NOTE! We still have PageReserved() pages in the page tables.
 	 *
-	 * The PAGE_ZERO() pages and various VDSO mappings can
-	 * cause them to exist.
+	 * eg. VDSO mappings can cause them to exist.
 	 */
+out:
 	return pfn_to_page(pfn);
 }
 
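The contract above: a NULL return means there is no struct page the caller may touch. For reference, a minimal sketch of the typical caller pattern follows (an illustrative helper, not code from this patch; the real copy and zap paths differ in detail):

/*
 * Sketch only: how a pte walker typically consumes vm_normal_page().
 * example_touch_pte() is a hypothetical helper, not part of this patch.
 */
static void example_touch_pte(struct vm_area_struct *vma,
			      unsigned long addr, pte_t pte)
{
	struct page *page = vm_normal_page(vma, addr, pte);

	if (!page)
		return;		/* special pte: raw pfn, no refcounting, no rmap */

	get_page(page);		/* normal page: taking a reference is safe */
	/* ... operate on the struct page ... */
	put_page(page);
}
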
@@ -1057,8 +1093,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (pages)
 			foll_flags |= FOLL_GET;
 		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-		    (!vma->vm_ops || (!vma->vm_ops->nopage &&
-		     !vma->vm_ops->fault)))
+		    (!vma->vm_ops || !vma->vm_ops->fault))
 			foll_flags |= FOLL_ANON;
 
 		do {
@@ -1141,8 +1176,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
  * old drivers should use this, and they needed to mark their
  * pages reserved for the old functions anyway.
  */
-static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+static int insert_page(struct vm_area_struct *vma, unsigned long addr,
+			struct page *page, pgprot_t prot)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	int retval;
 	pte_t *pte;
 	spinlock_t *ptl;
@@ -1202,40 +1239,26 @@ out:
  *
  * The page does not need to be reserved.
  */
-int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+			struct page *page)
 {
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return -EFAULT;
 	if (!page_count(page))
 		return -EINVAL;
 	vma->vm_flags |= VM_INSERTPAGE;
-	return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+	return insert_page(vma, addr, page, vma->vm_page_prot);
 }
 EXPORT_SYMBOL(vm_insert_page);
 
-/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_inert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-		unsigned long pfn)
+static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn, pgprot_t prot)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	int retval;
 	pte_t *pte, entry;
 	spinlock_t *ptl;
 
-	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-	BUG_ON(is_cow_mapping(vma->vm_flags));
-
 	retval = -ENOMEM;
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
@@ -1245,19 +1268,74 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 		goto out_unlock;
 
 	/* Ok, finally just insert the thing.. */
-	entry = pfn_pte(pfn, vma->vm_page_prot);
+	entry = pte_mkspecial(pfn_pte(pfn, prot));
 	set_pte_at(mm, addr, pte, entry);
-	update_mmu_cache(vma, addr, entry);
+	update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
 
 	retval = 0;
 out_unlock:
 	pte_unmap_unlock(pte, ptl);
-
 out:
 	return retval;
 }
+
+/**
+ * vm_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_inert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return NULL.
+ */
+int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn)
+{
+	/*
+	 * Technically, architectures with pte_special can avoid all these
+	 * restrictions (same for remap_pfn_range). However we would like
+	 * consistency in testing and feature parity among all, so we should
+	 * try to keep these invariants in place for everybody.
+	 */
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+						(VM_PFNMAP|VM_MIXEDMAP));
+	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return -EFAULT;
+	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
 EXPORT_SYMBOL(vm_insert_pfn);
 
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long pfn)
+{
+	BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return -EFAULT;
+
+	/*
+	 * If we don't have pte special, then we have to use the pfn_valid()
+	 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
+	 * refcount the page if pfn_valid is true (hence insert_page rather
+	 * than insert_pfn).
+	 */
+	if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+		struct page *page;
+
+		page = pfn_to_page(pfn);
+		return insert_page(vma, addr, page, vma->vm_page_prot);
+	}
+	return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_mixed);
+
 /*
  * maps a range of physical memory into the requested pages. the old
  * mappings are removed. any references to nonexistent pages results
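One way a driver might use the new vm_insert_mixed() interface, assuming a VM_MIXEDMAP mapping and hypothetical mydrv_* names (a sketch for illustration, not code from this patch):

static int mydrv_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* mydrv_lookup_pfn() is hypothetical: translate the faulting offset
	 * into a pfn that may or may not have a struct page behind it. */
	unsigned long pfn = mydrv_lookup_pfn(vma->vm_private_data, vmf->pgoff);
	int err;

	/* vm_insert_mixed() refcounts the page via insert_page() when the
	 * architecture lacks pte_special and pfn_valid(pfn) is true;
	 * otherwise it installs a special pte via insert_pfn(). */
	err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn);
	if (err && err != -EBUSY)	/* -EBUSY: raced, pte already present */
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;		/* pte installed here, not by the core */
}

static const struct vm_operations_struct mydrv_vm_ops = {
	.fault	= mydrv_fault,
};

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_MIXEDMAP;	/* backing pfns may or may not be pfn_valid */
	vma->vm_ops = &mydrv_vm_ops;
	return 0;
}
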
@@ -1276,7 +1354,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	arch_enter_lazy_mmu_mode();
 	do {
 		BUG_ON(!pte_none(*pte));
-		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
@@ -2199,20 +2277,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-	if (likely(vma->vm_ops->fault)) {
-		ret = vma->vm_ops->fault(vma, &vmf);
-		if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
-			return ret;
-	} else {
-		/* Legacy ->nopage path */
-		ret = 0;
-		vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
-		/* no page was available -- either SIGBUS or OOM */
-		if (unlikely(vmf.page == NOPAGE_SIGBUS))
-			return VM_FAULT_SIGBUS;
-		else if (unlikely(vmf.page == NOPAGE_OOM))
-			return VM_FAULT_OOM;
-	}
+	ret = vma->vm_ops->fault(vma, &vmf);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+		return ret;
 
 	/*
 	 * For consistency in subsequent calls, make the faulted page always
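With the legacy path gone from __do_fault(), a driver that still implements ->nopage must be converted to ->fault. A minimal sketch of an equivalent handler (hypothetical names, not code from this patch):

static int mydrv_page_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* mydrv_find_page() is hypothetical: look up the backing page for
	 * this offset, much as the old ->nopage handler would have done. */
	struct page *page = mydrv_find_page(vma->vm_private_data, vmf->pgoff);

	if (!page)
		return VM_FAULT_SIGBUS;	/* was: return NOPAGE_SIGBUS */

	get_page(page);		/* __do_fault() expects a referenced page */
	vmf->page = page;	/* the core maps it and drops the reference */
	return 0;
}
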
@@ -2377,10 +2444,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long pfn;
 
 	pte_unmap(page_table);
-	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-	BUG_ON(is_cow_mapping(vma->vm_flags));
+	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 
 	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+
+	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
 	if (unlikely(pfn == NOPFN_OOM))
 		return VM_FAULT_OOM;
 	else if (unlikely(pfn == NOPFN_SIGBUS))
@@ -2458,7 +2528,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
-				if (vma->vm_ops->fault || vma->vm_ops->nopage)
+				if (likely(vma->vm_ops->fault))
 					return do_linear_fault(mm, vma, address,
 						pte, pmd, write_access, entry);
 				if (unlikely(vma->vm_ops->nopfn))