path: root/mm/memory.c
author	Nick Piggin <npiggin@suse.de>	2008-04-28 05:13:00 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-28 11:58:23 -0400
commit	7e675137a8e1a4d45822746456dd389b65745bf6 (patch)
tree	5df01d23ea1b6b212d18f2136ff82913fcbe7718 /mm/memory.c
parent	b379d790197cdf8a95fb67507d75a24ac0a1678d (diff)
mm: introduce pte_special pte bit
s390 for one, cannot implement VM_MIXEDMAP with pfn_valid, due to their memory
model (which is more dynamic than most).  Instead, they had proposed to
implement it with an additional path through vm_normal_page(), using a bit in
the pte to determine whether or not the page should be refcounted:

vm_normal_page()
{
	...
	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
		if (vma->vm_flags & VM_MIXEDMAP) {
#ifdef s390
			if (!mixedmap_refcount_pte(pte))
				return NULL;
#else
			if (!pfn_valid(pfn))
				return NULL;
#endif
			goto out;
		}
	...
}

This is fine, however if we are allowed to use a bit in the pte to determine
refcountedness, we can use that to _completely_ replace all the vma based
schemes.  So instead of adding more cases to the already complex vma-based
scheme, we can have a clearly separate and simple pte-based scheme (and get
slightly better code generation in the process):

vm_normal_page()
{
#ifdef s390
	if (!mixedmap_refcount_pte(pte))
		return NULL;
	return pte_page(pte);
#else
	...
#endif
}

And finally, we may rather make this concept usable by any architecture rather
than making it s390 only, so implement a new type of pte state for this.
Unfortunately the old vma based code must stay, because some architectures may
not be able to spare pte bits.  This makes vm_normal_page a little bit more
ugly than we would like, but the 2 cases are clearly separate.

So introduce a pte_special pte state, and use it in mm/memory.c.  It is
currently a noop for all architectures, so this doesn't actually result in any
compiled code changes to mm/memory.o.

BTW:
I haven't put vm_normal_page() into arch code as per an earlier suggestion.
The reason is that, regardless of where vm_normal_page is actually
implemented, the *abstraction* is still exactly the same.  Also, while it
depends on whether the architecture has pte_special or not, those are the only
two possible cases, and it really isn't an arch-specific function -- the role
of the arch code should be to provide primitive functions and accessors with
which to build the core code; pte_special does that.  We do not want
architectures to know or care about vm_normal_page itself, and we definitely
don't want them being able to invent something new there out of sight of mm/
code.  If we made vm_normal_page an arch function, then we have to make
vm_insert_mixed (next patch) an arch function too.  So I don't think moving it
to arch code fundamentally improves any abstractions, while it does
practically make the code more difficult to follow, for both mm and arch
developers, and easier to misuse.

[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
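As a rough sketch (not part of this patch), the primitives the core code
builds on would look something like the following; _PAGE_SPECIAL is a
placeholder name for whatever spare pte bit an architecture chooses:

/*
 * Sketch only, not taken from this patch: the pte_special()/pte_mkspecial()
 * accessors that vm_normal_page() is built on.
 */
#ifdef __HAVE_ARCH_PTE_SPECIAL
/* An architecture with a spare software bit in its ptes: */
static inline int pte_special(pte_t pte)
{
	return pte_val(pte) & _PAGE_SPECIAL;
}
static inline pte_t pte_mkspecial(pte_t pte)
{
	return __pte(pte_val(pte) | _PAGE_SPECIAL);
}
#else
/* Everyone else keeps no-op defaults, so mm/memory.o is unchanged: */
static inline int pte_special(pte_t pte)	{ return 0; }
static inline pte_t pte_mkspecial(pte_t pte)	{ return pte; }
#endif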
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	99
1 file changed, 55 insertions(+), 44 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 0da414c383e7..c5e88bcd8ec3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,33 +371,37 @@ static inline int is_cow_mapping(unsigned int flags)
 }
 
 /*
- * This function gets the "struct page" associated with a pte or returns
- * NULL if no "struct page" is associated with the pte.
+ * vm_normal_page -- This function gets the "struct page" associated with a pte.
  *
- * A raw VM_PFNMAP mapping (ie. one that is not COWed) may not have any "struct
- * page" backing, and even if they do, they are not refcounted. COWed pages of
- * a VM_PFNMAP do always have a struct page, and they are normally refcounted
- * (they are _normal_ pages).
+ * "Special" mappings do not wish to be associated with a "struct page" (either
+ * it doesn't exist, or it exists but they don't want to touch it). In this
+ * case, NULL is returned here. "Normal" mappings do have a struct page.
  *
- * So a raw PFNMAP mapping will have each page table entry just pointing
- * to a page frame number, and as far as the VM layer is concerned, those do
- * not have pages associated with them - even if the PFN might point to memory
- * that otherwise is perfectly fine and has a "struct page".
+ * There are 2 broad cases. Firstly, an architecture may define a pte_special()
+ * pte bit, in which case this function is trivial. Secondly, an architecture
+ * may not have a spare pte bit, which requires a more complicated scheme,
+ * described below.
+ *
+ * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
+ * special mapping (even if there are underlying and valid "struct pages").
+ * COWed pages of a VM_PFNMAP are always normal.
  *
  * The way we recognize COWed pages within VM_PFNMAP mappings is through the
  * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
- * set, and the vm_pgoff will point to the first PFN mapped: thus every
- * page that is a raw mapping will always honor the rule
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every special
+ * mapping will always honor the rule
  *
  *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
  *
- * A call to vm_normal_page() will return NULL for such a page.
+ * And for normal mappings this is false.
+ *
+ * This restricts such mappings to be a linear translation from virtual address
+ * to pfn. To get around this restriction, we allow arbitrary mappings so long
+ * as the vma is not a COW mapping; in that case, we know that all ptes are
+ * special (because none can have been COWed).
  *
- * If the page doesn't follow the "remap_pfn_range()" rule in a VM_PFNMAP
- * then the page has been COW'ed. A COW'ed page _does_ have a "struct page"
- * associated with it even if it is in a VM_PFNMAP range. Calling
- * vm_normal_page() on such a page will therefore return the "struct page".
  *
+ * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
  *
  * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
  * page" backing, however the difference is that _all_ pages with a struct
@@ -407,16 +411,29 @@ static inline int is_cow_mapping(unsigned int flags)
  * advantage is that we don't have to follow the strict linearity rule of
  * PFNMAP mappings in order to support COWable mappings.
  *
- * A call to vm_normal_page() with a VM_MIXEDMAP mapping will return the
- * associated "struct page" or NULL for memory not backed by a "struct page".
- *
- *
- * All other mappings should have a valid struct page, which will be
- * returned by a call to vm_normal_page().
  */
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+# define HAVE_PTE_SPECIAL 1
+#else
+# define HAVE_PTE_SPECIAL 0
+#endif
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+				pte_t pte)
 {
-	unsigned long pfn = pte_pfn(pte);
+	unsigned long pfn;
+
+	if (HAVE_PTE_SPECIAL) {
+		if (likely(!pte_special(pte))) {
+			VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+			return pte_page(pte);
+		}
+		VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+		return NULL;
+	}
+
+	/* !HAVE_PTE_SPECIAL case follows: */
+
+	pfn = pte_pfn(pte);
 
 	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
 		if (vma->vm_flags & VM_MIXEDMAP) {
@@ -424,7 +441,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
 				return NULL;
 			goto out;
 		} else {
-			unsigned long off = (addr-vma->vm_start) >> PAGE_SHIFT;
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
 			if (pfn == vma->vm_pgoff + off)
 				return NULL;
 			if (!is_cow_mapping(vma->vm_flags))
@@ -432,25 +450,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
 		}
 	}
 
-#ifdef CONFIG_DEBUG_VM
-	/*
-	 * Add some anal sanity checks for now. Eventually,
-	 * we should just do "return pfn_to_page(pfn)", but
-	 * in the meantime we check that we get a valid pfn,
-	 * and that the resulting page looks ok.
-	 */
-	if (unlikely(!pfn_valid(pfn))) {
-		print_bad_pte(vma, pte, addr);
-		return NULL;
-	}
-#endif
+	VM_BUG_ON(!pfn_valid(pfn));
 
 	/*
-	 * NOTE! We still have PageReserved() pages in the page
-	 * tables.
+	 * NOTE! We still have PageReserved() pages in the page tables.
 	 *
-	 * The PAGE_ZERO() pages and various VDSO mappings can
-	 * cause them to exist.
+	 * eg. VDSO mappings can cause them to exist.
 	 */
 out:
 	return pfn_to_page(pfn);
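As a hypothetical illustration (not taken from this patch), callers of
vm_normal_page() are expected to treat a NULL return as "there is no struct
page the VM should touch"; example_touch_pte below is a made-up name:

/*
 * Hypothetical caller sketch, not part of this patch: only pages that
 * vm_normal_page() classifies as normal may be refcounted.
 */
static void example_touch_pte(struct vm_area_struct *vma, unsigned long addr,
			      pte_t pte)
{
	struct page *page = vm_normal_page(vma, addr, pte);

	if (!page)
		return;		/* special mapping: nothing to refcount */

	get_page(page);
	/* ... operate on the struct page ... */
	put_page(page);
}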
@@ -1263,6 +1268,12 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 	pte_t *pte, entry;
 	spinlock_t *ptl;
 
+	/*
+	 * Technically, architectures with pte_special can avoid all these
+	 * restrictions (same for remap_pfn_range). However we would like
+	 * consistency in testing and feature parity among all, so we should
+	 * try to keep these invariants in place for everybody.
+	 */
 	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
 	BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
 						(VM_PFNMAP|VM_MIXEDMAP));
@@ -1278,7 +1289,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 		goto out_unlock;
 
 	/* Ok, finally just insert the thing.. */
-	entry = pfn_pte(pfn, vma->vm_page_prot);
+	entry = pte_mkspecial(pfn_pte(pfn, vma->vm_page_prot));
 	set_pte_at(mm, addr, pte, entry);
 	update_mmu_cache(vma, addr, entry);
 
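For context, a hypothetical consumer of vm_insert_pfn() might look like the
sketch below; mydrv_fault and MYDRV_PHYS_BASE are made-up names, and the vma
is assumed to have been marked VM_PFNMAP in the driver's mmap handler, which
is what the BUG_ON()s above enforce:

/*
 * Hypothetical driver fault handler, not part of this patch.  With this
 * patch the pte installed by vm_insert_pfn() is also marked special, so
 * vm_normal_page() never treats it as a refcounted page.
 */
static int mydrv_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long pfn = (MYDRV_PHYS_BASE >> PAGE_SHIFT) + vmf->pgoff;

	if (vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn))
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;
}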
@@ -1309,7 +1320,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	arch_enter_lazy_mmu_mode();
 	do {
 		BUG_ON(!pte_none(*pte));
-		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+		set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
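Likewise, a hypothetical driver mmap handler using remap_pfn_range() (which
sets VM_PFNMAP and points vm_pgoff at the first PFN mapped, the rule
vm_normal_page() relies on) could be sketched as follows; MYDRV_PHYS_BASE is
again a made-up name:

/*
 * Hypothetical driver mmap sketch, not part of this patch.  With this
 * patch every pte that remap_pfn_range() installs via remap_pte_range()
 * above is additionally pte_mkspecial()ed.
 */
static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	return remap_pfn_range(vma, vma->vm_start,
			       MYDRV_PHYS_BASE >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}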