author		Jared Hulbert <jaredeh@gmail.com>	2008-04-28 05:12:58 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-28 11:58:22 -0400
commit		b379d790197cdf8a95fb67507d75a24ac0a1678d (patch)
tree		8ea36c9a0766aca3cfd69cd33aa8a5d2ca8dd2d4
parent		214e471ff99064726b2d8af3aa0e24a73c775531 (diff)
mm: introduce VM_MIXEDMAP
This series introduces some important infrastructure work. The overall result is that:

1. We now support XIP backed filesystems using memory that has no struct page allocated to it. And patches 6 and 7 actually implement this for s390.

This is pretty important in a number of cases. As far as I understand, in the case of virtualisation (eg. s390), each guest may mount a readonly copy of the same filesystem (eg. the distro). Currently, guests need to allocate struct pages for this image. So if you have 100 guests, you already need to allocate more memory for the struct pages than the size of the image. I think. (Carsten?)

For other (eg. embedded) systems, you may have a very large non-volatile filesystem. If you have to have struct pages for this, then your RAM consumption will go up proportionally to fs size. Even though it is just a small proportion, the RAM can be much more costly eg. in terms of power, so every KB less that Linux uses makes it more attractive to a lot of these guys.

2. VM_MIXEDMAP allows us to support mappings where you actually do want to refcount _some_ pages in the mapping, but not others, and support COW on arbitrary (non-linear) mappings. Jared needs this for his NVRAM filesystem in progress. Future iterations of this filesystem will most likely want to migrate pages between pagecache and XIP backing, which is where the requirement for mixed (some refcounted, some not) comes from.

3. pte_special also has a peripheral usage that I need for my lockless get_user_pages patch. That was shown to speed up "oltp" on db2 by 10% on a 2-socket system, which is kind of significant because they scrounge for months to try to find 0.1% improvement on these workloads. I'm hoping we might finally be faster than AIX on pSeries with this :). My reference to lockless get_user_pages is not meant to justify this patchset (which doesn't include lockless gup), but just to show that pte_special is not some s390-specific thing that should be hidden in arch code or xip code: I definitely want to use it on at least x86 and powerpc as well.

This patch:

Introduce a new type of mapping, VM_MIXEDMAP. This is unlike VM_PFNMAP in that it can support COW mappings of arbitrary ranges including ranges without struct page *and* ranges with a struct page that we actually want to refcount (PFNMAP can only support COW in those cases where the un-COW-ed translations are mapped linearly in the virtual address, and can only support non-refcounted ranges).

VM_MIXEDMAP achieves this by refcounting all pfn_valid pages, and not refcounting !pfn_valid pages (which is not an option for VM_PFNMAP, because it needs to avoid refcounting pfn_valid pages eg. for /dev/mem mappings).

Signed-off-by: Jared Hulbert <jaredeh@gmail.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
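[Editorial note, not part of the commit: a minimal sketch of how a driver or XIP-style filesystem might use the new flag under the rules above. The mydev_* names and the mydev_addr_to_pfn() helper are hypothetical placeholders, and the ->nopfn hook is assumed only because this kernel still has it (do_no_pfn() is touched in the diff below).]

#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical, driver-specific lookup of the raw backing pfn. */
static unsigned long mydev_addr_to_pfn(struct vm_area_struct *vma,
                                       unsigned long address);

static unsigned long mydev_nopfn(struct vm_area_struct *vma,
                                 unsigned long address)
{
        unsigned long pfn = mydev_addr_to_pfn(vma, address);

        /*
         * Only memory without a struct page may be handed back as a raw
         * pfn; pfn_valid() memory in a VM_MIXEDMAP vma is refcounted by
         * the VM and should be installed as a normal page instead.
         */
        BUG_ON(pfn_valid(pfn));
        return pfn;
}

static struct vm_operations_struct mydev_vm_ops = {
        .nopfn  = mydev_nopfn,
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
        /*
         * VM_MIXEDMAP rather than VM_PFNMAP: no linearity rule to obey,
         * COW works on arbitrary ranges, and any pfn_valid() pages in
         * the mapping remain normal, refcounted pages.
         */
        vma->vm_flags |= VM_MIXEDMAP;
        vma->vm_ops = &mydev_vm_ops;
        return 0;
}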
-rw-r--r--	include/linux/mm.h	1
-rw-r--r--	mm/memory.c	79
2 files changed, 59 insertions, 21 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 526f810367d9..c657ea0bd6aa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -107,6 +107,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ALWAYSDUMP   0x04000000      /* Always include in core dumps */
 
 #define VM_CAN_NONLINEAR 0x08000000     /* Has ->fault & does nonlinear pages */
+#define VM_MIXEDMAP     0x10000000      /* Can contain "struct page" and pure PFN pages */
 
 #ifndef VM_STACK_DEFAULT_FLAGS          /* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff --git a/mm/memory.c b/mm/memory.c
index 46958fb97c2d..0da414c383e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,35 +371,65 @@ static inline int is_cow_mapping(unsigned int flags)
 }
 
 /*
- * This function gets the "struct page" associated with a pte.
+ * This function gets the "struct page" associated with a pte or returns
+ * NULL if no "struct page" is associated with the pte.
  *
- * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
- * will have each page table entry just pointing to a raw page frame
- * number, and as far as the VM layer is concerned, those do not have
- * pages associated with them - even if the PFN might point to memory
+ * A raw VM_PFNMAP mapping (ie. one that is not COWed) may not have any "struct
+ * page" backing, and even if they do, they are not refcounted. COWed pages of
+ * a VM_PFNMAP do always have a struct page, and they are normally refcounted
+ * (they are _normal_ pages).
+ *
+ * So a raw PFNMAP mapping will have each page table entry just pointing
+ * to a page frame number, and as far as the VM layer is concerned, those do
+ * not have pages associated with them - even if the PFN might point to memory
  * that otherwise is perfectly fine and has a "struct page".
  *
- * The way we recognize those mappings is through the rules set up
- * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
- * and the vm_pgoff will point to the first PFN mapped: thus every
+ * The way we recognize COWed pages within VM_PFNMAP mappings is through the
+ * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every
  * page that is a raw mapping will always honor the rule
  *
  *      pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
  *
- * and if that isn't true, the page has been COW'ed (in which case it
- * _does_ have a "struct page" associated with it even if it is in a
- * VM_PFNMAP range).
+ * A call to vm_normal_page() will return NULL for such a page.
+ *
+ * If the page doesn't follow the "remap_pfn_range()" rule in a VM_PFNMAP
+ * then the page has been COW'ed. A COW'ed page _does_ have a "struct page"
+ * associated with it even if it is in a VM_PFNMAP range. Calling
+ * vm_normal_page() on such a page will therefore return the "struct page".
+ *
+ *
+ * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
+ * page" backing, however the difference is that _all_ pages with a struct
+ * page (that is, those where pfn_valid is true) are refcounted and considered
+ * normal pages by the VM. The disadvantage is that pages are refcounted
+ * (which can be slower and simply not an option for some PFNMAP users). The
+ * advantage is that we don't have to follow the strict linearity rule of
+ * PFNMAP mappings in order to support COWable mappings.
+ *
+ * A call to vm_normal_page() with a VM_MIXEDMAP mapping will return the
+ * associated "struct page" or NULL for memory not backed by a "struct page".
+ *
+ *
+ * All other mappings should have a valid struct page, which will be
+ * returned by a call to vm_normal_page().
  */
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
         unsigned long pfn = pte_pfn(pte);
 
-        if (unlikely(vma->vm_flags & VM_PFNMAP)) {
-                unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
-                if (pfn == vma->vm_pgoff + off)
-                        return NULL;
-                if (!is_cow_mapping(vma->vm_flags))
-                        return NULL;
+        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+                if (vma->vm_flags & VM_MIXEDMAP) {
+                        if (!pfn_valid(pfn))
+                                return NULL;
+                        goto out;
+                } else {
+                        unsigned long off = (addr-vma->vm_start) >> PAGE_SHIFT;
+                        if (pfn == vma->vm_pgoff + off)
+                                return NULL;
+                        if (!is_cow_mapping(vma->vm_flags))
+                                return NULL;
+                }
         }
 
 #ifdef CONFIG_DEBUG_VM
@@ -422,6 +452,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
          * The PAGE_ZERO() pages and various VDSO mappings can
          * cause them to exist.
          */
+out:
         return pfn_to_page(pfn);
 }
 
@@ -1232,8 +1263,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
         pte_t *pte, entry;
         spinlock_t *ptl;
 
-        BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-        BUG_ON(is_cow_mapping(vma->vm_flags));
+        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+                                                (VM_PFNMAP|VM_MIXEDMAP));
+        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+        BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
 
         retval = -ENOMEM;
         pte = get_locked_pte(mm, addr, &ptl);
@@ -2365,10 +2399,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long pfn;
 
         pte_unmap(page_table);
-        BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-        BUG_ON(is_cow_mapping(vma->vm_flags));
+        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 
         pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+
+        BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
         if (unlikely(pfn == NOPFN_OOM))
                 return VM_FAULT_OOM;
         else if (unlikely(pfn == NOPFN_SIGBUS))
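[Editorial note, not part of the diff: the contract spelled out in the new vm_normal_page() comment is that a NULL return means raw PFN memory the VM must leave alone, while a non-NULL return is a normal, refcounted page. A sketch of a hypothetical caller-side helper, using only functions present in this kernel; the pin_mapped_page() name is made up for illustration.]

#include <linux/mm.h>

/*
 * Hypothetical helper (a sketch, not from this patch): pin the page behind
 * a pte if and only if the VM considers it a normal, refcounted page.
 */
static struct page *pin_mapped_page(struct vm_area_struct *vma,
                                    unsigned long addr, pte_t pte)
{
        struct page *page = vm_normal_page(vma, addr, pte);

        if (!page)
                return NULL;    /* raw pfn: VM_PFNMAP, or !pfn_valid() under VM_MIXEDMAP */
        get_page(page);         /* pfn_valid() pages under VM_MIXEDMAP are refcounted */
        return page;
}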