-rw-r--r--  Documentation/filesystems/Locking | 10
-rw-r--r--  include/linux/mm.h                |  8
-rw-r--r--  mm/memory.c                       | 81
3 files changed, 96 insertions(+), 3 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f424e0e5b46b..efca5c1bbb10 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -529,6 +529,7 @@ locking rules:
 open:		yes
 close:		yes
 fault:		yes		can return with page locked
+map_pages:	yes
 page_mkwrite:	yes		can return with page locked
 access:		yes
 
@@ -540,6 +541,15 @@ the page, then ensure it is not already truncated (the page lock will block
 subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
 locked. The VM will unlock the page.
 
+	->map_pages() is called when the VM asks to map easily accessible
+pages. The filesystem should find and map pages associated with offsets
+from "pgoff" to "max_pgoff". ->map_pages() is called with the page table
+lock held and must not block. If a page cannot be reached without
+blocking, the filesystem should skip it. The filesystem should use
+do_set_pte() to set up each page table entry. A pointer to the entry for
+offset "pgoff" is passed in the "pte" field of the vm_fault structure;
+pointers to entries for other offsets should be calculated relative to it.
+
 	->page_mkwrite() is called when a previously read-only pte is
 about to become writeable. The filesystem again must ensure that there are
 no truncate/invalidate races, and then return with the page locked. If
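
For illustration only (this sketch is not part of the patch): one way a page-cache backed filesystem could implement the contract above. The name example_map_pages and the skip-on-contention policy are made up for this example; the kernel's generic implementation of the hook is filemap_map_pages(). Because the hook runs under the page table lock and must not block, the sketch only maps pages that are already in the page cache, uptodate, and immediately lockable, and silently skips everything else.

static void example_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long address = (unsigned long)vmf->virtual_address;
	pgoff_t pgoff = vmf->pgoff;
	pte_t *pte = vmf->pte;	/* entry for vmf->pgoff; others are relative */
	struct page *page;

	for (; pgoff <= vmf->max_pgoff; pgoff++, pte++, address += PAGE_SIZE) {
		/* Don't touch slots that are already populated. */
		if (!pte_none(*pte))
			continue;

		/* Lockless page cache lookup; nothing below may sleep. */
		page = find_get_page(mapping, pgoff);
		if (!page)
			continue;
		if (!trylock_page(page)) {
			page_cache_release(page);
			continue;
		}
		/* Recheck under the page lock: not truncated, fully read? */
		if (page->mapping != mapping || !PageUptodate(page)) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}
		/* The reference from find_get_page() stays with the mapping. */
		do_set_pte(vma, address, page, pte, false, false);
		unlock_page(page);
	}
}
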
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c270fa68a32b..f710d32291e8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -213,6 +213,10 @@ struct vm_fault {
 					 * is set (which is also implied by
 					 * VM_FAULT_ERROR).
 					 */
+	/* for ->map_pages() only */
+	pgoff_t max_pgoff;		/* map pages for offset from pgoff till
+					 * max_pgoff inclusive */
+	pte_t *pte;			/* pte entry associated with ->pgoff */
 };
 
 /*
@@ -224,6 +228,7 @@ struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+	void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
@@ -584,6 +589,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 		pte = pte_mkwrite(pte);
 	return pte;
 }
+
+void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+		struct page *page, pte_t *pte, bool write, bool anon);
 #endif
 
 /*
diff --git a/mm/memory.c b/mm/memory.c
index c6ee34d10fcc..4eefb7e31521 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3342,7 +3342,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	return ret;
 }
 
-static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+/**
+ * do_set_pte - set up a new PTE entry for the given page and add reverse mapping
+ *
+ * @vma: virtual memory area
+ * @address: user virtual address
+ * @page: page to map
+ * @pte: pointer to target page table entry
+ * @write: true if the new entry is writable
+ * @anon: true if it is an anonymous page
+ *
+ * Caller must hold the page table lock relevant for @pte.
+ *
+ * Target users are the page fault handler itself and implementations of
+ * vm_ops->map_pages.
+ */
+void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 		struct page *page, pte_t *pte, bool write, bool anon)
 {
 	pte_t entry;
@@ -3366,6 +3381,52 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 	update_mmu_cache(vma, address, pte);
 }
 
+#define FAULT_AROUND_ORDER 4
+#define FAULT_AROUND_PAGES (1UL << FAULT_AROUND_ORDER)
+#define FAULT_AROUND_MASK ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1)
+
+static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
+		pte_t *pte, pgoff_t pgoff, unsigned int flags)
+{
+	unsigned long start_addr;
+	pgoff_t max_pgoff;
+	struct vm_fault vmf;
+	int off;
+
+	BUILD_BUG_ON(FAULT_AROUND_PAGES > PTRS_PER_PTE);
+
+	start_addr = max(address & FAULT_AROUND_MASK, vma->vm_start);
+	off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+	pte -= off;
+	pgoff -= off;
+
+	/*
+	 *  max_pgoff is either the end of the page table, the end of the vma,
+	 *  or FAULT_AROUND_PAGES from pgoff, depending on what is nearest.
+	 */
+	max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+		PTRS_PER_PTE - 1;
+	max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
+			pgoff + FAULT_AROUND_PAGES - 1);
+
+	/* Check if it makes any sense to call ->map_pages */
+	while (!pte_none(*pte)) {
+		if (++pgoff > max_pgoff)
+			return;
+		start_addr += PAGE_SIZE;
+		if (start_addr >= vma->vm_end)
+			return;
+		pte++;
+	}
+
+	vmf.virtual_address = (void __user *) start_addr;
+	vmf.pte = pte;
+	vmf.pgoff = pgoff;
+	vmf.max_pgoff = max_pgoff;
+	vmf.flags = flags;
+	vma->vm_ops->map_pages(vma, &vmf);
+}
+
 static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
@@ -3373,7 +3434,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *fault_page;
 	spinlock_t *ptl;
 	pte_t *pte;
-	int ret;
+	int ret = 0;
+
+	/*
+	 * Let's call ->map_pages() first and use ->fault() as a fallback
+	 * if the page at this offset is not ready to be mapped (cold page
+	 * cache or something).
+	 */
+	if (vma->vm_ops->map_pages) {
+		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+		do_fault_around(vma, address, pte, pgoff, flags);
+		if (!pte_same(*pte, orig_pte))
+			goto unlock_out;
+		pte_unmap_unlock(pte, ptl);
+	}
 
 	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3387,8 +3461,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return ret;
 	}
 	do_set_pte(vma, address, fault_page, pte, false, false);
-	pte_unmap_unlock(pte, ptl);
 	unlock_page(fault_page);
+unlock_out:
+	pte_unmap_unlock(pte, ptl);
 	return ret;
 }
 
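
To make the do_fault_around() window arithmetic concrete, here is a small stand-alone user-space walk-through (not part of the patch). PAGE_SHIFT, PTRS_PER_PTE and the sample VMA/address values are assumptions picked for illustration (typical x86-64 numbers). With FAULT_AROUND_ORDER = 4 the start address is rounded down to a 64KB boundary (but never below vma->vm_start), and max_pgoff is clamped by whichever is nearest: the end of the current page table, the end of the VMA, or FAULT_AROUND_PAGES entries from the adjusted pgoff. In this particular example the 16-page limit wins, so the window covers file pages 0x20..0x2f around the faulting page 0x27.

#include <stdio.h>

/* Assumed x86-64 values, for illustration only. */
#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PTRS_PER_PTE	512UL

#define FAULT_AROUND_ORDER 4
#define FAULT_AROUND_PAGES (1UL << FAULT_AROUND_ORDER)
#define FAULT_AROUND_MASK ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1)

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a < b ? a : b;
	return m < c ? m : c;
}

int main(void)
{
	/* Hypothetical VMA: 32 pages of a large file, starting at file page 0x20. */
	unsigned long vm_start = 0x7f1234560000UL;
	unsigned long vm_end   = 0x7f1234580000UL;
	unsigned long vm_pgoff = 0x20;
	unsigned long nr_vma_pages = (vm_end - vm_start) >> PAGE_SHIFT;

	/* Fault somewhere in the 8th page of the VMA (file page 0x27). */
	unsigned long address = vm_start + 7 * PAGE_SIZE + 0x890;
	unsigned long pgoff = vm_pgoff + ((address - vm_start) >> PAGE_SHIFT);
	unsigned long start_addr, off, max_pgoff;

	/* Same steps as do_fault_around(), minus the pte pointer handling. */
	start_addr = address & FAULT_AROUND_MASK;
	if (start_addr < vm_start)
		start_addr = vm_start;
	off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	pgoff -= off;

	max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	max_pgoff = min3ul(max_pgoff, nr_vma_pages + vm_pgoff - 1,
			pgoff + FAULT_AROUND_PAGES - 1);

	printf("start_addr = %#lx\n", start_addr);
	printf("window     = file pages %#lx..%#lx (%lu pages)\n",
	       pgoff, max_pgoff, max_pgoff - pgoff + 1);
	return 0;
}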