 Documentation/filesystems/Locking | 10
 include/linux/mm.h                |  8
 mm/memory.c                       | 81
 3 files changed, 96 insertions, 3 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f424e0e5b46b..efca5c1bbb10 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -529,6 +529,7 @@ locking rules:
 open:		yes
 close:		yes
 fault:		yes		can return with page locked
+map_pages:	yes
 page_mkwrite:	yes		can return with page locked
 access:		yes
 
@@ -540,6 +541,15 @@ the page, then ensure it is not already truncated (the page lock will block
 subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
 locked. The VM will unlock the page.
 
+	->map_pages() is called when the VM asks to map easily accessible
+pages. The filesystem should find and map pages associated with offsets
+from "pgoff" till "max_pgoff". ->map_pages() is called with the page
+table locked and must not block. If a page cannot be reached without
+blocking, the filesystem should skip it. The filesystem should use
+do_set_pte() to set up the page table entry. A pointer to the entry for
+offset "pgoff" is passed in the "pte" field of the vm_fault structure;
+pointers to entries for other offsets should be calculated relative to it.
+
 	->page_mkwrite() is called when a previously read-only pte is
 about to become writeable. The filesystem again must ensure that there are
 no truncate/invalidate races, and then return with the page locked. If
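
To make the contract above concrete, here is a minimal, hypothetical sketch of a ->map_pages() implementation. It is not part of this patch: the name example_map_pages and the exact page-cache checks are assumptions for illustration only. It walks the offsets from "pgoff" to "max_pgoff", skips anything that would require blocking, and uses do_set_pte() for pages that are already uptodate:

/*
 * Hypothetical ->map_pages() sketch (not part of this patch). Called with
 * the page table lock held, so only non-blocking operations are used.
 */
static void example_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long addr = (unsigned long)vmf->virtual_address;
	pgoff_t pgoff = vmf->pgoff;
	pte_t *pte = vmf->pte;
	struct page *page;

	for (; pgoff <= vmf->max_pgoff; pgoff++, pte++, addr += PAGE_SIZE) {
		if (!pte_none(*pte))
			continue;			/* already populated */
		page = find_get_page(mapping, pgoff);	/* lockless lookup */
		if (!page)
			continue;
		if (!trylock_page(page)) {		/* must not block */
			page_cache_release(page);
			continue;
		}
		if (page->mapping == mapping && PageUptodate(page)) {
			/* the new pte keeps the page reference */
			do_set_pte(vma, addr, page, pte, false, false);
			unlock_page(page);
		} else {
			unlock_page(page);
			page_cache_release(page);
		}
	}
}

Note that when do_set_pte() installs an entry, the reference taken by find_get_page() is left in place for the new mapping; only skipped pages are released.
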
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c270fa68a32b..f710d32291e8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -213,6 +213,10 @@ struct vm_fault {
 					 * is set (which is also implied by
 					 * VM_FAULT_ERROR).
 					 */
+	/* for ->map_pages() only */
+	pgoff_t max_pgoff;		/* map pages for offset from pgoff till
+					 * max_pgoff inclusive */
+	pte_t *pte;			/* pte entry associated with ->pgoff */
 };
 
 /*
@@ -224,6 +228,7 @@ struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
 	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+	void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
@@ -584,6 +589,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 		pte = pte_mkwrite(pte);
 	return pte;
 }
+
+void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+		struct page *page, pte_t *pte, bool write, bool anon);
 #endif
 
 /*
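
Assuming a filesystem provides a helper like the sketch above, wiring it up is only a matter of filling the new vm_operations_struct field. A hypothetical example (filemap_fault and filemap_page_mkwrite are the existing generic helpers; example_map_pages is an assumed name):

static const struct vm_operations_struct example_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= example_map_pages,	/* sketch shown earlier */
	.page_mkwrite	= filemap_page_mkwrite,
};
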
diff --git a/mm/memory.c b/mm/memory.c
index c6ee34d10fcc..4eefb7e31521 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3342,7 +3342,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	return ret;
 }
 
-static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+/**
+ * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
+ *
+ * @vma: virtual memory area
+ * @address: user virtual address
+ * @page: page to map
+ * @pte: pointer to target page table entry
+ * @write: true if the new entry is writable
+ * @anon: true if it's an anonymous page
+ *
+ * Caller must hold the page table lock relevant for @pte.
+ *
+ * Target users are the page fault handler itself and implementations of
+ * vm_ops->map_pages.
+ */
+void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 		struct page *page, pte_t *pte, bool write, bool anon)
 {
 	pte_t entry;
@@ -3366,6 +3381,52 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 	update_mmu_cache(vma, address, pte);
 }
 
+#define FAULT_AROUND_ORDER 4
+#define FAULT_AROUND_PAGES (1UL << FAULT_AROUND_ORDER)
+#define FAULT_AROUND_MASK ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1)
+
+static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
+		pte_t *pte, pgoff_t pgoff, unsigned int flags)
+{
+	unsigned long start_addr;
+	pgoff_t max_pgoff;
+	struct vm_fault vmf;
+	int off;
+
+	BUILD_BUG_ON(FAULT_AROUND_PAGES > PTRS_PER_PTE);
+
+	start_addr = max(address & FAULT_AROUND_MASK, vma->vm_start);
+	off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+	pte -= off;
+	pgoff -= off;
+
+	/*
+	 *  max_pgoff is either end of page table or end of vma
+	 *  or FAULT_AROUND_PAGES from pgoff, depending what is nearest.
+	 */
+	max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+		PTRS_PER_PTE - 1;
+	max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
+			pgoff + FAULT_AROUND_PAGES - 1);
+
+	/* Check if it makes any sense to call ->map_pages */
+	while (!pte_none(*pte)) {
+		if (++pgoff > max_pgoff)
+			return;
+		start_addr += PAGE_SIZE;
+		if (start_addr >= vma->vm_end)
+			return;
+		pte++;
+	}
+
+	vmf.virtual_address = (void __user *) start_addr;
+	vmf.pte = pte;
+	vmf.pgoff = pgoff;
+	vmf.max_pgoff = max_pgoff;
+	vmf.flags = flags;
+	vma->vm_ops->map_pages(vma, &vmf);
+}
+
 static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pmd_t *pmd,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
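
The window arithmetic in do_fault_around() can be illustrated with a small stand-alone program. This is only an illustration, assuming 4 KB pages and 512 PTEs per page table; the vma layout and faulting address are made-up values. With FAULT_AROUND_ORDER 4 the window is at most 16 pages (64 KB), aligned down to a 64 KB boundary and clamped so it never crosses the end of the vma or the end of the current page table page:

#include <stdio.h>

#define PAGE_SHIFT		12
#define PTRS_PER_PTE		512UL
#define FAULT_AROUND_ORDER	4
#define FAULT_AROUND_PAGES	(1UL << FAULT_AROUND_ORDER)
#define FAULT_AROUND_MASK	~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1)
#define MIN(a, b)		((a) < (b) ? (a) : (b))

int main(void)
{
	/* made-up example: a 10-page vma at 0x400000, file offset 0 */
	unsigned long vm_start = 0x400000, vma_pages = 10, vm_pgoff = 0;
	unsigned long address = 0x404567;	/* faulting address */
	unsigned long start_addr, off, pgoff, max_pgoff;

	pgoff = vm_pgoff + ((address - vm_start) >> PAGE_SHIFT);
	start_addr = address & FAULT_AROUND_MASK;
	if (start_addr < vm_start)
		start_addr = vm_start;
	off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	pgoff -= off;		/* first offset in the fault-around window */

	/* end of page table, end of vma, or 16 pages: whichever is nearest */
	max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
			PTRS_PER_PTE - 1;
	max_pgoff = MIN(max_pgoff, vma_pages + vm_pgoff - 1);
	max_pgoff = MIN(max_pgoff, pgoff + FAULT_AROUND_PAGES - 1);

	printf("start_addr=%#lx pgoff=%lu..%lu (%lu pages)\n",
	       start_addr, pgoff, max_pgoff, max_pgoff - pgoff + 1);
	/* prints: start_addr=0x400000 pgoff=0..9 (10 pages) */
	return 0;
}

Here the 64 KB window would cover offsets 0 through 15, but the vma is only ten pages long, so max_pgoff is clamped to 9 and ->map_pages() is asked to map offsets 0 through 9.
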
@@ -3373,7 +3434,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *fault_page;
 	spinlock_t *ptl;
 	pte_t *pte;
-	int ret;
+	int ret = 0;
+
+	/*
+	 * Let's call ->map_pages() first and use ->fault() as a fallback
+	 * if the page at the offset is not ready to be mapped (cold page
+	 * cache or something).
+	 */
+	if (vma->vm_ops->map_pages) {
+		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+		do_fault_around(vma, address, pte, pgoff, flags);
+		if (!pte_same(*pte, orig_pte))
+			goto unlock_out;
+		pte_unmap_unlock(pte, ptl);
+	}
 
 	ret = __do_fault(vma, address, pgoff, flags, &fault_page);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3387,8 +3461,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return ret;
 	}
 	do_set_pte(vma, address, fault_page, pte, false, false);
-	pte_unmap_unlock(pte, ptl);
 	unlock_page(fault_page);
+unlock_out:
+	pte_unmap_unlock(pte, ptl);
 	return ret;
 }
 