author    David Gibson <david@gibson.dropbear.id.au>    2006-01-06 03:10:44 -0500
committer Linus Torvalds <torvalds@g5.osdl.org>         2006-01-06 11:33:23 -0500
commit    1e8f889b10d8d2223105719e36ce45688fedbd59 (patch)
tree      86dee89e4363aaf6c7ec7c9751ea37f725c95bb9 /mm
parent    86e5216f8d8aa258ba836caffe2613d79cc9aead (diff)
[PATCH] Hugetlb: Copy on Write support
Implement copy-on-write support for hugetlb mappings so MAP_PRIVATE can be
supported. This helps us to safely use hugetlb pages in many more
applications. The patch makes the following changes. If needed, I also have
it broken out according to the following paragraphs.
1. Add a pair of functions to set/clear write access on huge ptes. The
writable check in make_huge_pte is moved out to the caller for use by COW
later.
2. Hugetlb copy-on-write requires special case handling in the following
situations:
- copy_hugetlb_page_range() - Copied pages must be write protected so
a COW fault will be triggered (if necessary) if those pages are written
to.
- find_or_alloc_huge_page() - Only MAP_SHARED pages are added to the
page cache. MAP_PRIVATE pages still need to be locked however.
3. Provide hugetlb_cow() and calls from hugetlb_fault() and
hugetlb_no_page() which handles the COW fault by making the actual copy.
4. Remove the check in hugetlbfs_file_mmap() so that MAP_PRIVATE mmaps
will be allowed. Make MAP_PRIVATE hugetlb mmaps exempt from the deprecated
VM_RESERVED mapping check.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Adam Litke <agl@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Seth, Rohit" <rohit.seth@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
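
[Annotation, not part of the commit: in userspace terms, this patch makes a PROT_WRITE, MAP_PRIVATE mmap of a hugetlbfs file succeed and behave like an ordinary private file mapping. The demo below is a hypothetical sketch, assuming hugetlbfs mounted at /mnt/huge, 2MB huge pages, and at least one free huge page (see /proc/sys/vm/nr_hugepages); the file name is made up.]

/*
 * Hypothetical demo -- not part of the patch.  The MAP_PRIVATE mmap
 * below fails on kernels without this change, since hugetlbfs_file_mmap()
 * used to reject private writable mappings.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define LENGTH (2UL * 1024 * 1024)      /* one huge page on i386/x86-64 */

int main(void)
{
        int fd = open("/mnt/huge/cow-demo", O_CREAT | O_RDWR, 0600);
        if (fd < 0 || ftruncate(fd, LENGTH) < 0) {      /* size the file so faults don't SIGBUS */
                perror("open/ftruncate");
                return 1;
        }
        char *p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");         /* rejected before this patch */
                return 1;
        }

        p[0] = 'A';             /* first store: hugetlb_no_page() + immediate COW */

        if (fork() == 0) {      /* fork write-protects the huge ptes (see below) */
                p[0] = 'B';     /* child's store faults into hugetlb_cow() */
                _exit(0);
        }
        wait(NULL);
        printf("parent sees '%c'\n", p[0]);     /* still 'A': child wrote a private copy */

        munmap(p, LENGTH);
        close(fd);
        unlink("/mnt/huge/cow-demo");
        return 0;
}

The parent keeps seeing 'A' because the child's write goes to a freshly copied huge page; the hunks below show where each step of that happens.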
Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c | 127
1 file changed, 108 insertions(+), 19 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cf8225108b2f..da8a211414c9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -261,11 +261,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
         .nopage = hugetlb_nopage,
 };
 
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+                           int writable)
 {
         pte_t entry;
 
-        if (vma->vm_flags & VM_WRITE) {
+        if (writable) {
                 entry =
                     pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
         } else {
@@ -277,12 +278,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
         return entry;
 }
 
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+                                   unsigned long address, pte_t *ptep)
+{
+        pte_t entry;
+
+        entry = pte_mkwrite(pte_mkdirty(*ptep));
+        ptep_set_access_flags(vma, address, ptep, entry, 1);
+        update_mmu_cache(vma, address, entry);
+        lazy_mmu_prot_update(entry);
+}
+
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                             struct vm_area_struct *vma)
 {
         pte_t *src_pte, *dst_pte, entry;
         struct page *ptepage;
         unsigned long addr;
+        int cow;
+
+        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
         for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
                 src_pte = huge_pte_offset(src, addr);
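[Annotation: the cow test above fires only for mappings that are private (VM_SHARED clear) yet may be written (VM_MAYWRITE set, which covers both a current PROT_WRITE and a later mprotect(PROT_WRITE)). A hypothetical stand-alone rendering of the predicate, using the flag values from 2.6-era include/linux/mm.h:]

#include <stdio.h>

#define VM_SHARED   0x00000008UL        /* values from 2.6-era include/linux/mm.h */
#define VM_MAYWRITE 0x00000020UL

static int needs_fork_time_wrprotect(unsigned long vm_flags)
{
        /* COW at fork only for private mappings that may gain write access */
        return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

int main(void)
{
        printf("%d\n", needs_fork_time_wrprotect(VM_MAYWRITE));                /* 1: private, may write */
        printf("%d\n", needs_fork_time_wrprotect(VM_SHARED | VM_MAYWRITE));    /* 0: shared */
        printf("%d\n", needs_fork_time_wrprotect(0));                          /* 0: private, read-only */
        return 0;
}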
@@ -294,6 +310,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                 spin_lock(&dst->page_table_lock);
                 spin_lock(&src->page_table_lock);
                 if (!pte_none(*src_pte)) {
+                        if (cow)
+                                ptep_set_wrprotect(src, addr, src_pte);
                         entry = *src_pte;
                         ptepage = pte_page(entry);
                         get_page(ptepage);
@@ -346,7 +364,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 static struct page *find_or_alloc_huge_page(struct address_space *mapping,
-                                unsigned long idx)
+                                unsigned long idx, int shared)
 {
         struct page *page;
         int err;
@@ -364,26 +382,80 @@ retry:
                 goto out;
         }
 
-        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-        if (err) {
-                put_page(page);
-                hugetlb_put_quota(mapping);
-                if (err == -EEXIST)
-                        goto retry;
-                page = NULL;
+        if (shared) {
+                err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+                if (err) {
+                        put_page(page);
+                        hugetlb_put_quota(mapping);
+                        if (err == -EEXIST)
+                                goto retry;
+                        page = NULL;
+                }
+        } else {
+                /* Caller expects a locked page */
+                lock_page(page);
         }
 out:
         return page;
 }
 
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+                        unsigned long address, pte_t *ptep, pte_t pte)
+{
+        struct page *old_page, *new_page;
+        int i, avoidcopy;
+
+        old_page = pte_page(pte);
+
+        /* If no-one else is actually using this page, avoid the copy
+         * and just make the page writable */
+        avoidcopy = (page_count(old_page) == 1);
+        if (avoidcopy) {
+                set_huge_ptep_writable(vma, address, ptep);
+                return VM_FAULT_MINOR;
+        }
+
+        page_cache_get(old_page);
+        new_page = alloc_huge_page();
+
+        if (!new_page) {
+                page_cache_release(old_page);
+
+                /* Logically this is OOM, not a SIGBUS, but an OOM
+                 * could cause the kernel to go killing other
+                 * processes which won't help the hugepage situation
+                 * at all (?) */
+                return VM_FAULT_SIGBUS;
+        }
+
+        spin_unlock(&mm->page_table_lock);
+        for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+                copy_user_highpage(new_page + i, old_page + i,
+                                   address + i*PAGE_SIZE);
+        spin_lock(&mm->page_table_lock);
+
+        ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+        if (likely(pte_same(*ptep, pte))) {
+                /* Break COW */
+                set_huge_pte_at(mm, address, ptep,
+                                make_huge_pte(vma, new_page, 1));
+                /* Make the old page be freed below */
+                new_page = old_page;
+        }
+        page_cache_release(new_page);
+        page_cache_release(old_page);
+        return VM_FAULT_MINOR;
+}
+
 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                        unsigned long address, pte_t *ptep)
+                        unsigned long address, pte_t *ptep, int write_access)
 {
         int ret = VM_FAULT_SIGBUS;
         unsigned long idx;
         unsigned long size;
         struct page *page;
         struct address_space *mapping;
+        pte_t new_pte;
 
         mapping = vma->vm_file->f_mapping;
         idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
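[Annotation: hugetlb_cow() above uses the standard fault-handler idiom for expensive work: snapshot the pte, drop mm->page_table_lock around the multi-page copy, retake it, re-find the pte, and commit only if pte_same() shows no concurrent fault got there first. A userspace model of that commit step, with hypothetical names -- this is not kernel code, and the lock is assumed held on entry and held again on return:]

#include <pthread.h>
#include <stdbool.h>
#include <string.h>

struct fault_ctx {
        pthread_mutex_t lock;   /* models mm->page_table_lock */
        unsigned long pte;      /* models the huge pte contents */
};

/* Returns true if our copy was installed, false if a racing fault won. */
bool cow_commit(struct fault_ctx *mm, unsigned long pte_snapshot,
                unsigned long new_pte, void *dst, const void *src, size_t len)
{
        pthread_mutex_unlock(&mm->lock);        /* don't hold the lock while copying megabytes */
        memcpy(dst, src, len);                  /* models the copy_user_highpage() loop */
        pthread_mutex_lock(&mm->lock);

        if (mm->pte != pte_snapshot)            /* models pte_same(*ptep, pte) */
                return false;                   /* lost the race: caller discards the copy */
        mm->pte = new_pte;                      /* models set_huge_pte_at() */
        return true;
}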
@@ -393,10 +465,13 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * Use page lock to guard against racing truncation
          * before we get page_table_lock.
          */
-        page = find_or_alloc_huge_page(mapping, idx);
+        page = find_or_alloc_huge_page(mapping, idx,
+                        vma->vm_flags & VM_SHARED);
         if (!page)
                 goto out;
 
+        BUG_ON(!PageLocked(page));
+
         spin_lock(&mm->page_table_lock);
         size = i_size_read(mapping->host) >> HPAGE_SHIFT;
         if (idx >= size)
@@ -407,7 +482,15 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto backout;
 
         add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-        set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+                                && (vma->vm_flags & VM_SHARED)));
+        set_huge_pte_at(mm, address, ptep, new_pte);
+
+        if (write_access && !(vma->vm_flags & VM_SHARED)) {
+                /* Optimization, do the COW without a second fault */
+                ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+        }
+
         spin_unlock(&mm->page_table_lock);
         unlock_page(page);
 out:
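[Annotation: the writability decision above is what arms COW for private mappings. Only VM_WRITE together with VM_SHARED yields a writable pte up front; a private mapping starts read-only even when mapped PROT_WRITE, so its first store faults into hugetlb_cow(), and when the faulting access is already a write the code calls hugetlb_cow() immediately to spare that second fault. Spelled out as a hypothetical helper, with flag values from 2.6-era include/linux/mm.h:]

#define VM_WRITE  0x00000002UL  /* values from 2.6-era include/linux/mm.h */
#define VM_SHARED 0x00000008UL

/* Hypothetical helper -- the patch computes this inline. */
static inline int huge_pte_initially_writable(unsigned long vm_flags)
{
        /* Shared writable mappings modify the page-cache page in place;
         * private mappings begin read-only so the first store triggers COW. */
        return (vm_flags & VM_WRITE) && (vm_flags & VM_SHARED);
}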
@@ -426,6 +509,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 {
         pte_t *ptep;
         pte_t entry;
+        int ret;
 
         ptep = huge_pte_alloc(mm, address);
         if (!ptep)
@@ -433,13 +517,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
         entry = *ptep;
         if (pte_none(entry))
-                return hugetlb_no_page(mm, vma, address, ptep);
+                return hugetlb_no_page(mm, vma, address, ptep, write_access);
 
-        /*
-         * We could get here if another thread instantiated the pte
-         * before the test above.
-         */
-        return VM_FAULT_MINOR;
+        ret = VM_FAULT_MINOR;
+
+        spin_lock(&mm->page_table_lock);
+        /* Check for a racing update before calling hugetlb_cow */
+        if (likely(pte_same(entry, *ptep)))
+                if (write_access && !pte_write(entry))
+                        ret = hugetlb_cow(mm, vma, address, ptep, entry);
+        spin_unlock(&mm->page_table_lock);
+
+        return ret;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,