aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: David Gibson <david@gibson.dropbear.id.au> 2006-01-06 03:10:44 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-06 11:33:23 -0500
commit: 1e8f889b10d8d2223105719e36ce45688fedbd59 (patch)
tree: 86dee89e4363aaf6c7ec7c9751ea37f725c95bb9
parent: 86e5216f8d8aa258ba836caffe2613d79cc9aead (diff)
[PATCH] Hugetlb: Copy on Write support
Implement copy-on-write support for hugetlb mappings so MAP_PRIVATE can be supported. This helps us to safely use hugetlb pages in many more applications. The patch makes the following changes. If needed, I also have it broken out according to the following paragraphs. 1. Add a pair of functions to set/clear write access on huge ptes. The writable check in make_huge_pte is moved out to the caller for use by COW later. 2. Hugetlb copy-on-write requires special case handling in the following situations: - copy_hugetlb_page_range() - Copied pages must be write protected so a COW fault will be triggered (if necessary) if those pages are written to. - find_or_alloc_huge_page() - Only MAP_SHARED pages are added to the page cache. MAP_PRIVATE pages still need to be locked however. 3. Provide hugetlb_cow(), called from hugetlb_fault() and hugetlb_no_page(), which handles the COW fault by making the actual copy. 4. Remove the check in hugetlbfs_file_mmap() so that MAP_PRIVATE mmaps will be allowed. Make MAP_HUGETLB exempt from the deprecated VM_RESERVED mapping check. Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Adam Litke <agl@us.ibm.com> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "Seth, Rohit" <rohit.seth@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- fs/hugetlbfs/inode.c 3
-rw-r--r-- mm/hugetlb.c 127
2 files changed, 108 insertions, 22 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c1cef3bb677..8c41315a6e42 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,9 +100,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
100 loff_t len, vma_len; 100 loff_t len, vma_len;
101 int ret; 101 int ret;
102 102
103 if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
104 return -EINVAL;
105
106 if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1)) 103 if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
107 return -EINVAL; 104 return -EINVAL;
108 105
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cf8225108b2f..da8a211414c9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -261,11 +261,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
261 .nopage = hugetlb_nopage, 261 .nopage = hugetlb_nopage,
262}; 262};
263 263
264static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) 264static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
265 int writable)
265{ 266{
266 pte_t entry; 267 pte_t entry;
267 268
268 if (vma->vm_flags & VM_WRITE) { 269 if (writable) {
269 entry = 270 entry =
270 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 271 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
271 } else { 272 } else {
@@ -277,12 +278,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
277 return entry; 278 return entry;
278} 279}
279 280
281static void set_huge_ptep_writable(struct vm_area_struct *vma,
282 unsigned long address, pte_t *ptep)
283{
284 pte_t entry;
285
286 entry = pte_mkwrite(pte_mkdirty(*ptep));
287 ptep_set_access_flags(vma, address, ptep, entry, 1);
288 update_mmu_cache(vma, address, entry);
289 lazy_mmu_prot_update(entry);
290}
291
292
280int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 293int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
281 struct vm_area_struct *vma) 294 struct vm_area_struct *vma)
282{ 295{
283 pte_t *src_pte, *dst_pte, entry; 296 pte_t *src_pte, *dst_pte, entry;
284 struct page *ptepage; 297 struct page *ptepage;
285 unsigned long addr; 298 unsigned long addr;
299 int cow;
300
301 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
286 302
287 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 303 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
288 src_pte = huge_pte_offset(src, addr); 304 src_pte = huge_pte_offset(src, addr);
@@ -294,6 +310,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
294 spin_lock(&dst->page_table_lock); 310 spin_lock(&dst->page_table_lock);
295 spin_lock(&src->page_table_lock); 311 spin_lock(&src->page_table_lock);
296 if (!pte_none(*src_pte)) { 312 if (!pte_none(*src_pte)) {
313 if (cow)
314 ptep_set_wrprotect(src, addr, src_pte);
297 entry = *src_pte; 315 entry = *src_pte;
298 ptepage = pte_page(entry); 316 ptepage = pte_page(entry);
299 get_page(ptepage); 317 get_page(ptepage);
@@ -346,7 +364,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
346} 364}
347 365
348static struct page *find_or_alloc_huge_page(struct address_space *mapping, 366static struct page *find_or_alloc_huge_page(struct address_space *mapping,
349 unsigned long idx) 367 unsigned long idx, int shared)
350{ 368{
351 struct page *page; 369 struct page *page;
352 int err; 370 int err;
@@ -364,26 +382,80 @@ retry:
364 goto out; 382 goto out;
365 } 383 }
366 384
367 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 385 if (shared) {
368 if (err) { 386 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
369 put_page(page); 387 if (err) {
370 hugetlb_put_quota(mapping); 388 put_page(page);
371 if (err == -EEXIST) 389 hugetlb_put_quota(mapping);
372 goto retry; 390 if (err == -EEXIST)
373 page = NULL; 391 goto retry;
392 page = NULL;
393 }
394 } else {
395 /* Caller expects a locked page */
396 lock_page(page);
374 } 397 }
375out: 398out:
376 return page; 399 return page;
377} 400}
378 401
402static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
403 unsigned long address, pte_t *ptep, pte_t pte)
404{
405 struct page *old_page, *new_page;
406 int i, avoidcopy;
407
408 old_page = pte_page(pte);
409
410 /* If no-one else is actually using this page, avoid the copy
411 * and just make the page writable */
412 avoidcopy = (page_count(old_page) == 1);
413 if (avoidcopy) {
414 set_huge_ptep_writable(vma, address, ptep);
415 return VM_FAULT_MINOR;
416 }
417
418 page_cache_get(old_page);
419 new_page = alloc_huge_page();
420
421 if (!new_page) {
422 page_cache_release(old_page);
423
424 /* Logically this is OOM, not a SIGBUS, but an OOM
425 * could cause the kernel to go killing other
426 * processes which won't help the hugepage situation
427 * at all (?) */
428 return VM_FAULT_SIGBUS;
429 }
430
431 spin_unlock(&mm->page_table_lock);
432 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
433 copy_user_highpage(new_page + i, old_page + i,
434 address + i*PAGE_SIZE);
435 spin_lock(&mm->page_table_lock);
436
437 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
438 if (likely(pte_same(*ptep, pte))) {
439 /* Break COW */
440 set_huge_pte_at(mm, address, ptep,
441 make_huge_pte(vma, new_page, 1));
442 /* Make the old page be freed below */
443 new_page = old_page;
444 }
445 page_cache_release(new_page);
446 page_cache_release(old_page);
447 return VM_FAULT_MINOR;
448}
449
379int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 450int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
380 unsigned long address, pte_t *ptep) 451 unsigned long address, pte_t *ptep, int write_access)
381{ 452{
382 int ret = VM_FAULT_SIGBUS; 453 int ret = VM_FAULT_SIGBUS;
383 unsigned long idx; 454 unsigned long idx;
384 unsigned long size; 455 unsigned long size;
385 struct page *page; 456 struct page *page;
386 struct address_space *mapping; 457 struct address_space *mapping;
458 pte_t new_pte;
387 459
388 mapping = vma->vm_file->f_mapping; 460 mapping = vma->vm_file->f_mapping;
389 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 461 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -393,10 +465,13 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
393 * Use page lock to guard against racing truncation 465 * Use page lock to guard against racing truncation
394 * before we get page_table_lock. 466 * before we get page_table_lock.
395 */ 467 */
396 page = find_or_alloc_huge_page(mapping, idx); 468 page = find_or_alloc_huge_page(mapping, idx,
469 vma->vm_flags & VM_SHARED);
397 if (!page) 470 if (!page)
398 goto out; 471 goto out;
399 472
473 BUG_ON(!PageLocked(page));
474
400 spin_lock(&mm->page_table_lock); 475 spin_lock(&mm->page_table_lock);
401 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 476 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
402 if (idx >= size) 477 if (idx >= size)
@@ -407,7 +482,15 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
407 goto backout; 482 goto backout;
408 483
409 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 484 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
410 set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page)); 485 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
486 && (vma->vm_flags & VM_SHARED)));
487 set_huge_pte_at(mm, address, ptep, new_pte);
488
489 if (write_access && !(vma->vm_flags & VM_SHARED)) {
490 /* Optimization, do the COW without a second fault */
491 ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
492 }
493
411 spin_unlock(&mm->page_table_lock); 494 spin_unlock(&mm->page_table_lock);
412 unlock_page(page); 495 unlock_page(page);
413out: 496out:
@@ -426,6 +509,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
426{ 509{
427 pte_t *ptep; 510 pte_t *ptep;
428 pte_t entry; 511 pte_t entry;
512 int ret;
429 513
430 ptep = huge_pte_alloc(mm, address); 514 ptep = huge_pte_alloc(mm, address);
431 if (!ptep) 515 if (!ptep)
@@ -433,13 +517,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
433 517
434 entry = *ptep; 518 entry = *ptep;
435 if (pte_none(entry)) 519 if (pte_none(entry))
436 return hugetlb_no_page(mm, vma, address, ptep); 520 return hugetlb_no_page(mm, vma, address, ptep, write_access);
437 521
438 /* 522 ret = VM_FAULT_MINOR;
439 * We could get here if another thread instantiated the pte 523
440 * before the test above. 524 spin_lock(&mm->page_table_lock);
441 */ 525 /* Check for a racing update before calling hugetlb_cow */
442 return VM_FAULT_MINOR; 526 if (likely(pte_same(entry, *ptep)))
527 if (write_access && !pte_write(entry))
528 ret = hugetlb_cow(mm, vma, address, ptep, entry);
529 spin_unlock(&mm->page_table_lock);
530
531 return ret;
443} 532}
444 533
445int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 534int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,