 fs/hugetlbfs/inode.c |   3
 mm/hugetlb.c         | 127
 2 files changed, 108 insertions(+), 22 deletions(-)
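Taken together, the two changes below add copy-on-write support for hugepage mappings: fs/hugetlbfs/inode.c stops rejecting private, writable mmap()s of hugetlbfs files, and mm/hugetlb.c grows the fault-side machinery (write-protect on fork plus hugetlb_cow()) that makes them work. As a rough illustration of what this enables, here is a hedged userspace sketch; the mount point /mnt/huge, the file name, and the 2 MB hugepage size are assumptions, not part of the patch, and enough hugepages must already be reserved via /proc/sys/vm/nr_hugepages.

/*
 * Hedged illustration, not part of the patch: a private, writable
 * mapping of a hugetlbfs file.  Before this change
 * hugetlbfs_file_mmap() refused the VM_WRITE-without-VM_MAYSHARE
 * case with -EINVAL; with hugepage COW in place the mapping is
 * expected to succeed and writes are handled by hugetlb_cow().
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SZ (2UL * 1024 * 1024)	/* assumed hugepage size */

int main(void)
{
	char *p;
	int fd;

	fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Length and offset must be hugepage aligned. */
	p = mmap(NULL, HPAGE_SZ, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");		/* -EINVAL without this patch */
		return 1;
	}

	/* The first write faults in a hugepage; if the process had
	 * fork()ed, this write would instead break COW on it. */
	memset(p, 0x55, HPAGE_SZ);

	munmap(p, HPAGE_SZ);
	close(fd);
	return 0;
}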
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c1cef3bb677..8c41315a6e42 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,9 +100,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	loff_t len, vma_len;
 	int ret;
 
-	if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE)
-		return -EINVAL;
-
 	if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1))
 		return -EINVAL;
 
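For reference, the test removed above fires exactly for private, writable mappings, i.e. VM_WRITE set with VM_MAYSHARE clear. A minimal restatement of that predicate follows; the helper name is hypothetical, not something the patch or the kernel defines.

/* Hypothetical helper, for illustration only: true exactly when a
 * VMA is writable but not shared -- the case hugetlbfs used to
 * reject and which hugepage COW now supports. */
static inline int hugetlb_private_writable(unsigned long vm_flags)
{
	return (vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE;
}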
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cf8225108b2f..da8a211414c9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -261,11 +261,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
 	.nopage = hugetlb_nopage,
 };
 
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+				int writable)
 {
 	pte_t entry;
 
-	if (vma->vm_flags & VM_WRITE) {
+	if (writable) {
 		entry =
 		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 	} else {
@@ -277,12 +278,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
 	return entry;
 }
 
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+				unsigned long address, pte_t *ptep)
+{
+	pte_t entry;
+
+	entry = pte_mkwrite(pte_mkdirty(*ptep));
+	ptep_set_access_flags(vma, address, ptep, entry, 1);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+}
+
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma)
 {
 	pte_t *src_pte, *dst_pte, entry;
 	struct page *ptepage;
 	unsigned long addr;
+	int cow;
+
+	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		src_pte = huge_pte_offset(src, addr);
@@ -294,6 +310,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
 		if (!pte_none(*src_pte)) {
+			if (cow)
+				ptep_set_wrprotect(src, addr, src_pte);
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
@@ -346,7 +364,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 static struct page *find_or_alloc_huge_page(struct address_space *mapping,
-				unsigned long idx)
+				unsigned long idx, int shared)
 {
 	struct page *page;
 	int err;
@@ -364,26 +382,80 @@ retry:
 		goto out;
 	}
 
-	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-	if (err) {
-		put_page(page);
-		hugetlb_put_quota(mapping);
-		if (err == -EEXIST)
-			goto retry;
-		page = NULL;
+	if (shared) {
+		err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+		if (err) {
+			put_page(page);
+			hugetlb_put_quota(mapping);
+			if (err == -EEXIST)
+				goto retry;
+			page = NULL;
+		}
+	} else {
+		/* Caller expects a locked page */
+		lock_page(page);
 	}
 out:
 	return page;
 }
 
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pte_t pte)
+{
+	struct page *old_page, *new_page;
+	int i, avoidcopy;
+
+	old_page = pte_page(pte);
+
+	/* If no-one else is actually using this page, avoid the copy
+	 * and just make the page writable */
+	avoidcopy = (page_count(old_page) == 1);
+	if (avoidcopy) {
+		set_huge_ptep_writable(vma, address, ptep);
+		return VM_FAULT_MINOR;
+	}
+
+	page_cache_get(old_page);
+	new_page = alloc_huge_page();
+
+	if (!new_page) {
+		page_cache_release(old_page);
+
+		/* Logically this is OOM, not a SIGBUS, but an OOM
+		 * could cause the kernel to go killing other
+		 * processes which won't help the hugepage situation
+		 * at all (?) */
+		return VM_FAULT_SIGBUS;
+	}
+
+	spin_unlock(&mm->page_table_lock);
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+		copy_user_highpage(new_page + i, old_page + i,
+				address + i*PAGE_SIZE);
+	spin_lock(&mm->page_table_lock);
+
+	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+	if (likely(pte_same(*ptep, pte))) {
+		/* Break COW */
+		set_huge_pte_at(mm, address, ptep,
+				make_huge_pte(vma, new_page, 1));
+		/* Make the old page be freed below */
+		new_page = old_page;
+	}
+	page_cache_release(new_page);
+	page_cache_release(old_page);
+	return VM_FAULT_MINOR;
+}
+
 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep)
+			unsigned long address, pte_t *ptep, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
 	unsigned long idx;
 	unsigned long size;
 	struct page *page;
 	struct address_space *mapping;
+	pte_t new_pte;
 
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -393,10 +465,13 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_or_alloc_huge_page(mapping, idx);
+	page = find_or_alloc_huge_page(mapping, idx,
+			vma->vm_flags & VM_SHARED);
 	if (!page)
 		goto out;
 
+	BUG_ON(!PageLocked(page));
+
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 	if (idx >= size)
@@ -407,7 +482,15 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto backout;
 
 	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-	set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, page));
+	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+				&& (vma->vm_flags & VM_SHARED)));
+	set_huge_pte_at(mm, address, ptep, new_pte);
+
+	if (write_access && !(vma->vm_flags & VM_SHARED)) {
+		/* Optimization, do the COW without a second fault */
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+	}
+
 	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 out:
@@ -426,6 +509,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	pte_t *ptep;
 	pte_t entry;
+	int ret;
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
@@ -433,13 +517,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	entry = *ptep;
 	if (pte_none(entry))
-		return hugetlb_no_page(mm, vma, address, ptep);
+		return hugetlb_no_page(mm, vma, address, ptep, write_access);
 
-	/*
-	 * We could get here if another thread instantiated the pte
-	 * before the test above.
-	 */
-	return VM_FAULT_MINOR;
+	ret = VM_FAULT_MINOR;
+
+	spin_lock(&mm->page_table_lock);
+	/* Check for a racing update before calling hugetlb_cow */
+	if (likely(pte_same(entry, *ptep)))
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+	spin_unlock(&mm->page_table_lock);
+
+	return ret;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
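One cost worth noting from hugetlb_cow() above: breaking COW copies the whole hugepage one base page at a time, with page_table_lock dropped around the copy loop. A rough worked example follows, assuming 2 MB hugepages over 4 KB base pages; both sizes are architecture dependent and the EXAMPLE_ names are illustrative only.

/* Illustrative arithmetic, not code from the patch. */
#define EXAMPLE_PAGE_SIZE	4096UL			/* assumed 4 KB base page */
#define EXAMPLE_HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed 2 MB hugepage  */

/* Number of copy_user_highpage() calls per broken COW:
 *   EXAMPLE_HPAGE_SIZE / EXAMPLE_PAGE_SIZE = 512
 * so hugetlb_cow() copies the full 2 MB, one base page at a time,
 * before re-taking page_table_lock and swapping in the new pte. */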
