Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  207
1 files changed, 109 insertions, 98 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61d380678030..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -277,19 +277,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
         unsigned long addr;
 
         for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+                src_pte = huge_pte_offset(src, addr);
+                if (!src_pte)
+                        continue;
                 dst_pte = huge_pte_alloc(dst, addr);
                 if (!dst_pte)
                         goto nomem;
+                spin_lock(&dst->page_table_lock);
                 spin_lock(&src->page_table_lock);
-                src_pte = huge_pte_offset(src, addr);
-                if (src_pte && !pte_none(*src_pte)) {
+                if (!pte_none(*src_pte)) {
                         entry = *src_pte;
                         ptepage = pte_page(entry);
                         get_page(ptepage);
-                        add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+                        add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
                         set_huge_pte_at(dst, addr, dst_pte, entry);
                 }
                 spin_unlock(&src->page_table_lock);
+                spin_unlock(&dst->page_table_lock);
         }
         return 0;
 
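
The copy path above charges the destination mm in base-page units: add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE) bumps file_rss by one count per base page covered by the huge page. A minimal user-space sketch of that arithmetic, assuming 4 KB base pages and 2 MB huge pages (illustrative values only; the kernel takes these from the architecture's PAGE_SHIFT/HPAGE_SHIFT):

/* Illustrative sketch: how many base pages one huge page adds to
 * file_rss.  The shift values are assumptions (x86-style 4 KB base
 * pages, 2 MB huge pages); the kernel uses its own arch constants. */
#include <stdio.h>

#define PAGE_SHIFT   12                      /* 4 KB base pages (assumed) */
#define HPAGE_SHIFT  21                      /* 2 MB huge pages (assumed) */
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define HPAGE_SIZE   (1UL << HPAGE_SHIFT)

int main(void)
{
        /* Same expression the hunk above uses as the rss delta. */
        unsigned long pages_per_hpage = HPAGE_SIZE / PAGE_SIZE;

        printf("one huge page accounts for %lu base pages of file_rss\n",
               pages_per_hpage);             /* prints 512 for 2 MB / 4 KB */
        return 0;
}

With those assumed sizes each mapped huge page shows up as 512 pages of file_rss, which is why both the copy and unmap paths adjust the counter by HPAGE_SIZE / PAGE_SIZE rather than by 1.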
@@ -310,12 +314,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
         BUG_ON(start & ~HPAGE_MASK);
         BUG_ON(end & ~HPAGE_MASK);
 
+        spin_lock(&mm->page_table_lock);
+
+        /* Update high watermark before we lower rss */
+        update_hiwater_rss(mm);
+
         for (address = start; address < end; address += HPAGE_SIZE) {
                 ptep = huge_pte_offset(mm, address);
-                if (! ptep)
-                        /* This can happen on truncate, or if an
-                         * mmap() is aborted due to an error before
-                         * the prefault */
+                if (!ptep)
                         continue;
 
                 pte = huge_ptep_get_and_clear(mm, address, ptep);
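
This hunk widens page_table_lock to cover the whole unmap loop and calls update_hiwater_rss(mm) before any file_rss is subtracted, so the recorded peak is captured before it is lowered. A minimal sketch of that ordering, with hypothetical field and function names (not the kernel's):

/* Sketch of the "record the high watermark before lowering the
 * counter" idea; names here are hypothetical stand-ins. */
#include <stdio.h>

struct counters {
        unsigned long rss;         /* current resident pages */
        unsigned long hiwater_rss; /* highest rss ever observed */
};

static void update_hiwater(struct counters *c)
{
        if (c->rss > c->hiwater_rss)
                c->hiwater_rss = c->rss;
}

int main(void)
{
        struct counters c = { .rss = 1536, .hiwater_rss = 1024 };

        update_hiwater(&c);        /* capture the peak first ...            */
        c.rss -= 512;              /* ... then lower rss, as the hunk does  */
        printf("rss=%lu hiwater=%lu\n", c.rss, c.hiwater_rss); /* 1024 1536 */
        return 0;
}

Doing the subtraction first would make the peak unrecoverable, which is exactly what the added comment in the hunk guards against.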
@@ -324,96 +330,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
                 page = pte_page(pte);
                 put_page(page);
-                add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE));
+                add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
         }
-        flush_tlb_range(vma, start, end);
-}
-
-void zap_hugepage_range(struct vm_area_struct *vma,
-                        unsigned long start, unsigned long length)
-{
-        struct mm_struct *mm = vma->vm_mm;
 
-        spin_lock(&mm->page_table_lock);
-        unmap_hugepage_range(vma, start, start + length);
         spin_unlock(&mm->page_table_lock);
+        flush_tlb_range(vma, start, end);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+                                        unsigned long idx)
 {
-        struct mm_struct *mm = current->mm;
-        unsigned long addr;
-        int ret = 0;
-
-        WARN_ON(!is_vm_hugetlb_page(vma));
-        BUG_ON(vma->vm_start & ~HPAGE_MASK);
-        BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-        hugetlb_prefault_arch_hook(mm);
+        struct page *page;
+        int err;
+        struct inode *inode = mapping->host;
+        unsigned long size;
+
+retry:
+        page = find_lock_page(mapping, idx);
+        if (page)
+                goto out;
 
-        spin_lock(&mm->page_table_lock);
-        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-                unsigned long idx;
-                pte_t *pte = huge_pte_alloc(mm, addr);
-                struct page *page;
-
-                if (!pte) {
-                        ret = -ENOMEM;
-                        goto out;
-                }
+        /* Check to make sure the mapping hasn't been truncated */
+        size = i_size_read(inode) >> HPAGE_SHIFT;
+        if (idx >= size)
+                goto out;
+
+        if (hugetlb_get_quota(mapping))
+                goto out;
+        page = alloc_huge_page();
+        if (!page) {
+                hugetlb_put_quota(mapping);
+                goto out;
+        }
 
-                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-                page = find_get_page(mapping, idx);
-                if (!page) {
-                        /* charge the fs quota first */
-                        if (hugetlb_get_quota(mapping)) {
-                                ret = -ENOMEM;
-                                goto out;
-                        }
-                        page = alloc_huge_page();
-                        if (!page) {
-                                hugetlb_put_quota(mapping);
-                                ret = -ENOMEM;
-                                goto out;
-                        }
-                        ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-                        if (! ret) {
-                                unlock_page(page);
-                        } else {
-                                hugetlb_put_quota(mapping);
-                                free_huge_page(page);
-                                goto out;
-                        }
-                }
-                add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-                set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+        if (err) {
+                put_page(page);
+                hugetlb_put_quota(mapping);
+                if (err == -EEXIST)
+                        goto retry;
+                page = NULL;
         }
 out:
-        spin_unlock(&mm->page_table_lock);
-        return ret;
+        return page;
 }
 
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully. Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         unsigned long address, int write_access)
 {
         int ret = VM_FAULT_SIGBUS;
+        unsigned long idx;
+        unsigned long size;
         pte_t *pte;
+        struct page *page;
+        struct address_space *mapping;
+
+        pte = huge_pte_alloc(mm, address);
+        if (!pte)
+                goto out;
+
+        mapping = vma->vm_file->f_mapping;
+        idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+                + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+        /*
+         * Use page lock to guard against racing truncation
+         * before we get page_table_lock.
+         */
+        page = find_lock_huge_page(mapping, idx);
+        if (!page)
+                goto out;
 
         spin_lock(&mm->page_table_lock);
-        pte = huge_pte_offset(mm, address);
-        if (pte && !pte_none(*pte))
-                ret = VM_FAULT_MINOR;
+        size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+        if (idx >= size)
+                goto backout;
+
+        ret = VM_FAULT_MINOR;
+        if (!pte_none(*pte))
+                goto backout;
+
+        add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+        set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
         spin_unlock(&mm->page_table_lock);
+        unlock_page(page);
+out:
         return ret;
+
+backout:
+        spin_unlock(&mm->page_table_lock);
+        hugetlb_put_quota(mapping);
+        unlock_page(page);
+        put_page(page);
+        goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
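
hugetlb_fault() above maps the faulting address to a huge-page-sized index into the backing hugetlbfs file: huge pages into the VMA, plus vm_pgoff (kept in base-page units) converted to huge-page units. A runnable sketch of that index arithmetic; every concrete number below is an assumed example, not taken from a real mapping:

/* Sketch of the page-cache index computation used by hugetlb_fault().
 * All shifts, addresses and offsets here are illustrative assumptions. */
#include <stdio.h>

#define PAGE_SHIFT   12                      /* 4 KB base pages (assumed) */
#define HPAGE_SHIFT  21                      /* 2 MB huge pages (assumed) */

int main(void)
{
        unsigned long vm_start = 0x40000000UL; /* where the VMA begins (assumed) */
        unsigned long vm_pgoff = 0x400UL;      /* file offset in 4 KB units:
                                                  0x400 * 4 KB = 4 MB = 2 huge pages */
        unsigned long address  = 0x40600000UL; /* faulting address, 6 MB into the VMA */

        /* Same arithmetic as the hunk: huge pages into the VMA, plus the
         * mapping's starting offset converted to huge-page units. */
        unsigned long idx = ((address - vm_start) >> HPAGE_SHIFT)
                + (vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

        printf("huge-page index into the file: %lu\n", idx); /* 3 + 2 = 5 */
        return 0;
}

find_lock_huge_page() then looks that index up in the page cache, allocating and inserting a fresh huge page on a miss, and retrying on -EEXIST if another thread raced the insert.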
@@ -423,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long vpfn, vaddr = *position;
         int remainder = *length;
 
-        BUG_ON(!is_vm_hugetlb_page(vma));
-
         vpfn = vaddr/PAGE_SIZE;
         spin_lock(&mm->page_table_lock);
         while (vaddr < vma->vm_end && remainder) {
+                pte_t *pte;
+                struct page *page;
 
-                if (pages) {
-                        pte_t *pte;
-                        struct page *page;
-
-                        /* Some archs (sparc64, sh*) have multiple
-                         * pte_ts to each hugepage. We have to make
-                         * sure we get the first, for the page
-                         * indexing below to work. */
-                        pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-                        /* the hugetlb file might have been truncated */
-                        if (!pte || pte_none(*pte)) {
-                                remainder = 0;
-                                if (!i)
-                                        i = -EFAULT;
-                                break;
-                        }
+                /*
+                 * Some archs (sparc64, sh*) have multiple pte_ts to
+                 * each hugepage. We have to make sure we get the
+                 * first, for the page indexing below to work.
+                 */
+                pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-                        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+                if (!pte || pte_none(*pte)) {
+                        int ret;
 
-                        WARN_ON(!PageCompound(page));
+                        spin_unlock(&mm->page_table_lock);
+                        ret = hugetlb_fault(mm, vma, vaddr, 0);
+                        spin_lock(&mm->page_table_lock);
+                        if (ret == VM_FAULT_MINOR)
+                                continue;
+
+                        remainder = 0;
+                        if (!i)
+                                i = -EFAULT;
+                        break;
+                }
 
+                if (pages) {
+                        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
                         get_page(page);
                         pages[i] = page;
                 }
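
follow_hugetlb_page() now treats an empty PTE as something to fault in rather than an error: it drops page_table_lock, calls hugetlb_fault(), retakes the lock and retries the same address via continue. A user-space sketch of that drop-lock/fill/relock/retry shape, with hypothetical names (table, fault_in, SLOTS) standing in for the kernel structures; compile with -pthread:

/* Sketch of the retry pattern: a missing entry is filled in outside
 * the lock and the same slot is re-checked afterwards. */
#include <pthread.h>
#include <stdio.h>

#define SLOTS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int table[SLOTS];                     /* 0 plays the role of pte_none */

static int fault_in(int i)                   /* stands in for hugetlb_fault() */
{
        table[i] = 1;
        return 0;                            /* "VM_FAULT_MINOR" */
}

int main(void)
{
        int i = 0;

        pthread_mutex_lock(&lock);
        while (i < SLOTS) {
                if (!table[i]) {
                        /* can't fill while holding the lock: drop, fill,
                         * retake, then retry the same slot */
                        pthread_mutex_unlock(&lock);
                        int ret = fault_in(i);
                        pthread_mutex_lock(&lock);
                        if (ret == 0)
                                continue;    /* re-check slot i */
                        break;               /* give up, like the -EFAULT path */
                }
                printf("slot %d present\n", i);
                i++;
        }
        pthread_mutex_unlock(&lock);
        return 0;
}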
