-rw-r--r--	fs/hugetlbfs/inode.c |   7
-rw-r--r--	mm/hugetlb.c         | 180
2 files changed, 97 insertions(+), 90 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8f94feb24c0a..2627efe767cf 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -48,7 +48,6 @@ int sysctl_hugetlb_shm_group;
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
 	loff_t len, vma_len;
 	int ret;
 
@@ -79,10 +78,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	ret = hugetlb_prefault(mapping, vma);
-	if (ret)
-		goto out;
-
+	ret = 0;
+	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (inode->i_size < len)
 		inode->i_size = len;
 out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f29b7dc02c39..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -321,10 +321,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
-		if (! ptep)
-			/* This can happen on truncate, or if an
-			 * mmap() is aborted due to an error before
-			 * the prefault */
+		if (!ptep)
 			continue;
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -340,81 +337,92 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+			unsigned long idx)
 {
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	hugetlb_prefault_arch_hook(mm);
+	struct page *page;
+	int err;
+	struct inode *inode = mapping->host;
+	unsigned long size;
+
+retry:
+	page = find_lock_page(mapping, idx);
+	if (page)
+		goto out;
 
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
+	/* Check to make sure the mapping hasn't been truncated */
+	size = i_size_read(inode) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto out;
 
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
+	if (hugetlb_get_quota(mapping))
+		goto out;
+	page = alloc_huge_page();
+	if (!page) {
+		hugetlb_put_quota(mapping);
+		goto out;
+	}
 
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		spin_lock(&mm->page_table_lock);
-		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
-		spin_unlock(&mm->page_table_lock);
-	}
+	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+	if (err) {
+		put_page(page);
+		hugetlb_put_quota(mapping);
+		if (err == -EEXIST)
+			goto retry;
+		page = NULL;
+	}
 out:
-	return ret;
+	return page;
 }
 
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully. Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
+	unsigned long idx;
+	unsigned long size;
 	pte_t *pte;
+	struct page *page;
+	struct address_space *mapping;
+
+	pte = huge_pte_alloc(mm, address);
+	if (!pte)
+		goto out;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+	/*
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
+	 */
+	page = find_lock_huge_page(mapping, idx);
+	if (!page)
+		goto out;
 
 	spin_lock(&mm->page_table_lock);
-	pte = huge_pte_offset(mm, address);
-	if (pte && !pte_none(*pte))
-		ret = VM_FAULT_MINOR;
+	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto backout;
+
+	ret = VM_FAULT_MINOR;
+	if (!pte_none(*pte))
+		goto backout;
+
+	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
 	spin_unlock(&mm->page_table_lock);
+	unlock_page(page);
+out:
 	return ret;
+
+backout:
+	spin_unlock(&mm->page_table_lock);
+	hugetlb_put_quota(mapping);
+	unlock_page(page);
+	put_page(page);
+	goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -424,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vpfn, vaddr = *position;
 	int remainder = *length;
 
-	BUG_ON(!is_vm_hugetlb_page(vma));
-
 	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
+		pte_t *pte;
+		struct page *page;
 
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			/* Some archs (sparc64, sh*) have multiple
-			 * pte_ts to each hugepage. We have to make
-			 * sure we get the first, for the page
-			 * indexing below to work. */
-			pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-			/* the hugetlb file might have been truncated */
-			if (!pte || pte_none(*pte)) {
-				remainder = 0;
-				if (!i)
-					i = -EFAULT;
-				break;
-			}
+		/*
+		 * Some archs (sparc64, sh*) have multiple pte_ts to
+		 * each hugepage. We have to make sure we get the
+		 * first, for the page indexing below to work.
+		 */
+		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+		if (!pte || pte_none(*pte)) {
+			int ret;
 
-			WARN_ON(!PageCompound(page));
+			spin_unlock(&mm->page_table_lock);
+			ret = hugetlb_fault(mm, vma, vaddr, 0);
+			spin_lock(&mm->page_table_lock);
+			if (ret == VM_FAULT_MINOR)
+				continue;
 
+			remainder = 0;
+			if (!i)
+				i = -EFAULT;
+			break;
+		}
+
+		if (pages) {
+			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
 			get_page(page);
 			pages[i] = page;
 		}