author     Adam Litke <agl@us.ibm.com>             2005-10-29 21:16:46 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>   2005-10-30 00:40:43 -0400
commit     4c887265977213985091476be40ab11dfdcb4caf
tree       82ee135f8678094664d7311617287802d54d52cf
parent     551110a94aa15890d1709b179c4be1e66ff6db53
[PATCH] hugetlb: demand fault handler
Below is a patch to implement demand faulting for huge pages. The main motivation for changing from prefaulting to demand faulting is so that huge page memory areas can be allocated according to NUMA policy.

Thanks to consolidated hugetlb code, switching the behavior requires changing only one fault handler. The bulk of the patch just moves the logic from hugetlb_prefault() to hugetlb_fault() and find_lock_huge_page().

Signed-off-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
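The change is easiest to appreciate from user space: once huge pages are faulted on demand, a NUMA policy installed between mmap() and the first touch can govern where each huge page is allocated. What follows is a minimal illustrative sketch of that workflow, not part of the patch; the hugetlbfs mount point, the 2 MB huge page size and the two-node mask are assumptions, and mbind() comes from libnuma's <numaif.h>.

/*
 * Illustrative only: with demand faulting, a huge page is allocated at
 * first touch, so a policy applied after mmap() but before the touch
 * can influence placement.  Paths, sizes and node mask are assumed.
 */
#include <fcntl.h>
#include <numaif.h>          /* mbind(), MPOL_INTERLEAVE */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)   /* assumed huge page size */
#define NR_HPAGES  4

int main(void)
{
        size_t len = NR_HPAGES * HPAGE_SIZE;
        int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600); /* hugetlbfs mount assumed */

        if (fd < 0) {
                perror("open");
                return 1;
        }

        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* Interleave across nodes 0 and 1 (assumed to exist). */
        unsigned long nodemask = 0x3;
        if (mbind(p, len, MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask), 0))
                perror("mbind");

        /* Each first touch takes a hugetlb fault and allocates one huge page. */
        for (unsigned long i = 0; i < NR_HPAGES; i++)
                p[i * HPAGE_SIZE] = 1;

        munmap(p, len);
        close(fd);
        return 0;
}

Whether the hugetlb allocator honoured the policy at this point in the series depended on the companion NUMA patches; the sketch only shows the fault-time allocation that makes such placement possible.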
Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c | 180
1 file changed, 95 insertions(+), 85 deletions(-)
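As a reading aid for the diff that follows, here is a small standalone paraphrase of the address-to-file-index arithmetic that both the removed prefault loop and the new fault handler perform; the page-size constants, the example addresses and the helper name huge_page_index() are illustrative assumptions, not kernel code.

#include <stdio.h>

#define PAGE_SHIFT  12UL     /* 4 KB base pages (assumed) */
#define HPAGE_SHIFT 21UL     /* 2 MB huge pages (assumed) */

/* Huge-page index into the backing hugetlbfs file for a faulting address. */
static unsigned long huge_page_index(unsigned long address,
                                     unsigned long vm_start,
                                     unsigned long vm_pgoff)
{
        return ((address - vm_start) >> HPAGE_SHIFT)
                + (vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
}

int main(void)
{
        /*
         * A fault 5 MB into a mapping of file offset 0: 5 MB >> 21 == 2,
         * so the handler looks up huge page #2 in the page cache.
         */
        unsigned long vm_start = 0x40000000UL;
        unsigned long address  = vm_start + 5UL * 1024 * 1024;

        printf("idx = %lu\n", huge_page_index(address, vm_start, 0));
        return 0;
}

vm_pgoff is kept in base-page units, which is why it is shifted down by (HPAGE_SHIFT - PAGE_SHIFT) to convert it into huge-page units before being added to the index.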
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f29b7dc02c39..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -321,10 +321,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
         for (address = start; address < end; address += HPAGE_SIZE) {
                 ptep = huge_pte_offset(mm, address);
-                if (! ptep)
-                        /* This can happen on truncate, or if an
-                         * mmap() is aborted due to an error before
-                         * the prefault */
+                if (!ptep)
                         continue;
 
                 pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -340,81 +337,92 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
         flush_tlb_range(vma, start, end);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+                        unsigned long idx)
 {
-        struct mm_struct *mm = current->mm;
-        unsigned long addr;
-        int ret = 0;
-
-        WARN_ON(!is_vm_hugetlb_page(vma));
-        BUG_ON(vma->vm_start & ~HPAGE_MASK);
-        BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-        hugetlb_prefault_arch_hook(mm);
-
-        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-                unsigned long idx;
-                pte_t *pte = huge_pte_alloc(mm, addr);
-                struct page *page;
-
-                if (!pte) {
-                        ret = -ENOMEM;
-                        goto out;
-                }
+        struct page *page;
+        int err;
+        struct inode *inode = mapping->host;
+        unsigned long size;
+
+retry:
+        page = find_lock_page(mapping, idx);
+        if (page)
+                goto out;
+
+        /* Check to make sure the mapping hasn't been truncated */
+        size = i_size_read(inode) >> HPAGE_SHIFT;
+        if (idx >= size)
+                goto out;
+
+        if (hugetlb_get_quota(mapping))
+                goto out;
+        page = alloc_huge_page();
+        if (!page) {
+                hugetlb_put_quota(mapping);
+                goto out;
+        }
 
-                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-                page = find_get_page(mapping, idx);
-                if (!page) {
-                        /* charge the fs quota first */
-                        if (hugetlb_get_quota(mapping)) {
-                                ret = -ENOMEM;
-                                goto out;
-                        }
-                        page = alloc_huge_page();
-                        if (!page) {
-                                hugetlb_put_quota(mapping);
-                                ret = -ENOMEM;
-                                goto out;
-                        }
-                        ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-                        if (! ret) {
-                                unlock_page(page);
-                        } else {
-                                hugetlb_put_quota(mapping);
-                                free_huge_page(page);
-                                goto out;
-                        }
-                }
-                spin_lock(&mm->page_table_lock);
-                add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-                set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
-                spin_unlock(&mm->page_table_lock);
+        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+        if (err) {
+                put_page(page);
+                hugetlb_put_quota(mapping);
+                if (err == -EEXIST)
+                        goto retry;
+                page = NULL;
         }
 out:
-        return ret;
+        return page;
 }
 
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully. Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         unsigned long address, int write_access)
 {
         int ret = VM_FAULT_SIGBUS;
+        unsigned long idx;
+        unsigned long size;
         pte_t *pte;
+        struct page *page;
+        struct address_space *mapping;
+
+        pte = huge_pte_alloc(mm, address);
+        if (!pte)
+                goto out;
+
+        mapping = vma->vm_file->f_mapping;
+        idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+                + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+        /*
+         * Use page lock to guard against racing truncation
+         * before we get page_table_lock.
+         */
+        page = find_lock_huge_page(mapping, idx);
+        if (!page)
+                goto out;
 
         spin_lock(&mm->page_table_lock);
-        pte = huge_pte_offset(mm, address);
-        if (pte && !pte_none(*pte))
-                ret = VM_FAULT_MINOR;
+        size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+        if (idx >= size)
+                goto backout;
+
+        ret = VM_FAULT_MINOR;
+        if (!pte_none(*pte))
+                goto backout;
+
+        add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+        set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
         spin_unlock(&mm->page_table_lock);
+        unlock_page(page);
+out:
         return ret;
+
+backout:
+        spin_unlock(&mm->page_table_lock);
+        hugetlb_put_quota(mapping);
+        unlock_page(page);
+        put_page(page);
+        goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -424,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long vpfn, vaddr = *position;
         int remainder = *length;
 
-        BUG_ON(!is_vm_hugetlb_page(vma));
-
         vpfn = vaddr/PAGE_SIZE;
         spin_lock(&mm->page_table_lock);
         while (vaddr < vma->vm_end && remainder) {
+                pte_t *pte;
+                struct page *page;
 
-                if (pages) {
-                        pte_t *pte;
-                        struct page *page;
-
-                        /* Some archs (sparc64, sh*) have multiple
-                         * pte_ts to each hugepage. We have to make
-                         * sure we get the first, for the page
-                         * indexing below to work. */
-                        pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-                        /* the hugetlb file might have been truncated */
-                        if (!pte || pte_none(*pte)) {
-                                remainder = 0;
-                                if (!i)
-                                        i = -EFAULT;
-                                break;
-                        }
+                /*
+                 * Some archs (sparc64, sh*) have multiple pte_ts to
+                 * each hugepage. We have to make sure we get the
+                 * first, for the page indexing below to work.
+                 */
+                pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-                        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+                if (!pte || pte_none(*pte)) {
+                        int ret;
 
-                        WARN_ON(!PageCompound(page));
+                        spin_unlock(&mm->page_table_lock);
+                        ret = hugetlb_fault(mm, vma, vaddr, 0);
+                        spin_lock(&mm->page_table_lock);
+                        if (ret == VM_FAULT_MINOR)
+                                continue;
 
+                        remainder = 0;
+                        if (!i)
+                                i = -EFAULT;
+                        break;
+                }
+
+                if (pages) {
+                        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
                         get_page(page);
                         pages[i] = page;
                 }
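The add_to_page_cache()/-EEXIST retry in find_lock_huge_page() is the piece most easily lost in the diff noise, so here is a user-space toy of the same lookup/allocate/insert/retry idiom; it uses pthreads and invented names, and sketches only the race handling, not the kernel implementation.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_SLOTS 16

static void *cache[NR_SLOTS];                  /* toy stand-in for the page cache */
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* Fails with -EEXIST if another thread already populated the slot. */
static int cache_add(unsigned long idx, void *page)
{
        int err = 0;

        pthread_mutex_lock(&cache_lock);
        if (cache[idx])
                err = -EEXIST;
        else
                cache[idx] = page;
        pthread_mutex_unlock(&cache_lock);
        return err;
}

static void *cache_find(unsigned long idx)
{
        pthread_mutex_lock(&cache_lock);
        void *page = cache[idx];
        pthread_mutex_unlock(&cache_lock);
        return page;
}

/* Lookup, allocate, try to insert, back off and retry on -EEXIST. */
static void *find_or_create(unsigned long idx)
{
        void *page;

retry:
        page = cache_find(idx);
        if (page)
                return page;

        page = malloc(64);                     /* stands in for alloc_huge_page() */
        if (!page)
                return NULL;

        if (cache_add(idx, page) == -EEXIST) {
                free(page);                    /* the loser releases its page ... */
                goto retry;                    /* ... and finds the winner's copy */
        }
        return page;
}

static void *worker(void *arg)
{
        (void)arg;
        printf("got %p for idx 3\n", find_or_create(3));
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, worker, NULL);
        pthread_create(&b, NULL, worker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}

Both threads end up holding the same pointer, just as two tasks faulting the same hugetlbfs index end up mapped to the same page; in the kernel version the loser additionally gives its quota back via hugetlb_put_quota().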