Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	207
1 file changed, 109 insertions, 98 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61d380678030..c9b43360fd33 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -277,19 +277,23 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long addr;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		src_pte = huge_pte_offset(src, addr);
+		if (!src_pte)
+			continue;
 		dst_pte = huge_pte_alloc(dst, addr);
 		if (!dst_pte)
 			goto nomem;
+		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
-		src_pte = huge_pte_offset(src, addr);
-		if (src_pte && !pte_none(*src_pte)) {
+		if (!pte_none(*src_pte)) {
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
-			add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
+		spin_unlock(&dst->page_table_lock);
 	}
 	return 0;
 
@@ -310,12 +314,14 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	spin_lock(&mm->page_table_lock);
+
+	/* Update high watermark before we lower rss */
+	update_hiwater_rss(mm);
+
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
-		if (! ptep)
-			/* This can happen on truncate, or if an
-			 * mmap() is aborted due to an error before
-			 * the prefault */
+		if (!ptep)
 			continue;
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -324,96 +330,99 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 		page = pte_page(pte);
 		put_page(page);
-		add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE));
+		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
-	flush_tlb_range(vma, start, end);
-}
-
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
 
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
 	spin_unlock(&mm->page_table_lock);
+	flush_tlb_range(vma, start, end);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+				unsigned long idx)
 {
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	hugetlb_prefault_arch_hook(mm);
-
-	spin_lock(&mm->page_table_lock);
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
+	struct page *page;
+	int err;
+	struct inode *inode = mapping->host;
+	unsigned long size;
+
+retry:
+	page = find_lock_page(mapping, idx);
+	if (page)
+		goto out;
+
+	/* Check to make sure the mapping hasn't been truncated */
+	size = i_size_read(inode) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto out;
+
+	if (hugetlb_get_quota(mapping))
+		goto out;
+	page = alloc_huge_page();
+	if (!page) {
+		hugetlb_put_quota(mapping);
+		goto out;
+	}
 
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
-		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+	if (err) {
+		put_page(page);
+		hugetlb_put_quota(mapping);
+		if (err == -EEXIST)
+			goto retry;
+		page = NULL;
 	}
 out:
-	spin_unlock(&mm->page_table_lock);
-	return ret;
+	return page;
 }
 
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully. Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
+	unsigned long idx;
+	unsigned long size;
 	pte_t *pte;
+	struct page *page;
+	struct address_space *mapping;
+
+	pte = huge_pte_alloc(mm, address);
+	if (!pte)
+		goto out;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+	/*
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
+	 */
+	page = find_lock_huge_page(mapping, idx);
+	if (!page)
+		goto out;
 
 	spin_lock(&mm->page_table_lock);
-	pte = huge_pte_offset(mm, address);
-	if (pte && !pte_none(*pte))
-		ret = VM_FAULT_MINOR;
+	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto backout;
+
+	ret = VM_FAULT_MINOR;
+	if (!pte_none(*pte))
+		goto backout;
+
+	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
 	spin_unlock(&mm->page_table_lock);
+	unlock_page(page);
+out:
 	return ret;
+
+backout:
+	spin_unlock(&mm->page_table_lock);
+	hugetlb_put_quota(mapping);
+	unlock_page(page);
+	put_page(page);
+	goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -423,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vpfn, vaddr = *position;
 	int remainder = *length;
 
-	BUG_ON(!is_vm_hugetlb_page(vma));
-
 	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
+		pte_t *pte;
+		struct page *page;
 
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			/* Some archs (sparc64, sh*) have multiple
-			 * pte_ts to each hugepage. We have to make
-			 * sure we get the first, for the page
-			 * indexing below to work. */
-			pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-			/* the hugetlb file might have been truncated */
-			if (!pte || pte_none(*pte)) {
-				remainder = 0;
-				if (!i)
-					i = -EFAULT;
-				break;
-			}
+		/*
+		 * Some archs (sparc64, sh*) have multiple pte_ts to
+		 * each hugepage. We have to make sure we get the
+		 * first, for the page indexing below to work.
+		 */
+		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+		if (!pte || pte_none(*pte)) {
+			int ret;
 
-			WARN_ON(!PageCompound(page));
+			spin_unlock(&mm->page_table_lock);
+			ret = hugetlb_fault(mm, vma, vaddr, 0);
+			spin_lock(&mm->page_table_lock);
+			if (ret == VM_FAULT_MINOR)
+				continue;
+
+			remainder = 0;
+			if (!i)
+				i = -EFAULT;
+			break;
+		}
 
+		if (pages) {
+			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
 			get_page(page);
 			pages[i] = page;
 		}