author		Paul Mackerras <paulus@samba.org>	2008-06-18 01:29:12 -0400
committer	Paul Mackerras <paulus@samba.org>	2008-06-30 21:27:57 -0400
commit		3a8247cc2c856930f34eafce33f6a039227ee175 (patch)
tree		aa8599cdf09893f1150a2bc137878d8b8a661780 /arch/powerpc/mm
parent		e952e6c4d6635b36c212c056a9427bd93460178c (diff)
powerpc: Only demote individual slices rather than whole process
At present, if we have a kernel with a 64kB page size, and some
process maps something that has to be mapped with 4kB pages (such as a
cache-inhibited mapping on POWER5+, or the eHCA InfiniBand queue-pair
pages), we change the process to use 4kB pages everywhere. This hurts
the performance of HPC programs that access eHCA from userspace.
With this patch, the kernel will only demote the slice(s) containing
the eHCA or cache-inhibited mappings, leaving the remaining slices
able to use 64kB hardware pages.
This also changes the slice_get_unmapped_area code so that it is
willing to place a 64k-page mapping into (or across) a 4k-page slice
if there is no better alternative, i.e. if the program specified
MAP_FIXED or if there is not sufficient space available in slices that
are either empty or already have 64k-page mappings in them.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
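
For context before the diff: the powerpc slice code tracks one 4-bit page-size index per slice of the address space, packed into a 64-bit word in the mm context, so demoting a single slice is a read-modify-write of one 4-bit field rather than a process-wide switch. Below is a minimal userspace sketch of that idea; the 256MB low-slice geometry and the MMU_PAGE_* values mirror the powerpc headers of this era but are assumptions for illustration, not verbatim kernel definitions.

	/* Illustrative userspace model of per-slice page-size state.
	 * The 256MB low-slice geometry and MMU_PAGE_* values are
	 * assumptions for illustration, not kernel definitions.
	 */
	#include <stdio.h>
	#include <stdint.h>

	#define SLICE_LOW_SHIFT	28	/* 256MB low slices, 16 below 4GB (assumed) */
	#define MMU_PAGE_4K	0	/* psize index values assumed */
	#define MMU_PAGE_64K	1

	/* Demote only the slice containing 'addr' to 4k, as this patch
	 * does, instead of switching the whole process to 4k pages.
	 */
	static void demote_one_slice(uint64_t *slices_psize, uint64_t addr)
	{
		unsigned int i = addr >> SLICE_LOW_SHIFT;	/* slice index */

		*slices_psize &= ~(0xFull << (i * 4));		/* clear 4-bit field */
		*slices_psize |= (uint64_t)MMU_PAGE_4K << (i * 4);
	}

	int main(void)
	{
		/* all 16 low slices start at 64k pages (psize index 1) */
		uint64_t low_slices_psize = 0x1111111111111111ull;

		/* a cache-inhibited mapping lands in slice 3 */
		demote_one_slice(&low_slices_psize, 0x30000000ull);
		printf("%016llx\n", (unsigned long long)low_slices_psize);
		/* prints 1111111111110111: only slice 3 is now 4k */
		return 0;
	}
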
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--	arch/powerpc/mm/hash_utils_64.c	 36
-rw-r--r--	arch/powerpc/mm/slice.c		177
2 files changed, 160 insertions, 53 deletions
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index bf5b6d7ed30f..8d3b58ebd38e 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -695,6 +695,28 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
 	return pp;
 }
 
+#ifdef CONFIG_PPC_MM_SLICES
+unsigned int get_paca_psize(unsigned long addr)
+{
+	unsigned long index, slices;
+
+	if (addr < SLICE_LOW_TOP) {
+		slices = get_paca()->context.low_slices_psize;
+		index = GET_LOW_SLICE_INDEX(addr);
+	} else {
+		slices = get_paca()->context.high_slices_psize;
+		index = GET_HIGH_SLICE_INDEX(addr);
+	}
+	return (slices >> (index * 4)) & 0xF;
+}
+
+#else
+unsigned int get_paca_psize(unsigned long addr)
+{
+	return get_paca()->context.user_psize;
+}
+#endif
+
 /*
  * Demote a segment to using 4k pages.
  * For now this makes the whole process use 4k pages.
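
The new get_paca_psize() is the read side of the packed encoding: pick the low or high slice table based on the 4GB boundary, then extract one 4-bit field. A minimal userspace sketch of that lookup, assuming the contemporary layout (256MB low slices below 4GB, 1TB high slices); toy_context stands in for the real mm context and is hypothetical:

	/* Sketch of the lookup above; SLICE_LOW_TOP and the shifts are
	 * assumed values matching the 2008-era layout.
	 */
	#include <stdint.h>

	#define SLICE_LOW_TOP		0x100000000ull	/* 4GB boundary */
	#define SLICE_LOW_SHIFT		28		/* 256MB low slices */
	#define SLICE_HIGH_SHIFT	40		/* 1TB high slices */

	struct toy_context {
		uint64_t low_slices_psize;	/* 16 x 4-bit fields */
		uint64_t high_slices_psize;	/* 16 x 4-bit fields */
	};

	static unsigned int toy_get_psize(const struct toy_context *ctx,
					  uint64_t addr)
	{
		uint64_t slices;
		unsigned int index;

		if (addr < SLICE_LOW_TOP) {
			slices = ctx->low_slices_psize;
			index = addr >> SLICE_LOW_SHIFT;
		} else {
			slices = ctx->high_slices_psize;
			index = addr >> SLICE_HIGH_SHIFT;
		}
		return (slices >> (index * 4)) & 0xF;	/* one 4-bit field */
	}
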
@@ -702,13 +724,13 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
 #ifdef CONFIG_PPC_64K_PAGES
 void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
 {
-	if (mm->context.user_psize == MMU_PAGE_4K)
+	if (get_slice_psize(mm, addr) == MMU_PAGE_4K)
 		return;
-	slice_set_user_psize(mm, MMU_PAGE_4K);
+	slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K);
 #ifdef CONFIG_SPU_BASE
 	spu_flush_all_slbs(mm);
 #endif
-	if (get_paca()->context.user_psize != MMU_PAGE_4K) {
+	if (get_paca_psize(addr) != MMU_PAGE_4K) {
 		get_paca()->context = mm->context;
 		slb_flush_and_rebolt();
 	}
@@ -792,11 +814,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		DBG_LOW(" user region with no mm !\n");
 		return 1;
 	}
-#ifdef CONFIG_PPC_MM_SLICES
 	psize = get_slice_psize(mm, ea);
-#else
-	psize = mm->context.user_psize;
-#endif
 	ssize = user_segment_size(ea);
 	vsid = get_vsid(mm->context.id, ea, ssize);
 	break;
@@ -868,7 +886,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 	/* Do actual hashing */
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
-	if (pte_val(*ptep) & _PAGE_4K_PFN) {
+	if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
 		demote_segment_4k(mm, ea);
 		psize = MMU_PAGE_4K;
 	}
@@ -897,7 +915,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
 		}
 	}
 	if (user_region) {
-		if (psize != get_paca()->context.user_psize) {
+		if (psize != get_paca_psize(ea)) {
 			get_paca()->context = mm->context;
 			slb_flush_and_rebolt();
 		}
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index ad928edafb0a..583be67ad938 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -215,10 +215,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 		  mm->context.high_slices_psize);
 
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
-	mb();
 
-	/* XXX this is sub-optimal but will do for now */
-	on_each_cpu(slice_flush_segments, mm, 0, 1);
 #ifdef CONFIG_SPU_BASE
 	spu_flush_all_slbs(mm);
 #endif
@@ -384,17 +381,34 @@ static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
 	return slice_find_area_bottomup(mm, len, mask, psize, use_cache);
 }
 
+#define or_mask(dst, src)	do {			\
+	(dst).low_slices |= (src).low_slices;		\
+	(dst).high_slices |= (src).high_slices;		\
+} while (0)
+
+#define andnot_mask(dst, src)	do {			\
+	(dst).low_slices &= ~(src).low_slices;		\
+	(dst).high_slices &= ~(src).high_slices;	\
+} while (0)
+
+#ifdef CONFIG_PPC_64K_PAGES
+#define MMU_PAGE_BASE	MMU_PAGE_64K
+#else
+#define MMU_PAGE_BASE	MMU_PAGE_4K
+#endif
+
 unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 				      unsigned long flags, unsigned int psize,
 				      int topdown, int use_cache)
 {
-	struct slice_mask mask;
+	struct slice_mask mask = {0, 0};
 	struct slice_mask good_mask;
 	struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
-	int pmask_set = 0;
+	struct slice_mask compat_mask = {0, 0};
 	int fixed = (flags & MAP_FIXED);
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	struct mm_struct *mm = current->mm;
+	unsigned long newaddr;
 
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
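
The new or_mask()/andnot_mask() helpers are plain set algebra over the two bitmaps in struct slice_mask: union to widen a candidate set (e.g. good | free), subtraction to strip slices that need no conversion. A standalone restatement in plain C, assuming the 16-low/16-high slice layout with one bit per slice (the toy_* names are illustrative, not kernel identifiers):

	#include <stdint.h>

	/* Standalone model of struct slice_mask and the two macros
	 * added above; 16 low and 16 high slices assumed.
	 */
	struct toy_slice_mask {
		uint16_t low_slices;
		uint16_t high_slices;
	};

	/* or_mask(dst, src): dst |= src, i.e. set union */
	static void toy_or_mask(struct toy_slice_mask *dst,
				const struct toy_slice_mask *src)
	{
		dst->low_slices |= src->low_slices;
		dst->high_slices |= src->high_slices;
	}

	/* andnot_mask(dst, src): dst &= ~src, i.e. set subtraction */
	static void toy_andnot_mask(struct toy_slice_mask *dst,
				    const struct toy_slice_mask *src)
	{
		dst->low_slices &= ~src->low_slices;
		dst->high_slices &= ~src->high_slices;
	}

slice_get_unmapped_area() below uses the union form to build potential_mask (free | good) and the subtraction form at the convert: label, so slice_convert() only touches slices that are not already good or 4k-compatible.
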
@@ -416,21 +430,48 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	if (!fixed && addr) {
 		addr = _ALIGN_UP(addr, 1ul << pshift);
 		slice_dbg(" aligned addr=%lx\n", addr);
+		/* Ignore hint if it's too large or overlaps a VMA */
+		if (addr > mm->task_size - len ||
+		    !slice_area_is_free(mm, addr, len))
+			addr = 0;
 	}
 
-	/* First makeup a "good" mask of slices that have the right size
+	/* First make up a "good" mask of slices that have the right size
 	 * already
 	 */
 	good_mask = slice_mask_for_size(mm, psize);
 	slice_print_mask(" good_mask", good_mask);
 
-	/* First check hint if it's valid or if we have MAP_FIXED */
-	if ((addr != 0 || fixed) && (mm->task_size - len) >= addr) {
+	/*
+	 * Here "good" means slices that are already the right page size,
+	 * "compat" means slices that have a compatible page size (i.e.
+	 * 4k in a 64k pagesize kernel), and "free" means slices without
+	 * any VMAs.
+	 *
+	 * If MAP_FIXED:
+	 *	check if fits in good | compat => OK
+	 *	check if fits in good | compat | free => convert free
+	 *	else bad
+	 * If have hint:
+	 *	check if hint fits in good => OK
+	 *	check if hint fits in good | free => convert free
+	 * Otherwise:
+	 *	search in good, found => OK
+	 *	search in good | free, found => convert free
+	 *	search in good | compat | free, found => convert free.
+	 */
 
-		/* Don't bother with hint if it overlaps a VMA */
-		if (!fixed && !slice_area_is_free(mm, addr, len))
-			goto search;
+#ifdef CONFIG_PPC_64K_PAGES
+	/* If we support combo pages, we can allow 64k pages in 4k slices */
+	if (psize == MMU_PAGE_64K) {
+		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+		if (fixed)
+			or_mask(good_mask, compat_mask);
+	}
+#endif
 
+	/* First check hint if it's valid or if we have MAP_FIXED */
+	if (addr != 0 || fixed) {
 		/* Build a mask for the requested range */
 		mask = slice_range_to_mask(addr, len);
 		slice_print_mask(" mask", mask);
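
The block comment added above is effectively the whole placement policy. One way to read it is as subset tests against progressively larger allowed sets. A hedged sketch of the MAP_FIXED branch, with each slice mask collapsed to a single bitmap for brevity; fits() and map_fixed_policy() are illustrative helpers, not kernel code:

	#include <stdbool.h>
	#include <stdint.h>

	/* 'required' fits in 'allowed' iff required is a subset of allowed */
	static bool fits(uint32_t required, uint32_t allowed)
	{
		return (required & ~allowed) == 0;
	}

	/* Returns true if a MAP_FIXED request can be satisfied; *convert
	 * is set to the free slices that must switch to the new psize.
	 */
	static bool map_fixed_policy(uint32_t required, uint32_t good,
				     uint32_t compat, uint32_t free,
				     uint32_t *convert)
	{
		if (fits(required, good | compat)) {
			*convert = 0;			/* nothing to change */
			return true;
		}
		if (fits(required, good | compat | free)) {
			*convert = required & free;	/* convert free slices */
			return true;
		}
		return false;				/* MAP_FIXED => -EBUSY */
	}

The hint and search branches follow the same pattern, just without compat in the first step.
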
@@ -442,54 +483,66 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 			slice_dbg(" fits good !\n");
 			return addr;
 		}
-
-		/* We don't fit in the good mask, check what other slices are
-		 * empty and thus can be converted
+	} else {
+		/* Now let's see if we can find something in the existing
+		 * slices for that size
 		 */
-		potential_mask = slice_mask_for_free(mm);
-		potential_mask.low_slices |= good_mask.low_slices;
-		potential_mask.high_slices |= good_mask.high_slices;
-		pmask_set = 1;
-		slice_print_mask(" potential", potential_mask);
-		if (slice_check_fit(mask, potential_mask)) {
-			slice_dbg(" fits potential !\n");
-			goto convert;
+		newaddr = slice_find_area(mm, len, good_mask, psize, topdown,
+					  use_cache);
+		if (newaddr != -ENOMEM) {
+			/* Found within the good mask, we don't have to setup,
+			 * we thus return directly
+			 */
+			slice_dbg(" found area at 0x%lx\n", newaddr);
+			return newaddr;
 		}
 	}
 
-	/* If we have MAP_FIXED and failed the above step, then error out */
+	/* We don't fit in the good mask, check what other slices are
+	 * empty and thus can be converted
+	 */
+	potential_mask = slice_mask_for_free(mm);
+	or_mask(potential_mask, good_mask);
+	slice_print_mask(" potential", potential_mask);
+
+	if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
+		slice_dbg(" fits potential !\n");
+		goto convert;
+	}
+
+	/* If we have MAP_FIXED and failed the above steps, then error out */
 	if (fixed)
 		return -EBUSY;
 
- search:
 	slice_dbg(" search...\n");
 
-	/* Now let's see if we can find something in the existing slices
-	 * for that size
+	/* If we had a hint that didn't work out, see if we can fit
+	 * anywhere in the good area.
 	 */
-	addr = slice_find_area(mm, len, good_mask, psize, topdown, use_cache);
-	if (addr != -ENOMEM) {
-		/* Found within the good mask, we don't have to setup,
-		 * we thus return directly
-		 */
-		slice_dbg(" found area at 0x%lx\n", addr);
-		return addr;
-	}
-
-	/* Won't fit, check what can be converted */
-	if (!pmask_set) {
-		potential_mask = slice_mask_for_free(mm);
-		potential_mask.low_slices |= good_mask.low_slices;
-		potential_mask.high_slices |= good_mask.high_slices;
-		pmask_set = 1;
-		slice_print_mask(" potential", potential_mask);
+	if (addr) {
+		addr = slice_find_area(mm, len, good_mask, psize, topdown,
+				       use_cache);
+		if (addr != -ENOMEM) {
+			slice_dbg(" found area at 0x%lx\n", addr);
+			return addr;
+		}
 	}
 
 	/* Now let's see if we can find something in the existing slices
-	 * for that size
+	 * for that size plus free slices
 	 */
 	addr = slice_find_area(mm, len, potential_mask, psize, topdown,
 			       use_cache);
+
+#ifdef CONFIG_PPC_64K_PAGES
+	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
+		/* retry the search with 4k-page slices included */
+		or_mask(potential_mask, compat_mask);
+		addr = slice_find_area(mm, len, potential_mask, psize,
+				       topdown, use_cache);
+	}
+#endif
+
 	if (addr == -ENOMEM)
 		return -ENOMEM;
 
@@ -498,7 +551,13 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	slice_print_mask(" mask", mask);
 
 convert:
-	slice_convert(mm, mask, psize);
+	andnot_mask(mask, good_mask);
+	andnot_mask(mask, compat_mask);
+	if (mask.low_slices || mask.high_slices) {
+		slice_convert(mm, mask, psize);
+		if (psize > MMU_PAGE_BASE)
+			on_each_cpu(slice_flush_segments, mm, 0, 1);
+	}
 	return addr;
 
 }
@@ -598,6 +657,36 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 }
 
+void slice_set_psize(struct mm_struct *mm, unsigned long address,
+		     unsigned int psize)
+{
+	unsigned long i, flags;
+	u64 *p;
+
+	spin_lock_irqsave(&slice_convert_lock, flags);
+	if (address < SLICE_LOW_TOP) {
+		i = GET_LOW_SLICE_INDEX(address);
+		p = &mm->context.low_slices_psize;
+	} else {
+		i = GET_HIGH_SLICE_INDEX(address);
+		p = &mm->context.high_slices_psize;
+	}
+	*p = (*p & ~(0xful << (i * 4))) | ((unsigned long) psize << (i * 4));
+	spin_unlock_irqrestore(&slice_convert_lock, flags);
+
+#ifdef CONFIG_SPU_BASE
+	spu_flush_all_slbs(mm);
+#endif
+}
+
+void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
+			   unsigned long len, unsigned int psize)
+{
+	struct slice_mask mask = slice_range_to_mask(start, len);
+
+	slice_convert(mm, mask, psize);
+}
+
 /*
  * is_hugepage_only_range() is used by generic code to verify wether
  * a normal mmap mapping (non hugetlbfs) is valid on a given area.
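
Note that demote_segment_4k() passes len = 1 to the new slice_set_range_psize(): slice_range_to_mask() marks every slice the byte range [start, start + len) touches, so a 1-byte range selects exactly the slice containing the address. A sketch of that rounding for the low 4GB only, assuming 256MB slices; toy_range_to_low_mask is a hypothetical stand-in for the kernel helper:

	#include <stdint.h>

	#define SLICE_LOW_SHIFT	28	/* 256MB low slices (assumed) */

	/* One bit per low slice covered by [start, start + len) */
	static uint16_t toy_range_to_low_mask(uint64_t start, uint64_t len)
	{
		unsigned int first = start >> SLICE_LOW_SHIFT;
		unsigned int last = (start + len - 1) >> SLICE_LOW_SHIFT;

		return (uint16_t)(((1u << (last - first + 1)) - 1) << first);
	}

	/* toy_range_to_low_mask(0x30000000, 1)          == 0x0008 (slice 3)
	 * toy_range_to_low_mask(0x30000000, 0x20000000) == 0x0018 (slices 3-4)
	 */
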