diff options
Diffstat (limited to 'Documentation/memory-barriers.txt')
-rw-r--r-- | Documentation/memory-barriers.txt | 128 |
1 files changed, 66 insertions, 62 deletions
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index a4de88fb55f0..22a969cdd476 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt | |||
@@ -574,30 +574,14 @@ However, stores are not speculated. This means that ordering -is- provided | |||
574 | in the following example: | 574 | in the following example: |
575 | 575 | ||
576 | q = ACCESS_ONCE(a); | 576 | q = ACCESS_ONCE(a); |
577 | if (ACCESS_ONCE(q)) { | ||
578 | ACCESS_ONCE(b) = p; | ||
579 | } | ||
580 | |||
581 | Please note that ACCESS_ONCE() is not optional! Without the ACCESS_ONCE(), | ||
582 | the compiler is within its rights to transform this example: | ||
583 | |||
584 | q = a; | ||
585 | if (q) { | 577 | if (q) { |
586 | b = p; /* BUG: Compiler can reorder!!! */ | 578 | ACCESS_ONCE(b) = p; |
587 | do_something(); | ||
588 | } else { | ||
589 | b = p; /* BUG: Compiler can reorder!!! */ | ||
590 | do_something_else(); | ||
591 | } | 579 | } |
592 | 580 | ||
593 | into this, which of course defeats the ordering: | 581 | Please note that ACCESS_ONCE() is not optional! Without the |
594 | 582 | ACCESS_ONCE(), might combine the load from 'a' with other loads from | |
595 | b = p; | 583 | 'a', and the store to 'b' with other stores to 'b', with possible highly |
596 | q = a; | 584 | counterintuitive effects on ordering. |
597 | if (q) | ||
598 | do_something(); | ||
599 | else | ||
600 | do_something_else(); | ||
601 | 585 | ||
602 | Worse yet, if the compiler is able to prove (say) that the value of | 586 | Worse yet, if the compiler is able to prove (say) that the value of |
603 | variable 'a' is always non-zero, it would be well within its rights | 587 | variable 'a' is always non-zero, it would be well within its rights |
@@ -605,11 +589,12 @@ to optimize the original example by eliminating the "if" statement | |||
605 | as follows: | 589 | as follows: |
606 | 590 | ||
607 | q = a; | 591 | q = a; |
608 | b = p; /* BUG: Compiler can reorder!!! */ | 592 | b = p; /* BUG: Compiler and CPU can both reorder!!! */ |
609 | do_something(); | 593 | |
594 | So don't leave out the ACCESS_ONCE(). | ||
610 | 595 | ||
611 | The solution is again ACCESS_ONCE() and barrier(), which preserves the | 596 | It is tempting to try to enforce ordering on identical stores on both |
612 | ordering between the load from variable 'a' and the store to variable 'b': | 597 | branches of the "if" statement as follows: |
613 | 598 | ||
614 | q = ACCESS_ONCE(a); | 599 | q = ACCESS_ONCE(a); |
615 | if (q) { | 600 | if (q) { |
@@ -622,18 +607,11 @@ ordering between the load from variable 'a' and the store to variable 'b': | |||
622 | do_something_else(); | 607 | do_something_else(); |
623 | } | 608 | } |
624 | 609 | ||
625 | The initial ACCESS_ONCE() is required to prevent the compiler from | 610 | Unfortunately, current compilers will transform this as follows at high |
626 | proving the value of 'a', and the pair of barrier() invocations are | 611 | optimization levels: |
627 | required to prevent the compiler from pulling the two identical stores | ||
628 | to 'b' out from the legs of the "if" statement. | ||
629 | |||
630 | It is important to note that control dependencies absolutely require a | ||
631 | a conditional. For example, the following "optimized" version of | ||
632 | the above example breaks ordering, which is why the barrier() invocations | ||
633 | are absolutely required if you have identical stores in both legs of | ||
634 | the "if" statement: | ||
635 | 612 | ||
636 | q = ACCESS_ONCE(a); | 613 | q = ACCESS_ONCE(a); |
614 | barrier(); | ||
637 | ACCESS_ONCE(b) = p; /* BUG: No ordering vs. load from a!!! */ | 615 | ACCESS_ONCE(b) = p; /* BUG: No ordering vs. load from a!!! */ |
638 | if (q) { | 616 | if (q) { |
639 | /* ACCESS_ONCE(b) = p; -- moved up, BUG!!! */ | 617 | /* ACCESS_ONCE(b) = p; -- moved up, BUG!!! */ |
@@ -643,21 +621,36 @@ the "if" statement: | |||
643 | do_something_else(); | 621 | do_something_else(); |
644 | } | 622 | } |
645 | 623 | ||
646 | It is of course legal for the prior load to be part of the conditional, | 624 | Now there is no conditional between the load from 'a' and the store to |
647 | for example, as follows: | 625 | 'b', which means that the CPU is within its rights to reorder them: |
626 | The conditional is absolutely required, and must be present in the | ||
627 | assembly code even after all compiler optimizations have been applied. | ||
628 | Therefore, if you need ordering in this example, you need explicit | ||
629 | memory barriers, for example, smp_store_release(): | ||
648 | 630 | ||
649 | if (ACCESS_ONCE(a) > 0) { | 631 | q = ACCESS_ONCE(a); |
650 | barrier(); | 632 | if (q) { |
651 | ACCESS_ONCE(b) = q / 2; | 633 | smp_store_release(&b, p); |
652 | do_something(); | 634 | do_something(); |
653 | } else { | 635 | } else { |
654 | barrier(); | 636 | smp_store_release(&b, p); |
655 | ACCESS_ONCE(b) = q / 3; | 637 | do_something_else(); |
638 | } | ||
639 | |||
640 | In contrast, without explicit memory barriers, two-legged-if control | ||
641 | ordering is guaranteed only when the stores differ, for example: | ||
642 | |||
643 | q = ACCESS_ONCE(a); | ||
644 | if (q) { | ||
645 | ACCESS_ONCE(b) = p; | ||
646 | do_something(); | ||
647 | } else { | ||
648 | ACCESS_ONCE(b) = r; | ||
656 | do_something_else(); | 649 | do_something_else(); |
657 | } | 650 | } |
658 | 651 | ||
659 | This will again ensure that the load from variable 'a' is ordered before the | 652 | The initial ACCESS_ONCE() is still required to prevent the compiler from |
660 | stores to variable 'b'. | 653 | proving the value of 'a'. |
661 | 654 | ||
662 | In addition, you need to be careful what you do with the local variable 'q', | 655 | In addition, you need to be careful what you do with the local variable 'q', |
663 | otherwise the compiler might be able to guess the value and again remove | 656 | otherwise the compiler might be able to guess the value and again remove |
@@ -665,12 +658,10 @@ the needed conditional. For example: | |||
665 | 658 | ||
666 | q = ACCESS_ONCE(a); | 659 | q = ACCESS_ONCE(a); |
667 | if (q % MAX) { | 660 | if (q % MAX) { |
668 | barrier(); | ||
669 | ACCESS_ONCE(b) = p; | 661 | ACCESS_ONCE(b) = p; |
670 | do_something(); | 662 | do_something(); |
671 | } else { | 663 | } else { |
672 | barrier(); | 664 | ACCESS_ONCE(b) = r; |
673 | ACCESS_ONCE(b) = p; | ||
674 | do_something_else(); | 665 | do_something_else(); |
675 | } | 666 | } |
676 | 667 | ||
@@ -682,9 +673,12 @@ transform the above code into the following: | |||
682 | ACCESS_ONCE(b) = p; | 673 | ACCESS_ONCE(b) = p; |
683 | do_something_else(); | 674 | do_something_else(); |
684 | 675 | ||
685 | This transformation loses the ordering between the load from variable 'a' | 676 | Given this transformation, the CPU is not required to respect the ordering |
686 | and the store to variable 'b'. If you are relying on this ordering, you | 677 | between the load from variable 'a' and the store to variable 'b'. It is |
687 | should do something like the following: | 678 | tempting to add a barrier(), but this does not help. The conditional |
679 | is gone, and the barrier won't bring it back. Therefore, if you are | ||
680 | relying on this ordering, you should make sure that MAX is greater than | ||
681 | one, perhaps as follows: | ||
688 | 682 | ||
689 | q = ACCESS_ONCE(a); | 683 | q = ACCESS_ONCE(a); |
690 | BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */ | 684 | BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */ |
@@ -692,35 +686,45 @@ should do something like the following: | |||
692 | ACCESS_ONCE(b) = p; | 686 | ACCESS_ONCE(b) = p; |
693 | do_something(); | 687 | do_something(); |
694 | } else { | 688 | } else { |
695 | ACCESS_ONCE(b) = p; | 689 | ACCESS_ONCE(b) = r; |
696 | do_something_else(); | 690 | do_something_else(); |
697 | } | 691 | } |
698 | 692 | ||
693 | Please note once again that the stores to 'b' differ. If they were | ||
694 | identical, as noted earlier, the compiler could pull this store outside | ||
695 | of the 'if' statement. | ||
696 | |||
699 | Finally, control dependencies do -not- provide transitivity. This is | 697 | Finally, control dependencies do -not- provide transitivity. This is |
700 | demonstrated by two related examples: | 698 | demonstrated by two related examples, with the initial values of |
699 | x and y both being zero: | ||
701 | 700 | ||
702 | CPU 0 CPU 1 | 701 | CPU 0 CPU 1 |
703 | ===================== ===================== | 702 | ===================== ===================== |
704 | r1 = ACCESS_ONCE(x); r2 = ACCESS_ONCE(y); | 703 | r1 = ACCESS_ONCE(x); r2 = ACCESS_ONCE(y); |
705 | if (r1 >= 0) if (r2 >= 0) | 704 | if (r1 > 0) if (r2 > 0) |
706 | ACCESS_ONCE(y) = 1; ACCESS_ONCE(x) = 1; | 705 | ACCESS_ONCE(y) = 1; ACCESS_ONCE(x) = 1; |
707 | 706 | ||
708 | assert(!(r1 == 1 && r2 == 1)); | 707 | assert(!(r1 == 1 && r2 == 1)); |
709 | 708 | ||
710 | The above two-CPU example will never trigger the assert(). However, | 709 | The above two-CPU example will never trigger the assert(). However, |
711 | if control dependencies guaranteed transitivity (which they do not), | 710 | if control dependencies guaranteed transitivity (which they do not), |
712 | then adding the following two CPUs would guarantee a related assertion: | 711 | then adding the following CPU would guarantee a related assertion: |
713 | 712 | ||
714 | CPU 2 CPU 3 | 713 | CPU 2 |
715 | ===================== ===================== | 714 | ===================== |
716 | ACCESS_ONCE(x) = 2; ACCESS_ONCE(y) = 2; | 715 | ACCESS_ONCE(x) = 2; |
716 | |||
717 | assert(!(r1 == 2 && r2 == 1 && x == 2)); /* FAILS!!! */ | ||
717 | 718 | ||
718 | assert(!(r1 == 2 && r2 == 2 && x == 1 && y == 1)); /* FAILS!!! */ | 719 | But because control dependencies do -not- provide transitivity, the above |
720 | assertion can fail after the combined three-CPU example completes. If you | ||
721 | need the three-CPU example to provide ordering, you will need smp_mb() | ||
722 | between the loads and stores in the CPU 0 and CPU 1 code fragments, | ||
723 | that is, just before or just after the "if" statements. | ||
719 | 724 | ||
720 | But because control dependencies do -not- provide transitivity, the | 725 | These two examples are the LB and WWC litmus tests from this paper: |
721 | above assertion can fail after the combined four-CPU example completes. | 726 | http://www.cl.cam.ac.uk/users/pes20/ppc-supplemental/test6.pdf and this |
722 | If you need the four-CPU example to provide ordering, you will need | 727 | site: https://www.cl.cam.ac.uk/~pes20/ppcmem/index.html. |
723 | smp_mb() between the loads and stores in the CPU 0 and CPU 1 code fragments. | ||
724 | 728 | ||
725 | In summary: | 729 | In summary: |
726 | 730 | ||