diff options
author | Chen Jie <chenj@lemote.com> | 2015-03-26 13:07:24 -0400 |
---|---|---|
committer | Ralf Baechle <ralf@linux-mips.org> | 2015-04-01 11:22:11 -0400 |
commit | 615eb603f4e1da4f151c7fac4aa175753a9913ec (patch) | |
tree | 5f879da5f72fd016a77522a3bfea67a0486ac42c | |
parent | d548ca6b0784a99f0fcae397f115823ccd0361a5 (diff) |
MIPS: csum_partial: Improve instruction parallelism.
Computing the sum introduces true data dependencies. This patch removes some
of those true data dependencies, hence increasing instruction-level parallelism.
This patch brings up to 50% csum performance gain on Loongson 3a.
One example of how this patch works is in CSUM_BIGCHUNK1:
// ** original ** vs ** patch applied **
ADDC(sum, t0) ADDC(t0, t1)
ADDC(sum, t1) ADDC(t2, t3)
ADDC(sum, t2) ADDC(sum, t0)
ADDC(sum, t3) ADDC(sum, t2)
In the original implementation, each ADDC(sum, ...) depends on the sum
value updated by the previous ADDC (as a source operand).
With this patch applied, the first two ADDC operations are independent,
hence can be executed simultaneously if possible.
Another example is in the "copy and sum calculating chunk":
// ** original ** vs ** patch applied **
STORE(t0, UNIT(0) ... STORE(t0, UNIT(0) ...
ADDC(sum, t0) ADDC(t0, t1)
STORE(t1, UNIT(1) ... STORE(t1, UNIT(1) ...
ADDC(sum, t1) ADDC(sum, t0)
STORE(t2, UNIT(2) ... STORE(t2, UNIT(2) ...
ADDC(sum, t2) ADDC(t2, t3)
STORE(t3, UNIT(3) ... STORE(t3, UNIT(3) ...
ADDC(sum, t3) ADDC(sum, t2)
With this patch applied, an ADDC and the **next-but-one** ADDC (e.g. ADDC(t0, t1) and ADDC(t2, t3)) are independent.
Signed-off-by: chenj <chenj@lemote.com>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/9608/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
-rw-r--r-- | arch/mips/lib/csum_partial.S | 38 |
1 files changed, 19 insertions, 19 deletions
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S index 4c721e247ac9..ed88647b57e2 100644 --- a/arch/mips/lib/csum_partial.S +++ b/arch/mips/lib/csum_partial.S | |||
@@ -76,10 +76,10 @@ | |||
76 | LOAD _t1, (offset + UNIT(1))(src); \ | 76 | LOAD _t1, (offset + UNIT(1))(src); \ |
77 | LOAD _t2, (offset + UNIT(2))(src); \ | 77 | LOAD _t2, (offset + UNIT(2))(src); \ |
78 | LOAD _t3, (offset + UNIT(3))(src); \ | 78 | LOAD _t3, (offset + UNIT(3))(src); \ |
79 | ADDC(_t0, _t1); \ | ||
80 | ADDC(_t2, _t3); \ | ||
79 | ADDC(sum, _t0); \ | 81 | ADDC(sum, _t0); \ |
80 | ADDC(sum, _t1); \ | 82 | ADDC(sum, _t2) |
81 | ADDC(sum, _t2); \ | ||
82 | ADDC(sum, _t3) | ||
83 | 83 | ||
84 | #ifdef USE_DOUBLE | 84 | #ifdef USE_DOUBLE |
85 | #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ | 85 | #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ |
@@ -504,21 +504,21 @@ LEAF(csum_partial) | |||
504 | SUB len, len, 8*NBYTES | 504 | SUB len, len, 8*NBYTES |
505 | ADD src, src, 8*NBYTES | 505 | ADD src, src, 8*NBYTES |
506 | STORE(t0, UNIT(0)(dst), .Ls_exc\@) | 506 | STORE(t0, UNIT(0)(dst), .Ls_exc\@) |
507 | ADDC(sum, t0) | 507 | ADDC(t0, t1) |
508 | STORE(t1, UNIT(1)(dst), .Ls_exc\@) | 508 | STORE(t1, UNIT(1)(dst), .Ls_exc\@) |
509 | ADDC(sum, t1) | 509 | ADDC(sum, t0) |
510 | STORE(t2, UNIT(2)(dst), .Ls_exc\@) | 510 | STORE(t2, UNIT(2)(dst), .Ls_exc\@) |
511 | ADDC(sum, t2) | 511 | ADDC(t2, t3) |
512 | STORE(t3, UNIT(3)(dst), .Ls_exc\@) | 512 | STORE(t3, UNIT(3)(dst), .Ls_exc\@) |
513 | ADDC(sum, t3) | 513 | ADDC(sum, t2) |
514 | STORE(t4, UNIT(4)(dst), .Ls_exc\@) | 514 | STORE(t4, UNIT(4)(dst), .Ls_exc\@) |
515 | ADDC(sum, t4) | 515 | ADDC(t4, t5) |
516 | STORE(t5, UNIT(5)(dst), .Ls_exc\@) | 516 | STORE(t5, UNIT(5)(dst), .Ls_exc\@) |
517 | ADDC(sum, t5) | 517 | ADDC(sum, t4) |
518 | STORE(t6, UNIT(6)(dst), .Ls_exc\@) | 518 | STORE(t6, UNIT(6)(dst), .Ls_exc\@) |
519 | ADDC(sum, t6) | 519 | ADDC(t6, t7) |
520 | STORE(t7, UNIT(7)(dst), .Ls_exc\@) | 520 | STORE(t7, UNIT(7)(dst), .Ls_exc\@) |
521 | ADDC(sum, t7) | 521 | ADDC(sum, t6) |
522 | .set reorder /* DADDI_WAR */ | 522 | .set reorder /* DADDI_WAR */ |
523 | ADD dst, dst, 8*NBYTES | 523 | ADD dst, dst, 8*NBYTES |
524 | bgez len, 1b | 524 | bgez len, 1b |
@@ -544,13 +544,13 @@ LEAF(csum_partial) | |||
544 | SUB len, len, 4*NBYTES | 544 | SUB len, len, 4*NBYTES |
545 | ADD src, src, 4*NBYTES | 545 | ADD src, src, 4*NBYTES |
546 | STORE(t0, UNIT(0)(dst), .Ls_exc\@) | 546 | STORE(t0, UNIT(0)(dst), .Ls_exc\@) |
547 | ADDC(sum, t0) | 547 | ADDC(t0, t1) |
548 | STORE(t1, UNIT(1)(dst), .Ls_exc\@) | 548 | STORE(t1, UNIT(1)(dst), .Ls_exc\@) |
549 | ADDC(sum, t1) | 549 | ADDC(sum, t0) |
550 | STORE(t2, UNIT(2)(dst), .Ls_exc\@) | 550 | STORE(t2, UNIT(2)(dst), .Ls_exc\@) |
551 | ADDC(sum, t2) | 551 | ADDC(t2, t3) |
552 | STORE(t3, UNIT(3)(dst), .Ls_exc\@) | 552 | STORE(t3, UNIT(3)(dst), .Ls_exc\@) |
553 | ADDC(sum, t3) | 553 | ADDC(sum, t2) |
554 | .set reorder /* DADDI_WAR */ | 554 | .set reorder /* DADDI_WAR */ |
555 | ADD dst, dst, 4*NBYTES | 555 | ADD dst, dst, 4*NBYTES |
556 | beqz len, .Ldone\@ | 556 | beqz len, .Ldone\@ |
@@ -649,13 +649,13 @@ LEAF(csum_partial) | |||
649 | nop # improves slotting | 649 | nop # improves slotting |
650 | #endif | 650 | #endif |
651 | STORE(t0, UNIT(0)(dst), .Ls_exc\@) | 651 | STORE(t0, UNIT(0)(dst), .Ls_exc\@) |
652 | ADDC(sum, t0) | 652 | ADDC(t0, t1) |
653 | STORE(t1, UNIT(1)(dst), .Ls_exc\@) | 653 | STORE(t1, UNIT(1)(dst), .Ls_exc\@) |
654 | ADDC(sum, t1) | 654 | ADDC(sum, t0) |
655 | STORE(t2, UNIT(2)(dst), .Ls_exc\@) | 655 | STORE(t2, UNIT(2)(dst), .Ls_exc\@) |
656 | ADDC(sum, t2) | 656 | ADDC(t2, t3) |
657 | STORE(t3, UNIT(3)(dst), .Ls_exc\@) | 657 | STORE(t3, UNIT(3)(dst), .Ls_exc\@) |
658 | ADDC(sum, t3) | 658 | ADDC(sum, t2) |
659 | .set reorder /* DADDI_WAR */ | 659 | .set reorder /* DADDI_WAR */ |
660 | ADD dst, dst, 4*NBYTES | 660 | ADD dst, dst, 4*NBYTES |
661 | bne len, rem, 1b | 661 | bne len, rem, 1b |