aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Chen Jie <chenj@lemote.com> 2015-03-26 13:07:24 -0400
committer: Ralf Baechle <ralf@linux-mips.org> 2015-04-01 11:22:11 -0400
commit: 615eb603f4e1da4f151c7fac4aa175753a9913ec (patch)
tree: 5f879da5f72fd016a77522a3bfea67a0486ac42c
parent: d548ca6b0784a99f0fcae397f115823ccd0361a5 (diff)
MIPS: csum_partial: Improve instruction parallelism.
Computing the sum introduces true data dependencies. This patch removes some of these true data dependencies, and hence increases instruction-level parallelism.

This patch brings up to 50% csum performance gain on Loongson 3a.

One example of how this patch works is in CSUM_BIGCHUNK1:

    // ** original **        vs    ** patch applied **
    ADDC(sum, t0)                  ADDC(t0, t1)
    ADDC(sum, t1)                  ADDC(t2, t3)
    ADDC(sum, t2)                  ADDC(sum, t0)
    ADDC(sum, t3)                  ADDC(sum, t2)

In the original implementation, each ADDC(sum, ...) depends on the sum value updated by the previous ADDC (as a source operand).

With this patch applied, the first two ADDC operations are independent, and hence can be executed simultaneously if possible.

Another example is in the "copy and sum calculating chunk":

    // ** original **        vs    ** patch applied **
    STORE(t0, UNIT(0) ...          STORE(t0, UNIT(0) ...
    ADDC(sum, t0)                  ADDC(t0, t1)
    STORE(t1, UNIT(1) ...          STORE(t1, UNIT(1) ...
    ADDC(sum, t1)                  ADDC(sum, t0)
    STORE(t2, UNIT(2) ...          STORE(t2, UNIT(2) ...
    ADDC(sum, t2)                  ADDC(t2, t3)
    STORE(t3, UNIT(3) ...          STORE(t3, UNIT(3) ...
    ADDC(sum, t3)                  ADDC(sum, t2)

With this patch applied, each ADDC and the **next next** ADDC (the one two ADDCs later) are independent.

Signed-off-by: chenj <chenj@lemote.com>
Cc: linux-mips@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/9608/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
-rw-r--r--  arch/mips/lib/csum_partial.S  38
1 files changed, 19 insertions, 19 deletions
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 4c721e247ac9..ed88647b57e2 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -76,10 +76,10 @@
76 LOAD _t1, (offset + UNIT(1))(src); \ 76 LOAD _t1, (offset + UNIT(1))(src); \
77 LOAD _t2, (offset + UNIT(2))(src); \ 77 LOAD _t2, (offset + UNIT(2))(src); \
78 LOAD _t3, (offset + UNIT(3))(src); \ 78 LOAD _t3, (offset + UNIT(3))(src); \
79 ADDC(_t0, _t1); \
80 ADDC(_t2, _t3); \
79 ADDC(sum, _t0); \ 81 ADDC(sum, _t0); \
80 ADDC(sum, _t1); \ 82 ADDC(sum, _t2)
81 ADDC(sum, _t2); \
82 ADDC(sum, _t3)
83 83
84#ifdef USE_DOUBLE 84#ifdef USE_DOUBLE
85#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ 85#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \
@@ -504,21 +504,21 @@ LEAF(csum_partial)
504 SUB len, len, 8*NBYTES 504 SUB len, len, 8*NBYTES
505 ADD src, src, 8*NBYTES 505 ADD src, src, 8*NBYTES
506 STORE(t0, UNIT(0)(dst), .Ls_exc\@) 506 STORE(t0, UNIT(0)(dst), .Ls_exc\@)
507 ADDC(sum, t0) 507 ADDC(t0, t1)
508 STORE(t1, UNIT(1)(dst), .Ls_exc\@) 508 STORE(t1, UNIT(1)(dst), .Ls_exc\@)
509 ADDC(sum, t1) 509 ADDC(sum, t0)
510 STORE(t2, UNIT(2)(dst), .Ls_exc\@) 510 STORE(t2, UNIT(2)(dst), .Ls_exc\@)
511 ADDC(sum, t2) 511 ADDC(t2, t3)
512 STORE(t3, UNIT(3)(dst), .Ls_exc\@) 512 STORE(t3, UNIT(3)(dst), .Ls_exc\@)
513 ADDC(sum, t3) 513 ADDC(sum, t2)
514 STORE(t4, UNIT(4)(dst), .Ls_exc\@) 514 STORE(t4, UNIT(4)(dst), .Ls_exc\@)
515 ADDC(sum, t4) 515 ADDC(t4, t5)
516 STORE(t5, UNIT(5)(dst), .Ls_exc\@) 516 STORE(t5, UNIT(5)(dst), .Ls_exc\@)
517 ADDC(sum, t5) 517 ADDC(sum, t4)
518 STORE(t6, UNIT(6)(dst), .Ls_exc\@) 518 STORE(t6, UNIT(6)(dst), .Ls_exc\@)
519 ADDC(sum, t6) 519 ADDC(t6, t7)
520 STORE(t7, UNIT(7)(dst), .Ls_exc\@) 520 STORE(t7, UNIT(7)(dst), .Ls_exc\@)
521 ADDC(sum, t7) 521 ADDC(sum, t6)
522 .set reorder /* DADDI_WAR */ 522 .set reorder /* DADDI_WAR */
523 ADD dst, dst, 8*NBYTES 523 ADD dst, dst, 8*NBYTES
524 bgez len, 1b 524 bgez len, 1b
@@ -544,13 +544,13 @@ LEAF(csum_partial)
544 SUB len, len, 4*NBYTES 544 SUB len, len, 4*NBYTES
545 ADD src, src, 4*NBYTES 545 ADD src, src, 4*NBYTES
546 STORE(t0, UNIT(0)(dst), .Ls_exc\@) 546 STORE(t0, UNIT(0)(dst), .Ls_exc\@)
547 ADDC(sum, t0) 547 ADDC(t0, t1)
548 STORE(t1, UNIT(1)(dst), .Ls_exc\@) 548 STORE(t1, UNIT(1)(dst), .Ls_exc\@)
549 ADDC(sum, t1) 549 ADDC(sum, t0)
550 STORE(t2, UNIT(2)(dst), .Ls_exc\@) 550 STORE(t2, UNIT(2)(dst), .Ls_exc\@)
551 ADDC(sum, t2) 551 ADDC(t2, t3)
552 STORE(t3, UNIT(3)(dst), .Ls_exc\@) 552 STORE(t3, UNIT(3)(dst), .Ls_exc\@)
553 ADDC(sum, t3) 553 ADDC(sum, t2)
554 .set reorder /* DADDI_WAR */ 554 .set reorder /* DADDI_WAR */
555 ADD dst, dst, 4*NBYTES 555 ADD dst, dst, 4*NBYTES
556 beqz len, .Ldone\@ 556 beqz len, .Ldone\@
@@ -649,13 +649,13 @@ LEAF(csum_partial)
649 nop # improves slotting 649 nop # improves slotting
650#endif 650#endif
651 STORE(t0, UNIT(0)(dst), .Ls_exc\@) 651 STORE(t0, UNIT(0)(dst), .Ls_exc\@)
652 ADDC(sum, t0) 652 ADDC(t0, t1)
653 STORE(t1, UNIT(1)(dst), .Ls_exc\@) 653 STORE(t1, UNIT(1)(dst), .Ls_exc\@)
654 ADDC(sum, t1) 654 ADDC(sum, t0)
655 STORE(t2, UNIT(2)(dst), .Ls_exc\@) 655 STORE(t2, UNIT(2)(dst), .Ls_exc\@)
656 ADDC(sum, t2) 656 ADDC(t2, t3)
657 STORE(t3, UNIT(3)(dst), .Ls_exc\@) 657 STORE(t3, UNIT(3)(dst), .Ls_exc\@)
658 ADDC(sum, t3) 658 ADDC(sum, t2)
659 .set reorder /* DADDI_WAR */ 659 .set reorder /* DADDI_WAR */
660 ADD dst, dst, 4*NBYTES 660 ADD dst, dst, 4*NBYTES
661 bne len, rem, 1b 661 bne len, rem, 1b