aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/media/platform/vivi.c
diff options
context:
space:
mode:
authorKirill Smelkov <kirr@mns.spb.ru>2012-11-02 08:10:30 -0400
committerMauro Carvalho Chehab <mchehab@redhat.com>2012-12-21 15:25:21 -0500
commite3a8b4d22b47b3ee17baccdc5b351465f890671c (patch)
treefabb3426239a407b21b2c945be39cc5c08ab8143 /drivers/media/platform/vivi.c
parent011b2aad587a770e758711f465312ac843440d2a (diff)
[media] vivi: Optimize gen_text()
I've noticed that vivi takes a lot of CPU to produce its frames. For example for 8 devices and 8 simple programs running, where each captures YUY2 640x480 and displays it to X via SDL, profile timing is as follows: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 82K of event 'cycles' # Event count (approx.): 31551930117 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 49.48% vivi-* [vivi] [k] gen_twopix 10.79% vivi-* [kernel.kallsyms] [k] memcpy 10.02% rawv libc-2.13.so [.] __memcpy_ssse3 8.35% vivi-* [vivi] [k] gen_text.constprop.6 5.06% Xorg [unknown] [.] 0xa73015f8 2.32% rawv [vivi] [k] gen_twopix 1.22% rawv [vivi] [k] precalculate_line 1.20% vivi-* [vivi] [k] vivi_fillbuff (rawv is display program, vivi-* is a combination of vivi-000 through vivi-007) so a lot of time is spent in gen_twopix() which as the following call-graph profile shows ... 49.48% vivi-* [vivi] [k] gen_twopix | --- gen_twopix | |--96.30%-- gen_text.constprop.6 | vivi_fillbuff | vivi_thread | kthread | ret_from_kernel_thread | --3.70%-- vivi_fillbuff vivi_thread kthread ret_from_kernel_thread ... is called mostly from gen_text(). If we look at gen_text(), in the inner loop, we see if (chr & (1 << (7 - i))) gen_twopix(dev, pos + j * dev->pixelsize, WHITE, (x+y) & 1); else gen_twopix(dev, pos + j * dev->pixelsize, TEXT_BLACK, (x+y) & 1); which calls gen_twopix() for every character pixel, and that is very expensive, because gen_twopix() branches several times. Now, let's note, that we operate on only two colors - WHITE and TEXT_BLACK, and that the pixels for those colors could be precomputed and gen_twopix() moved out of the inner loop. Also note, that for black and white colors even/odd does not make a difference for all supported pixel formats, so we could stop doing that `odd` gen_twopix() parameter game. 
So the first thing we are doing here is 1) moving gen_twopix() calls out of gen_text() into vivi_fillbuff(), to pregenerate black and white colors, just before printing starts. what we have next is that gen_text's font rendering loop, even with gen_twopix() calls moved out, was inefficient and branchy, so let's 2) rewrite gen_text() loop so it uses less variables + unroll char horizontal-rendering loop + instantiate 3 code paths for pixelsizes 2,3 and 4 so that in all inner loops we don't have to branch or make indirections (*). Done all above reworks, for gen_text() we get nice, non-branchy streamlined code (showing loop for pixelsize=2): ? cmp $0x2,%eax ? ? jne 26 ? mov -0x18(%ebp),%eax ? mov -0x20(%ebp),%edi ? imul -0x20(%ebp),%eax ? movzwl 0x3ffc(%ebx),%esi 0,08 ? movzwl 0x4000(%ebx),%ecx 0,04 ? add %edi,%edi ? mov 0x0,%ebx 0,51 ? mov %edi,-0x1c(%ebp) ? mov %ebx,-0x14(%ebp) ? movl $0x0,-0x10(%ebp) ? lea 0x20(%edx,%eax,2),%eax ? mov %eax,-0x18(%ebp) ? xchg %ax,%ax 0,04 ? a0: mov 0x8(%ebp),%ebx ? mov -0x18(%ebp),%eax 0,04 ? movzbl (%ebx),%edx 0,16 ? test %dl,%dl 0,04 ? ? je 128 0,08 ? lea 0x0(%esi),%esi 1,61 ? b0:???shl $0x4,%edx 1,02 ? ? mov -0x14(%ebp),%edi 2,04 ? ? add -0x10(%ebp),%edx 2,24 ? ? lea 0x1(%ebx),%ebx 0,27 ? ? movzbl (%edi,%edx,1),%edx 9,92 ? ? mov %esi,%edi 0,39 ? ? test %dl,%dl 2,04 ? ? cmovns %ecx,%edi 4,63 ? ? test $0x40,%dl 0,55 ? ? mov %di,(%eax) 3,76 ? ? mov %esi,%edi 0,71 ? ? cmove %ecx,%edi 3,41 ? ? test $0x20,%dl 0,75 ? ? mov %di,0x2(%eax) 2,43 ? ? mov %esi,%edi 0,59 ? ? cmove %ecx,%edi 4,59 ? ? test $0x10,%dl 0,67 ? ? mov %di,0x4(%eax) 2,55 ? ? mov %esi,%edi 0,78 ? ? cmove %ecx,%edi 4,31 ? ? test $0x8,%dl 0,67 ? ? mov %di,0x6(%eax) 5,76 ? ? mov %esi,%edi 1,80 ? ? cmove %ecx,%edi 4,20 ? ? test $0x4,%dl 0,86 ? ? mov %di,0x8(%eax) 2,98 ? ? mov %esi,%edi 1,37 ? ? cmove %ecx,%edi 4,67 ? ? test $0x2,%dl 0,20 ? ? mov %di,0xa(%eax) 2,78 ? ? mov %esi,%edi 0,75 ? ? cmove %ecx,%edi 3,92 ? ? and $0x1,%edx 0,75 ? ? mov %esi,%edx 2,59 ? ? 
mov %di,0xc(%eax) 0,59 ? ? cmove %ecx,%edx 3,10 ? ? mov %dx,0xe(%eax) 2,39 ? ? add $0x10,%eax 0,51 ? ? movzbl (%ebx),%edx 2,86 ? ? test %dl,%dl 2,31 ? ???jne b0 0,04 ?128: addl $0x1,-0x10(%ebp) 4,00 ? mov -0x1c(%ebp),%eax 0,04 ? add %eax,-0x18(%ebp) 0,08 ? cmpl $0x10,-0x10(%ebp) ? ? jne a0 which almost goes away from the profile: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 49K of event 'cycles' # Event count (approx.): 16799780016 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 27.51% rawv libc-2.13.so [.] __memcpy_ssse3 23.77% vivi-* [kernel.kallsyms] [k] memcpy 9.96% Xorg [unknown] [.] 0xa76f5e12 4.94% vivi-* [vivi] [k] gen_text.constprop.6 4.44% rawv [vivi] [k] gen_twopix 3.17% vivi-* [vivi] [k] vivi_fillbuff 2.45% rawv [vivi] [k] precalculate_line 1.20% swapper [kernel.kallsyms] [k] read_hpet i.e. gen_twopix() overhead dropped from 49% to 4% and gen_text() loops from ~8% to ~4%, and the overall cycle count dropped from 31551930117 to 16799780016 which is a ~1.9x speedup of the whole workload. (*) for RGB24 rendering I've introduced x24, which could be thought of as a synthetic u24 for simplifying the code. That's done because when memcpy is used for conditional assignment, gcc generates suboptimal code with more indirections. Fortunately, in C struct assignment is builtin and that's all we need from pixeltype for font rendering. Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Diffstat (limited to 'drivers/media/platform/vivi.c')
-rw-r--r--drivers/media/platform/vivi.c61
1 files changed, 43 insertions, 18 deletions
diff --git a/drivers/media/platform/vivi.c b/drivers/media/platform/vivi.c
index c2f424f32450..3801a8d00946 100644
--- a/drivers/media/platform/vivi.c
+++ b/drivers/media/platform/vivi.c
@@ -240,6 +240,7 @@ struct vivi_dev {
240 u8 line[MAX_WIDTH * 8]; 240 u8 line[MAX_WIDTH * 8];
241 unsigned int pixelsize; 241 unsigned int pixelsize;
242 u8 alpha_component; 242 u8 alpha_component;
243 u32 textfg, textbg;
243}; 244};
244 245
245/* ------------------------------------------------------------------ 246/* ------------------------------------------------------------------
@@ -520,33 +521,54 @@ static void precalculate_line(struct vivi_dev *dev)
520 } 521 }
521} 522}
522 523
524/* need this to do rgb24 rendering */
525typedef struct { u16 __; u8 _; } __attribute__((packed)) x24;
526
523static void gen_text(struct vivi_dev *dev, char *basep, 527static void gen_text(struct vivi_dev *dev, char *basep,
524 int y, int x, char *text) 528 int y, int x, char *text)
525{ 529{
526 int line; 530 int line;
531 unsigned int width = dev->width;
527 532
528 /* Checks if it is possible to show string */ 533 /* Checks if it is possible to show string */
529 if (y + 16 >= dev->height || x + strlen(text) * 8 >= dev->width) 534 if (y + 16 >= dev->height || x + strlen(text) * 8 >= width)
530 return; 535 return;
531 536
532 /* Print stream time */ 537 /* Print stream time */
533 for (line = y; line < y + 16; line++) { 538#define PRINTSTR(PIXTYPE) do { \
534 int j = 0; 539 PIXTYPE fg; \
535 char *pos = basep + line * dev->width * dev->pixelsize + x * dev->pixelsize; 540 PIXTYPE bg; \
536 char *s; 541 memcpy(&fg, &dev->textfg, sizeof(PIXTYPE)); \
537 542 memcpy(&bg, &dev->textbg, sizeof(PIXTYPE)); \
538 for (s = text; *s; s++) { 543 \
539 u8 chr = font8x16[*s * 16 + line - y]; 544 for (line = 0; line < 16; line++) { \
540 int i; 545 PIXTYPE *pos = (PIXTYPE *)( basep + ((y + line) * width + x) * sizeof(PIXTYPE) ); \
541 546 u8 *s; \
542 for (i = 0; i < 7; i++, j++) { 547 \
543 /* Draw white font on black background */ 548 for (s = text; *s; s++) { \
544 if (chr & (1 << (7 - i))) 549 u8 chr = font8x16[*s * 16 + line]; \
545 gen_twopix(dev, pos + j * dev->pixelsize, WHITE, (x+y) & 1); 550 \
546 else 551 pos[0] = (chr & (0x01 << 7) ? fg : bg); \
547 gen_twopix(dev, pos + j * dev->pixelsize, TEXT_BLACK, (x+y) & 1); 552 pos[1] = (chr & (0x01 << 6) ? fg : bg); \
548 } 553 pos[2] = (chr & (0x01 << 5) ? fg : bg); \
549 } 554 pos[3] = (chr & (0x01 << 4) ? fg : bg); \
555 pos[4] = (chr & (0x01 << 3) ? fg : bg); \
556 pos[5] = (chr & (0x01 << 2) ? fg : bg); \
557 pos[6] = (chr & (0x01 << 1) ? fg : bg); \
558 pos[7] = (chr & (0x01 << 0) ? fg : bg); \
559 \
560 pos += 8; \
561 } \
562 } \
563} while (0)
564
565 switch (dev->pixelsize) {
566 case 2:
567 PRINTSTR(u16); break;
568 case 4:
569 PRINTSTR(u32); break;
570 case 3:
571 PRINTSTR(x24); break;
550 } 572 }
551} 573}
552 574
@@ -570,6 +592,9 @@ static void vivi_fillbuff(struct vivi_dev *dev, struct vivi_buffer *buf)
570 592
571 /* Updates stream time */ 593 /* Updates stream time */
572 594
595 gen_twopix(dev, (u8 *)&dev->textbg, TEXT_BLACK, /*odd=*/ 0);
596 gen_twopix(dev, (u8 *)&dev->textfg, WHITE, /*odd=*/ 0);
597
573 dev->ms += jiffies_to_msecs(jiffies - dev->jiffies); 598 dev->ms += jiffies_to_msecs(jiffies - dev->jiffies);
574 dev->jiffies = jiffies; 599 dev->jiffies = jiffies;
575 ms = dev->ms; 600 ms = dev->ms;