summaryrefslogtreecommitdiffstats
path: root/arch/sparc/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2017-04-24 22:42:34 -0400
committerDavid S. Miller <davem@davemloft.net>2017-04-24 23:32:15 -0400
commit14933dc8d9964e46f1d5bd2a4dfe3d3be8e420e0 (patch)
treee29c213f1f2bcd31a9d5f95d4b02936ba6e05899 /arch/sparc/net
parente3a724edeec3836ed44675a6587a6db7b6b68dbe (diff)
sparc64: Improve 64-bit constant loading in eBPF JIT.
Doing a full 64-bit decomposition is really stupid especially for
simple values like 0 and -1.

But if we are going to optimize this, go all the way and try for all 2
and 3 instruction sequences not requiring a temporary register as
well.

First we do the easy cases where it's a zero or sign extended 32-bit
number (sethi+or, sethi+xor, respectively).

Then we try to find a range of set bits we can load simply then shift
up into place, in various ways.

Then we try negating the constant and see if we can do a simple
sequence using that with a xor at the end.  (f.e. the range of set
bits can't be loaded simply, but for the negated value it can)

The final optimized strategy involves 4 instruction sequences not
needing a temporary register.

Otherwise we sadly fully decompose using a temp.

Example, from ALU64_XOR_K: 0x0000ffffffff0000 ^ 0x0 = 0x0000ffffffff0000:

0000000000000000 <foo>:
   0:	9d e3 bf 50 	save  %sp, -176, %sp
   4:	01 00 00 00 	nop
   8:	90 10 00 18 	mov  %i0, %o0
   c:	13 3f ff ff 	sethi  %hi(0xfffffc00), %o1
  10:	92 12 63 ff 	or  %o1, 0x3ff, %o1	! ffffffff <foo+0xffffffff>
  14:	93 2a 70 10 	sllx  %o1, 0x10, %o1
  18:	15 3f ff ff 	sethi  %hi(0xfffffc00), %o2
  1c:	94 12 a3 ff 	or  %o2, 0x3ff, %o2	! ffffffff <foo+0xffffffff>
  20:	95 2a b0 10 	sllx  %o2, 0x10, %o2
  24:	92 1a 60 00 	xor  %o1, 0, %o1
  28:	12 e2 40 8a 	cxbe  %o1, %o2, 38 <foo+0x38>
  2c:	9a 10 20 02 	mov  2, %o5
  30:	10 60 00 03 	b,pn  %xcc, 3c <foo+0x3c>
  34:	01 00 00 00 	nop
  38:	9a 10 20 01 	mov  1, %o5	! 1 <foo+0x1>
  3c:	81 c7 e0 08 	ret
  40:	91 eb 40 00 	restore  %o5, %g0, %o0

Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch/sparc/net')
-rw-r--r--arch/sparc/net/bpf_jit_comp_64.c249
1 file changed, 245 insertions, 4 deletions
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 2b2f3c3335ce..ec7d10da94f0 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -28,6 +28,11 @@ static inline bool is_simm5(unsigned int value)
28 return value + 0x10 < 0x20; 28 return value + 0x10 < 0x20;
29} 29}
30 30
31static inline bool is_sethi(unsigned int value)
32{
33 return (value & ~0x3fffff) == 0;
34}
35
31static void bpf_flush_icache(void *start_, void *end_) 36static void bpf_flush_icache(void *start_, void *end_)
32{ 37{
33 /* Cheetah's I-cache is fully coherent. */ 38 /* Cheetah's I-cache is fully coherent. */
@@ -367,16 +372,252 @@ static void emit_loadimm_sext(s32 K, unsigned int dest, struct jit_ctx *ctx)
367 } 372 }
368} 373}
369 374
375static void analyze_64bit_constant(u32 high_bits, u32 low_bits,
376 int *hbsp, int *lbsp, int *abbasp)
377{
378 int lowest_bit_set, highest_bit_set, all_bits_between_are_set;
379 int i;
380
381 lowest_bit_set = highest_bit_set = -1;
382 i = 0;
383 do {
384 if ((lowest_bit_set == -1) && ((low_bits >> i) & 1))
385 lowest_bit_set = i;
386 if ((highest_bit_set == -1) && ((high_bits >> (32 - i - 1)) & 1))
387 highest_bit_set = (64 - i - 1);
388 } while (++i < 32 && (highest_bit_set == -1 ||
389 lowest_bit_set == -1));
390 if (i == 32) {
391 i = 0;
392 do {
393 if (lowest_bit_set == -1 && ((high_bits >> i) & 1))
394 lowest_bit_set = i + 32;
395 if (highest_bit_set == -1 &&
396 ((low_bits >> (32 - i - 1)) & 1))
397 highest_bit_set = 32 - i - 1;
398 } while (++i < 32 && (highest_bit_set == -1 ||
399 lowest_bit_set == -1));
400 }
401
402 all_bits_between_are_set = 1;
403 for (i = lowest_bit_set; i <= highest_bit_set; i++) {
404 if (i < 32) {
405 if ((low_bits & (1 << i)) != 0)
406 continue;
407 } else {
408 if ((high_bits & (1 << (i - 32))) != 0)
409 continue;
410 }
411 all_bits_between_are_set = 0;
412 break;
413 }
414 *hbsp = highest_bit_set;
415 *lbsp = lowest_bit_set;
416 *abbasp = all_bits_between_are_set;
417}
418
419static unsigned long create_simple_focus_bits(unsigned long high_bits,
420 unsigned long low_bits,
421 int lowest_bit_set, int shift)
422{
423 long hi, lo;
424
425 if (lowest_bit_set < 32) {
426 lo = (low_bits >> lowest_bit_set) << shift;
427 hi = ((high_bits << (32 - lowest_bit_set)) << shift);
428 } else {
429 lo = 0;
430 hi = ((high_bits >> (lowest_bit_set - 32)) << shift);
431 }
432 return hi | lo;
433}
434
435static bool const64_is_2insns(unsigned long high_bits,
436 unsigned long low_bits)
437{
438 int highest_bit_set, lowest_bit_set, all_bits_between_are_set;
439
440 if (high_bits == 0 || high_bits == 0xffffffff)
441 return true;
442
443 analyze_64bit_constant(high_bits, low_bits,
444 &highest_bit_set, &lowest_bit_set,
445 &all_bits_between_are_set);
446
447 if ((highest_bit_set == 63 || lowest_bit_set == 0) &&
448 all_bits_between_are_set != 0)
449 return true;
450
451 if (highest_bit_set - lowest_bit_set < 21)
452 return true;
453
454 return false;
455}
456
457static void sparc_emit_set_const64_quick2(unsigned long high_bits,
458 unsigned long low_imm,
459 unsigned int dest,
460 int shift_count, struct jit_ctx *ctx)
461{
462 emit_loadimm32(high_bits, dest, ctx);
463
464 /* Now shift it up into place. */
465 emit_alu_K(SLLX, dest, shift_count, ctx);
466
467 /* If there is a low immediate part piece, finish up by
468 * putting that in as well.
469 */
470 if (low_imm != 0)
471 emit(OR | IMMED | RS1(dest) | S13(low_imm) | RD(dest), ctx);
472}
473
370static void emit_loadimm64(u64 K, unsigned int dest, struct jit_ctx *ctx) 474static void emit_loadimm64(u64 K, unsigned int dest, struct jit_ctx *ctx)
371{ 475{
476 int all_bits_between_are_set, lowest_bit_set, highest_bit_set;
372 unsigned int tmp = bpf2sparc[TMP_REG_1]; 477 unsigned int tmp = bpf2sparc[TMP_REG_1];
373 u32 high_part = (K >> 32); 478 u32 low_bits = (K & 0xffffffff);
374 u32 low_part = (K & 0xffffffff); 479 u32 high_bits = (K >> 32);
480
481 /* These two tests also take care of all of the one
482 * instruction cases.
483 */
484 if (high_bits == 0xffffffff && (low_bits & 0x80000000))
485 return emit_loadimm_sext(K, dest, ctx);
486 if (high_bits == 0x00000000)
487 return emit_loadimm32(K, dest, ctx);
488
489 analyze_64bit_constant(high_bits, low_bits, &highest_bit_set,
490 &lowest_bit_set, &all_bits_between_are_set);
491
492 /* 1) mov -1, %reg
493 * sllx %reg, shift, %reg
494 * 2) mov -1, %reg
495 * srlx %reg, shift, %reg
496 * 3) mov some_small_const, %reg
497 * sllx %reg, shift, %reg
498 */
499 if (((highest_bit_set == 63 || lowest_bit_set == 0) &&
500 all_bits_between_are_set != 0) ||
501 ((highest_bit_set - lowest_bit_set) < 12)) {
502 int shift = lowest_bit_set;
503 long the_const = -1;
504
505 if ((highest_bit_set != 63 && lowest_bit_set != 0) ||
506 all_bits_between_are_set == 0) {
507 the_const =
508 create_simple_focus_bits(high_bits, low_bits,
509 lowest_bit_set, 0);
510 } else if (lowest_bit_set == 0)
511 shift = -(63 - highest_bit_set);
512
513 emit(OR | IMMED | RS1(G0) | S13(the_const) | RD(dest), ctx);
514 if (shift > 0)
515 emit_alu_K(SLLX, dest, shift, ctx);
516 else if (shift < 0)
517 emit_alu_K(SRLX, dest, -shift, ctx);
518
519 return;
520 }
521
522 /* Now a range of 22 or less bits set somewhere.
523 * 1) sethi %hi(focus_bits), %reg
524 * sllx %reg, shift, %reg
525 * 2) sethi %hi(focus_bits), %reg
526 * srlx %reg, shift, %reg
527 */
528 if ((highest_bit_set - lowest_bit_set) < 21) {
529 unsigned long focus_bits =
530 create_simple_focus_bits(high_bits, low_bits,
531 lowest_bit_set, 10);
532
533 emit(SETHI(focus_bits, dest), ctx);
534
535 /* If lowest_bit_set == 10 then a sethi alone could
536 * have done it.
537 */
538 if (lowest_bit_set < 10)
539 emit_alu_K(SRLX, dest, 10 - lowest_bit_set, ctx);
540 else if (lowest_bit_set > 10)
541 emit_alu_K(SLLX, dest, lowest_bit_set - 10, ctx);
542 return;
543 }
544
545 /* Ok, now 3 instruction sequences. */
546 if (low_bits == 0) {
547 emit_loadimm32(high_bits, dest, ctx);
548 emit_alu_K(SLLX, dest, 32, ctx);
549 return;
550 }
551
552 /* We may be able to do something quick
553 * when the constant is negated, so try that.
554 */
555 if (const64_is_2insns((~high_bits) & 0xffffffff,
556 (~low_bits) & 0xfffffc00)) {
557 /* NOTE: The trailing bits get XOR'd so we need the
558 * non-negated bits, not the negated ones.
559 */
560 unsigned long trailing_bits = low_bits & 0x3ff;
561
562 if ((((~high_bits) & 0xffffffff) == 0 &&
563 ((~low_bits) & 0x80000000) == 0) ||
564 (((~high_bits) & 0xffffffff) == 0xffffffff &&
565 ((~low_bits) & 0x80000000) != 0)) {
566 unsigned long fast_int = (~low_bits & 0xffffffff);
567
568 if ((is_sethi(fast_int) &&
569 (~high_bits & 0xffffffff) == 0)) {
570 emit(SETHI(fast_int, dest), ctx);
571 } else if (is_simm13(fast_int)) {
572 emit(OR | IMMED | RS1(G0) | S13(fast_int) | RD(dest), ctx);
573 } else {
574 emit_loadimm64(fast_int, dest, ctx);
575 }
576 } else {
577 u64 n = ((~low_bits) & 0xfffffc00) |
578 (((unsigned long)((~high_bits) & 0xffffffff))<<32);
579 emit_loadimm64(n, dest, ctx);
580 }
581
582 low_bits = -0x400 | trailing_bits;
583
584 emit(XOR | IMMED | RS1(dest) | S13(low_bits) | RD(dest), ctx);
585 return;
586 }
587
588 /* 1) sethi %hi(xxx), %reg
589 * or %reg, %lo(xxx), %reg
590 * sllx %reg, yyy, %reg
591 */
592 if ((highest_bit_set - lowest_bit_set) < 32) {
593 unsigned long focus_bits =
594 create_simple_focus_bits(high_bits, low_bits,
595 lowest_bit_set, 0);
596
597 /* So what we know is that the set bits straddle the
598 * middle of the 64-bit word.
599 */
600 sparc_emit_set_const64_quick2(focus_bits, 0, dest,
601 lowest_bit_set, ctx);
602 return;
603 }
604
605 /* 1) sethi %hi(high_bits), %reg
606 * or %reg, %lo(high_bits), %reg
607 * sllx %reg, 32, %reg
608 * or %reg, low_bits, %reg
609 */
610 if (is_simm13(low_bits) && ((int)low_bits > 0)) {
611 sparc_emit_set_const64_quick2(high_bits, low_bits,
612 dest, 32, ctx);
613 return;
614 }
375 615
616 /* Oh well, we tried... Do a full 64-bit decomposition. */
376 ctx->tmp_1_used = true; 617 ctx->tmp_1_used = true;
377 618
378 emit_set_const(high_part, tmp, ctx); 619 emit_loadimm32(high_bits, tmp, ctx);
379 emit_set_const(low_part, dest, ctx); 620 emit_loadimm32(low_bits, dest, ctx);
380 emit_alu_K(SLLX, tmp, 32, ctx); 621 emit_alu_K(SLLX, tmp, 32, ctx);
381 emit(OR | RS1(dest) | RS2(tmp) | RD(dest), ctx); 622 emit(OR | RS1(dest) | RS2(tmp) | RD(dest), ctx);
382} 623}