diff options
author | David S. Miller <davem@davemloft.net> | 2017-04-24 22:42:34 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-04-24 23:32:15 -0400 |
commit | 14933dc8d9964e46f1d5bd2a4dfe3d3be8e420e0 (patch) | |
tree | e29c213f1f2bcd31a9d5f95d4b02936ba6e05899 /arch/sparc/net | |
parent | e3a724edeec3836ed44675a6587a6db7b6b68dbe (diff) |
sparc64: Improve 64-bit constant loading in eBPF JIT.
Doing a full 64-bit decomposition is really stupid especially for
simple values like 0 and -1.
But if we are going to optimize this, go all the way and try for all 2
and 3 instruction sequences not requiring a temporary register as
well.
First we do the easy cases where it's a zero or sign extended 32-bit
number (sethi+or, sethi+xor, respectively).
Then we try to find a range of set bits we can load simply then shift
up into place, in various ways.
Then we try negating the constant and see if we can do a simple
sequence using that with a xor at the end. (f.e. the range of set
bits can't be loaded simply, but for the negated value it can)
The final optimized strategy involves 4 instructions sequences not
needing a temporary register.
Otherwise we sadly fully decompose using a temp..
Example, from ALU64_XOR_K: 0x0000ffffffff0000 ^ 0x0 = 0x0000ffffffff0000:
0000000000000000 <foo>:
0: 9d e3 bf 50 save %sp, -176, %sp
4: 01 00 00 00 nop
8: 90 10 00 18 mov %i0, %o0
c: 13 3f ff ff sethi %hi(0xfffffc00), %o1
10: 92 12 63 ff or %o1, 0x3ff, %o1 ! ffffffff <foo+0xffffffff>
14: 93 2a 70 10 sllx %o1, 0x10, %o1
18: 15 3f ff ff sethi %hi(0xfffffc00), %o2
1c: 94 12 a3 ff or %o2, 0x3ff, %o2 ! ffffffff <foo+0xffffffff>
20: 95 2a b0 10 sllx %o2, 0x10, %o2
24: 92 1a 60 00 xor %o1, 0, %o1
28: 12 e2 40 8a cxbe %o1, %o2, 38 <foo+0x38>
2c: 9a 10 20 02 mov 2, %o5
30: 10 60 00 03 b,pn %xcc, 3c <foo+0x3c>
34: 01 00 00 00 nop
38: 9a 10 20 01 mov 1, %o5 ! 1 <foo+0x1>
3c: 81 c7 e0 08 ret
40: 91 eb 40 00 restore %o5, %g0, %o0
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch/sparc/net')
-rw-r--r-- | arch/sparc/net/bpf_jit_comp_64.c | 249 |
1 files changed, 245 insertions, 4 deletions
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 2b2f3c3335ce..ec7d10da94f0 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c | |||
@@ -28,6 +28,11 @@ static inline bool is_simm5(unsigned int value) | |||
28 | return value + 0x10 < 0x20; | 28 | return value + 0x10 < 0x20; |
29 | } | 29 | } |
30 | 30 | ||
31 | static inline bool is_sethi(unsigned int value) | ||
32 | { | ||
33 | return (value & ~0x3fffff) == 0; | ||
34 | } | ||
35 | |||
31 | static void bpf_flush_icache(void *start_, void *end_) | 36 | static void bpf_flush_icache(void *start_, void *end_) |
32 | { | 37 | { |
33 | /* Cheetah's I-cache is fully coherent. */ | 38 | /* Cheetah's I-cache is fully coherent. */ |
@@ -367,16 +372,252 @@ static void emit_loadimm_sext(s32 K, unsigned int dest, struct jit_ctx *ctx) | |||
367 | } | 372 | } |
368 | } | 373 | } |
369 | 374 | ||
375 | static void analyze_64bit_constant(u32 high_bits, u32 low_bits, | ||
376 | int *hbsp, int *lbsp, int *abbasp) | ||
377 | { | ||
378 | int lowest_bit_set, highest_bit_set, all_bits_between_are_set; | ||
379 | int i; | ||
380 | |||
381 | lowest_bit_set = highest_bit_set = -1; | ||
382 | i = 0; | ||
383 | do { | ||
384 | if ((lowest_bit_set == -1) && ((low_bits >> i) & 1)) | ||
385 | lowest_bit_set = i; | ||
386 | if ((highest_bit_set == -1) && ((high_bits >> (32 - i - 1)) & 1)) | ||
387 | highest_bit_set = (64 - i - 1); | ||
388 | } while (++i < 32 && (highest_bit_set == -1 || | ||
389 | lowest_bit_set == -1)); | ||
390 | if (i == 32) { | ||
391 | i = 0; | ||
392 | do { | ||
393 | if (lowest_bit_set == -1 && ((high_bits >> i) & 1)) | ||
394 | lowest_bit_set = i + 32; | ||
395 | if (highest_bit_set == -1 && | ||
396 | ((low_bits >> (32 - i - 1)) & 1)) | ||
397 | highest_bit_set = 32 - i - 1; | ||
398 | } while (++i < 32 && (highest_bit_set == -1 || | ||
399 | lowest_bit_set == -1)); | ||
400 | } | ||
401 | |||
402 | all_bits_between_are_set = 1; | ||
403 | for (i = lowest_bit_set; i <= highest_bit_set; i++) { | ||
404 | if (i < 32) { | ||
405 | if ((low_bits & (1 << i)) != 0) | ||
406 | continue; | ||
407 | } else { | ||
408 | if ((high_bits & (1 << (i - 32))) != 0) | ||
409 | continue; | ||
410 | } | ||
411 | all_bits_between_are_set = 0; | ||
412 | break; | ||
413 | } | ||
414 | *hbsp = highest_bit_set; | ||
415 | *lbsp = lowest_bit_set; | ||
416 | *abbasp = all_bits_between_are_set; | ||
417 | } | ||
418 | |||
419 | static unsigned long create_simple_focus_bits(unsigned long high_bits, | ||
420 | unsigned long low_bits, | ||
421 | int lowest_bit_set, int shift) | ||
422 | { | ||
423 | long hi, lo; | ||
424 | |||
425 | if (lowest_bit_set < 32) { | ||
426 | lo = (low_bits >> lowest_bit_set) << shift; | ||
427 | hi = ((high_bits << (32 - lowest_bit_set)) << shift); | ||
428 | } else { | ||
429 | lo = 0; | ||
430 | hi = ((high_bits >> (lowest_bit_set - 32)) << shift); | ||
431 | } | ||
432 | return hi | lo; | ||
433 | } | ||
434 | |||
435 | static bool const64_is_2insns(unsigned long high_bits, | ||
436 | unsigned long low_bits) | ||
437 | { | ||
438 | int highest_bit_set, lowest_bit_set, all_bits_between_are_set; | ||
439 | |||
440 | if (high_bits == 0 || high_bits == 0xffffffff) | ||
441 | return true; | ||
442 | |||
443 | analyze_64bit_constant(high_bits, low_bits, | ||
444 | &highest_bit_set, &lowest_bit_set, | ||
445 | &all_bits_between_are_set); | ||
446 | |||
447 | if ((highest_bit_set == 63 || lowest_bit_set == 0) && | ||
448 | all_bits_between_are_set != 0) | ||
449 | return true; | ||
450 | |||
451 | if (highest_bit_set - lowest_bit_set < 21) | ||
452 | return true; | ||
453 | |||
454 | return false; | ||
455 | } | ||
456 | |||
457 | static void sparc_emit_set_const64_quick2(unsigned long high_bits, | ||
458 | unsigned long low_imm, | ||
459 | unsigned int dest, | ||
460 | int shift_count, struct jit_ctx *ctx) | ||
461 | { | ||
462 | emit_loadimm32(high_bits, dest, ctx); | ||
463 | |||
464 | /* Now shift it up into place. */ | ||
465 | emit_alu_K(SLLX, dest, shift_count, ctx); | ||
466 | |||
467 | /* If there is a low immediate part piece, finish up by | ||
468 | * putting that in as well. | ||
469 | */ | ||
470 | if (low_imm != 0) | ||
471 | emit(OR | IMMED | RS1(dest) | S13(low_imm) | RD(dest), ctx); | ||
472 | } | ||
473 | |||
370 | static void emit_loadimm64(u64 K, unsigned int dest, struct jit_ctx *ctx) | 474 | static void emit_loadimm64(u64 K, unsigned int dest, struct jit_ctx *ctx) |
371 | { | 475 | { |
476 | int all_bits_between_are_set, lowest_bit_set, highest_bit_set; | ||
372 | unsigned int tmp = bpf2sparc[TMP_REG_1]; | 477 | unsigned int tmp = bpf2sparc[TMP_REG_1]; |
373 | u32 high_part = (K >> 32); | 478 | u32 low_bits = (K & 0xffffffff); |
374 | u32 low_part = (K & 0xffffffff); | 479 | u32 high_bits = (K >> 32); |
480 | |||
481 | /* These two tests also take care of all of the one | ||
482 | * instruction cases. | ||
483 | */ | ||
484 | if (high_bits == 0xffffffff && (low_bits & 0x80000000)) | ||
485 | return emit_loadimm_sext(K, dest, ctx); | ||
486 | if (high_bits == 0x00000000) | ||
487 | return emit_loadimm32(K, dest, ctx); | ||
488 | |||
489 | analyze_64bit_constant(high_bits, low_bits, &highest_bit_set, | ||
490 | &lowest_bit_set, &all_bits_between_are_set); | ||
491 | |||
492 | /* 1) mov -1, %reg | ||
493 | * sllx %reg, shift, %reg | ||
494 | * 2) mov -1, %reg | ||
495 | * srlx %reg, shift, %reg | ||
496 | * 3) mov some_small_const, %reg | ||
497 | * sllx %reg, shift, %reg | ||
498 | */ | ||
499 | if (((highest_bit_set == 63 || lowest_bit_set == 0) && | ||
500 | all_bits_between_are_set != 0) || | ||
501 | ((highest_bit_set - lowest_bit_set) < 12)) { | ||
502 | int shift = lowest_bit_set; | ||
503 | long the_const = -1; | ||
504 | |||
505 | if ((highest_bit_set != 63 && lowest_bit_set != 0) || | ||
506 | all_bits_between_are_set == 0) { | ||
507 | the_const = | ||
508 | create_simple_focus_bits(high_bits, low_bits, | ||
509 | lowest_bit_set, 0); | ||
510 | } else if (lowest_bit_set == 0) | ||
511 | shift = -(63 - highest_bit_set); | ||
512 | |||
513 | emit(OR | IMMED | RS1(G0) | S13(the_const) | RD(dest), ctx); | ||
514 | if (shift > 0) | ||
515 | emit_alu_K(SLLX, dest, shift, ctx); | ||
516 | else if (shift < 0) | ||
517 | emit_alu_K(SRLX, dest, -shift, ctx); | ||
518 | |||
519 | return; | ||
520 | } | ||
521 | |||
522 | /* Now a range of 22 or less bits set somewhere. | ||
523 | * 1) sethi %hi(focus_bits), %reg | ||
524 | * sllx %reg, shift, %reg | ||
525 | * 2) sethi %hi(focus_bits), %reg | ||
526 | * srlx %reg, shift, %reg | ||
527 | */ | ||
528 | if ((highest_bit_set - lowest_bit_set) < 21) { | ||
529 | unsigned long focus_bits = | ||
530 | create_simple_focus_bits(high_bits, low_bits, | ||
531 | lowest_bit_set, 10); | ||
532 | |||
533 | emit(SETHI(focus_bits, dest), ctx); | ||
534 | |||
535 | /* If lowest_bit_set == 10 then a sethi alone could | ||
536 | * have done it. | ||
537 | */ | ||
538 | if (lowest_bit_set < 10) | ||
539 | emit_alu_K(SRLX, dest, 10 - lowest_bit_set, ctx); | ||
540 | else if (lowest_bit_set > 10) | ||
541 | emit_alu_K(SLLX, dest, lowest_bit_set - 10, ctx); | ||
542 | return; | ||
543 | } | ||
544 | |||
545 | /* Ok, now 3 instruction sequences. */ | ||
546 | if (low_bits == 0) { | ||
547 | emit_loadimm32(high_bits, dest, ctx); | ||
548 | emit_alu_K(SLLX, dest, 32, ctx); | ||
549 | return; | ||
550 | } | ||
551 | |||
552 | /* We may be able to do something quick | ||
553 | * when the constant is negated, so try that. | ||
554 | */ | ||
555 | if (const64_is_2insns((~high_bits) & 0xffffffff, | ||
556 | (~low_bits) & 0xfffffc00)) { | ||
557 | /* NOTE: The trailing bits get XOR'd so we need the | ||
558 | * non-negated bits, not the negated ones. | ||
559 | */ | ||
560 | unsigned long trailing_bits = low_bits & 0x3ff; | ||
561 | |||
562 | if ((((~high_bits) & 0xffffffff) == 0 && | ||
563 | ((~low_bits) & 0x80000000) == 0) || | ||
564 | (((~high_bits) & 0xffffffff) == 0xffffffff && | ||
565 | ((~low_bits) & 0x80000000) != 0)) { | ||
566 | unsigned long fast_int = (~low_bits & 0xffffffff); | ||
567 | |||
568 | if ((is_sethi(fast_int) && | ||
569 | (~high_bits & 0xffffffff) == 0)) { | ||
570 | emit(SETHI(fast_int, dest), ctx); | ||
571 | } else if (is_simm13(fast_int)) { | ||
572 | emit(OR | IMMED | RS1(G0) | S13(fast_int) | RD(dest), ctx); | ||
573 | } else { | ||
574 | emit_loadimm64(fast_int, dest, ctx); | ||
575 | } | ||
576 | } else { | ||
577 | u64 n = ((~low_bits) & 0xfffffc00) | | ||
578 | (((unsigned long)((~high_bits) & 0xffffffff))<<32); | ||
579 | emit_loadimm64(n, dest, ctx); | ||
580 | } | ||
581 | |||
582 | low_bits = -0x400 | trailing_bits; | ||
583 | |||
584 | emit(XOR | IMMED | RS1(dest) | S13(low_bits) | RD(dest), ctx); | ||
585 | return; | ||
586 | } | ||
587 | |||
588 | /* 1) sethi %hi(xxx), %reg | ||
589 | * or %reg, %lo(xxx), %reg | ||
590 | * sllx %reg, yyy, %reg | ||
591 | */ | ||
592 | if ((highest_bit_set - lowest_bit_set) < 32) { | ||
593 | unsigned long focus_bits = | ||
594 | create_simple_focus_bits(high_bits, low_bits, | ||
595 | lowest_bit_set, 0); | ||
596 | |||
597 | /* So what we know is that the set bits straddle the | ||
598 | * middle of the 64-bit word. | ||
599 | */ | ||
600 | sparc_emit_set_const64_quick2(focus_bits, 0, dest, | ||
601 | lowest_bit_set, ctx); | ||
602 | return; | ||
603 | } | ||
604 | |||
605 | /* 1) sethi %hi(high_bits), %reg | ||
606 | * or %reg, %lo(high_bits), %reg | ||
607 | * sllx %reg, 32, %reg | ||
608 | * or %reg, low_bits, %reg | ||
609 | */ | ||
610 | if (is_simm13(low_bits) && ((int)low_bits > 0)) { | ||
611 | sparc_emit_set_const64_quick2(high_bits, low_bits, | ||
612 | dest, 32, ctx); | ||
613 | return; | ||
614 | } | ||
375 | 615 | ||
616 | /* Oh well, we tried... Do a full 64-bit decomposition. */ | ||
376 | ctx->tmp_1_used = true; | 617 | ctx->tmp_1_used = true; |
377 | 618 | ||
378 | emit_set_const(high_part, tmp, ctx); | 619 | emit_loadimm32(high_bits, tmp, ctx); |
379 | emit_set_const(low_part, dest, ctx); | 620 | emit_loadimm32(low_bits, dest, ctx); |
380 | emit_alu_K(SLLX, tmp, 32, ctx); | 621 | emit_alu_K(SLLX, tmp, 32, ctx); |
381 | emit(OR | RS1(dest) | RS2(tmp) | RD(dest), ctx); | 622 | emit(OR | RS1(dest) | RS2(tmp) | RD(dest), ctx); |
382 | } | 623 | } |