Skip to content

Commit

Permalink
8322770: Implement C2 VectorizedHashCode on AArch64
Browse files Browse the repository at this point in the history
Reviewed-by: aph, adinn
  • Loading branch information
mikabl-arm committed Sep 30, 2024
1 parent 52ba728 commit 475b894
Show file tree
Hide file tree
Showing 11 changed files with 1,355 additions and 580 deletions.
78 changes: 78 additions & 0 deletions src/hotspot/cpu/aarch64/aarch64.ad
Original file line number Diff line number Diff line change
Expand Up @@ -4931,6 +4931,60 @@ operand vRegD_V7()
interface(REG_INTER);
%}

// Double-register (D) views of SIMD&FP registers V12-V17, exposed as
// fixed-register operands so that instructs can claim these specific
// registers as TEMPs (see arrays_hashcode below).
operand vRegD_V12()
%{
constraint(ALLOC_IN_RC(v12_reg));
match(RegD);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}

operand vRegD_V13()
%{
constraint(ALLOC_IN_RC(v13_reg));
match(RegD);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}

operand vRegD_V14()
%{
constraint(ALLOC_IN_RC(v14_reg));
match(RegD);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}

operand vRegD_V15()
%{
constraint(ALLOC_IN_RC(v15_reg));
match(RegD);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}

operand vRegD_V16()
%{
constraint(ALLOC_IN_RC(v16_reg));
match(RegD);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}

operand vRegD_V17()
%{
constraint(ALLOC_IN_RC(v17_reg));
match(RegD);
op_cost(0);
format %{ %}
interface(REG_INTER);
%}

operand pReg()
%{
constraint(ALLOC_IN_RC(pr_reg));
Expand Down Expand Up @@ -16551,6 +16605,30 @@ instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result,
ins_pipe(pipe_class_memory);
%}

// Intrinsic for jdk.internal.util.ArraysSupport.vectorizedHashCode.
// Folds $cnt elements of type $basic_type (an immI constant) starting at
// $ary into $result. The SIMD&FP registers are reserved as temporaries for
// the underlying implementation; the input registers and flags are killed.
instruct arrays_hashcode(iRegP_R1 ary, iRegI_R2 cnt, iRegI_R0 result, immI basic_type,
vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3,
vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7,
vRegD_V12 vtmp8, vRegD_V13 vtmp9, vRegD_V14 vtmp10,
vRegD_V15 vtmp11, vRegD_V16 vtmp12, vRegD_V17 vtmp13,
rFlagsReg cr)
%{
match(Set result (VectorizedHashCode (Binary ary cnt) (Binary result basic_type)));
effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, TEMP vtmp6,
TEMP vtmp7, TEMP vtmp8, TEMP vtmp9, TEMP vtmp10, TEMP vtmp11, TEMP vtmp12, TEMP vtmp13,
USE_KILL ary, USE_KILL cnt, USE basic_type, KILL cr);

format %{ "Array HashCode array[] $ary,$cnt,$result,$basic_type -> $result // KILL all" %}
ins_encode %{
address tpc = __ arrays_hashcode($ary$$Register, $cnt$$Register, $result$$Register,
(BasicType)$basic_type$$constant);
// A nullptr return means the trampoline call to the stub could not be
// emitted; record the bailout so the compilation is retried/abandoned.
if (tpc == nullptr) {
ciEnv::current()->record_failure("CodeCache is full");
return;
}
%}
ins_pipe(pipe_class_memory);
%}

instruct count_positives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg cr)
%{
match(Set result (CountPositives ary1 len));
Expand Down
68 changes: 67 additions & 1 deletion src/hotspot/cpu/aarch64/assembler_aarch64.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
Expand Down Expand Up @@ -287,6 +287,11 @@ class Instruction_aarch64 {
f(r->raw_encoding(), lsb + 4, lsb);
}

// As `rf(FloatRegister)`, but encodes the register in only 4 bits, so only
// the lower 16 FloatRegisters (V0-V15) are representable. Used by encodings
// whose Vm field is architecturally restricted to V0-V15 (e.g. mulvs with
// 16-bit element arrangements below).
void lrf(FloatRegister r, int lsb) {
f(r->raw_encoding(), lsb + 3, lsb);
}

void prf(PRegister r, int lsb) {
f(r->raw_encoding(), lsb + 3, lsb);
}
Expand Down Expand Up @@ -765,6 +770,7 @@ class Assembler : public AbstractAssembler {
#define f current_insn.f
#define sf current_insn.sf
#define rf current_insn.rf
#define lrf current_insn.lrf
#define srf current_insn.srf
#define zrf current_insn.zrf
#define prf current_insn.prf
Expand Down Expand Up @@ -1590,6 +1596,16 @@ class Assembler : public AbstractAssembler {

#undef INSN

// Load/store a register, but with a BasicType parameter that selects the
// access size. Loaded signed integer values are extended to 64 bits.
void load(Register Rt, const Address &adr, BasicType bt) {
// opc 0b10 selects the sign-extending (to 64 bits) load form, 0b01 a
// plain load — see ld_st2 for the encoding of the opc field.
int op = (is_signed_subword_type(bt) || bt == T_INT) ? 0b10 : 0b01;
ld_st2(Rt, adr, exact_log2(type2aelembytes(bt)), op);
}
// Store the low type2aelembytes(bt) bytes of Rt to adr (opc 0b00 = store).
void store(Register Rt, const Address &adr, BasicType bt) {
ld_st2(Rt, adr, exact_log2(type2aelembytes(bt)), 0b00);
}

/* SIMD extensions
*
* We just use FloatRegister in the following. They are exactly the same
Expand Down Expand Up @@ -2587,6 +2603,7 @@ template<typename R, typename... Rx>
INSN(addpv, 0, 0b101111, true); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
INSN(smullv, 0, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(umullv, 1, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(smlalv, 0, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(umlalv, 1, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(maxv, 0, 0b011001, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
INSN(minv, 0, 0b011011, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
Expand Down Expand Up @@ -2860,6 +2877,28 @@ template<typename R, typename... Rx>
// FMULX - Vector - Scalar
INSN(fmulxvs, 1, 0b1001);

#undef INSN

// Multiply (vector, by scalar element): Vd<T> = Vn<T> * Vm.<elem>[index].
// For the 16-bit arrangements (T4H/T8H) the instruction encodes Vm in only
// four bits, so Vm must be one of V0-V15 — enforced by the third assert and
// by encoding Vm with lrf; the index then needs three bits (M:L in 21:20,
// H in 11). For 32-bit arrangements Vm gets the full 5-bit field via rf.
#define INSN(NAME, op1, op2) \
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index) { \
starti; \
assert(T == T4H || T == T8H || T == T2S || T == T4S, "invalid arrangement"); \
assert(index >= 0 && \
((T == T2S && index <= 1) || (T != T2S && index <= 3) || (T == T8H && index <= 7)), \
"invalid index"); \
assert((T != T4H && T != T8H) || Vm->encoding() < 16, "invalid source SIMD&FP register"); \
f(0, 31), f((int)T & 1, 30), f(op1, 29), f(0b01111, 28, 24); \
if (T == T4H || T == T8H) { \
f(0b01, 23, 22), f(index & 0b11, 21, 20), lrf(Vm, 16), f(index >> 2 & 1, 11); \
} else { \
f(0b10, 23, 22), f(index & 1, 21), rf(Vm, 16), f(index >> 1, 11); \
} \
f(op2, 15, 12), f(0, 10), rf(Vn, 5), rf(Vd, 0); \
}

// MUL - Vector - Scalar
INSN(mulvs, 0, 0b1000);

#undef INSN

// Floating-point Reciprocal Estimate
Expand Down Expand Up @@ -3023,6 +3062,33 @@ template<typename R, typename... Rx>
umov(Xd, Vn, T, index);
}

protected:
// Common encoder for the widening vector adds SADDW/SADDW2/UADDW/UADDW2:
// Vd<Ta> = Vn<Ta> + extend(Vm<Tb>), where the Ta elements are twice the
// width of the Tb elements (checked by the arrangement assert below).
void _xaddwv(bool is_unsigned, FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement Ta,
             FloatRegister Vm, SIMD_Arrangement Tb) {
  starti;
  assert((Tb >> 1) + 1 == (Ta >> 1), "Incompatible arrangement");
  f(0, 31), f((int)Tb & 1, 30), f(is_unsigned ? 1 : 0, 29), f(0b01110, 28, 24);
  f((int)(Ta >> 1) - 1, 23, 22), f(1, 21), rf(Vm, 16), f(0b000100, 15, 10), rf(Vn, 5), rf(Vd, 0);
}

public:
// Each INSN pins the legal narrow (Tb) arrangements: the plain forms take
// the low half of Vm (8B/4H/2S), the "2" forms the high half (16B/8H/4S).
#define INSN(NAME, assertion, is_unsigned)                                              \
  void NAME(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement Ta, FloatRegister Vm,  \
            SIMD_Arrangement Tb) {                                                      \
    assert((assertion), "invalid arrangement");                                         \
    _xaddwv(is_unsigned, Vd, Vn, Ta, Vm, Tb);                                           \
  }

INSN(uaddwv,  Tb == T8B  || Tb == T4H || Tb == T2S, /*is_unsigned*/true)
INSN(uaddwv2, Tb == T16B || Tb == T8H || Tb == T4S, /*is_unsigned*/true)
INSN(saddwv,  Tb == T8B  || Tb == T4H || Tb == T2S, /*is_unsigned*/false)
INSN(saddwv2, Tb == T16B || Tb == T8H || Tb == T4S, /*is_unsigned*/false)

#undef INSN


private:
void _pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
starti;
Expand Down
91 changes: 91 additions & 0 deletions src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "opto/subnode.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
Expand All @@ -46,6 +47,96 @@

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// jdk.internal.util.ArraysSupport.vectorizedHashCode
//
// Folds the cnt elements of type eltype starting at ary into result using
// the standard Java polynomial hash:
//   result = result * 31^cnt + ary[0] * 31^(cnt-1) + ... + ary[cnt-1]
// Small arrays are handled inline by an unrolled scalar loop; larger ones
// by a trampoline call to the per-type large_arrays_hashcode stub. Returns
// the current pc() on success, or nullptr if the trampoline could not be
// emitted (code cache full). Clobbers ary, cnt, rscratch1 and rscratch2.
address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
BasicType eltype) {
assert_different_registers(ary, cnt, result, rscratch1, rscratch2);

Register tmp1 = rscratch1, tmp2 = rscratch2;

// Note: the originally-declared STUB_SWITCH and STUB_SWITCH_OUT labels were
// never bound or branched to, so they have been removed.
Label TAIL, LOOP, BR_BASE, LARGE, DONE;

// Vectorization factor. Number of array elements loaded to one SIMD&FP registers by the stubs. We
// use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
// use 4H for chars and shorts instead, but using 8H gives better performance.
const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
: eltype == T_CHAR || eltype == T_SHORT ? 8
: eltype == T_INT ? 4
: 0;
guarantee(vf, "unsupported eltype");

// Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
const size_t unroll_factor = 4;

switch (eltype) {
case T_BOOLEAN:
BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
break;
case T_CHAR:
BLOCK_COMMENT("arrays_hashcode(char) {");
break;
case T_BYTE:
BLOCK_COMMENT("arrays_hashcode(byte) {");
break;
case T_SHORT:
BLOCK_COMMENT("arrays_hashcode(short) {");
break;
case T_INT:
BLOCK_COMMENT("arrays_hashcode(int) {");
break;
default:
ShouldNotReachHere();
}

// large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
// implemented by the stub executes just once. Call the stub only if at least two iterations will
// be executed.
const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
cmpw(cnt, large_threshold);
br(Assembler::HS, LARGE);

bind(TAIL);

// The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past
// uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs.
// Iteration eats up the remainder, uf elements at a time. Each load + madd
// pair is two instructions (8 bytes), hence the shift by 3.
assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
andr(tmp2, cnt, unroll_factor - 1);
adr(tmp1, BR_BASE);
sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
// tmp2 = 31, the hash multiplier used by the madds in the loop below.
movw(tmp2, 0x1f);
br(tmp1);

bind(LOOP);
for (size_t i = 0; i < unroll_factor; ++i) {
// result = result * 31 + <next element> (post-incrementing ary).
load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
maddw(result, result, tmp2, tmp1);
}
bind(BR_BASE);
subsw(cnt, cnt, unroll_factor);
br(Assembler::HS, LOOP);

b(DONE);

bind(LARGE);

RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
address tpc = trampoline_call(stub);
if (tpc == nullptr) {
// Could not emit the trampoline (code cache full): unbind the labels in
// debug builds and report the bailout to the caller.
DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
postcond(pc() == badAddress);
return nullptr;
}

bind(DONE);

BLOCK_COMMENT("} // arrays_hashcode");

postcond(pc() != badAddress);
return pc();
}

void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
Register tmp2Reg, Register tmp3Reg) {
Register oop = objectReg;
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
enum shift_kind kind = Assembler::LSL, unsigned shift = 0);

public:
// jdk.internal.util.ArraysSupport.vectorizedHashCode
address arrays_hashcode(Register ary, Register cnt, Register result, BasicType eltype);

// Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
void fast_lock(Register object, Register box, Register tmp, Register tmp2, Register tmp3);
void fast_unlock(Register object, Register box, Register tmp, Register tmp2);
Expand Down
Loading

1 comment on commit 475b894

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.