diff --git a/XenonAnalyse/main.cpp b/XenonAnalyse/main.cpp
index d08371ef..108ba725 100644
--- a/XenonAnalyse/main.cpp
+++ b/XenonAnalyse/main.cpp
@@ -105,7 +105,7 @@ void ScanTable(const uint32_t* code, size_t base, SwitchTable& table)
 {
     ppc_insn insn;
     uint32_t cr{ (uint32_t)-1 };
-    for (int i = 0; i < 32; i++)
+    for (int i = 0; i < 64; i++)
     {
         ppc::Disassemble(&code[-i], base - (4 * i), insn);
         if (insn.opcode == nullptr)
@@ -113,7 +113,11 @@ void ScanTable(const uint32_t* code, size_t base, SwitchTable& table)
             continue;
         }
 
-        if (cr == -1 && (insn.opcode->id == PPC_INST_BGT || insn.opcode->id == PPC_INST_BGTLR || insn.opcode->id == PPC_INST_BLE || insn.opcode->id == PPC_INST_BLELR))
+        // Handle conditional branches
+        if (cr == -1 && (insn.opcode->id == PPC_INST_BGT ||
+            insn.opcode->id == PPC_INST_BGTLR ||
+            insn.opcode->id == PPC_INST_BLE ||
+            insn.opcode->id == PPC_INST_BLELR))
         {
             cr = insn.operands[0];
             if (insn.opcode->operands[1] != 0)
@@ -121,16 +125,27 @@ void ScanTable(const uint32_t* code, size_t base, SwitchTable& table)
                 table.defaultLabel = insn.operands[1];
             }
         }
-        else if (cr != -1)
+        // Handle CMPLWI even if branch not found yet
+        else if (cr == -1 && insn.opcode->id == PPC_INST_CMPLWI)
         {
-            if (insn.opcode->id == PPC_INST_CMPLWI && insn.operands[0] == cr)
+            // Only process if we haven't found labels yet
+            if (table.labels.empty())
             {
                 table.r = insn.operands[1];
                 table.labels.resize(insn.operands[2] + 1);
                 table.base = base;
-                break;
             }
         }
+        // Handle CMPLWI after branch detection
+        else if (cr != -1 &&
+            insn.opcode->id == PPC_INST_CMPLWI &&
+            insn.operands[0] == cr)
+        {
+            table.r = insn.operands[1];
+            table.labels.resize(insn.operands[2] + 1);
+            table.base = base;
+            break;
+        }
     }
 }
 
diff --git a/XenonRecomp/recompiler.cpp b/XenonRecomp/recompiler.cpp
index f8608179..bc1923e9 100644
--- a/XenonRecomp/recompiler.cpp
+++ b/XenonRecomp/recompiler.cpp
@@ -531,6 +531,13 @@ bool Recompiler::Recompile(
         println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
         break;
 
+    case PPC_INST_ADDC:
+        println("\t{}.ca = ({}.u32 + {}.u32 < {}.u32);", xer(), r(insn.operands[1]), r(insn.operands[2]), r(insn.operands[1]));
+        println("\t{}.u64 = {}.u64 + {}.u64;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_ADDE:
         println("\t{}.u8 = ({}.u32 + {}.u32 < {}.u32) | ({}.u32 + {}.u32 + {}.ca < {}.ca);", temp(), r(insn.operands[1]), r(insn.operands[2]), r(insn.operands[1]), r(insn.operands[1]), r(insn.operands[2]), xer(), xer());
         println("\t{}.u64 = {}.u64 + {}.u64 + {}.ca;", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]), xer());
@@ -560,6 +567,16 @@ bool Recompiler::Recompile(
         println("{};", static_cast<int32_t>(insn.operands[2] << 16));
         break;
 
+    case PPC_INST_ADDME:
+        println("\t{}.u64 = {}.u64 + {}.ca - 1;", temp(), r(insn.operands[1]), xer());
+        println("\t{}.ca = ({}.u64 > {}.u64) || ({}.u64 == {}.u64 && {}.ca);", xer(),
+            r(insn.operands[1]), temp(), r(insn.operands[1]), temp(), xer());
+        println("\t{}.u64 = {}.u64;", r(insn.operands[0]), temp());
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare({}.s32, 0, {});",
+                cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_ADDZE:
         println("\t{}.s64 = {}.s64 + {}.ca;", temp(), r(insn.operands[1]), xer());
         println("\t{}.ca = {}.u32 < {}.u32;", xer(), temp(), r(insn.operands[1]));
@@ -668,6 +685,12 @@ bool Recompiler::Recompile(
         println("\tif ({}.u32 != 0 && !{}.eq) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), insn.operands[1]);
         break;
 
+    case PPC_INST_BDNZT:
+        // NOTE(crack): Same note as BDNZF but true instead of false
+        println("\t--{}.u64;", ctr());
+        println("\tif ({}.u32 != 0 && {}.eq) goto loc_{:X};", ctr(), cr(insn.operands[0] / 4), insn.operands[1]);
+        break;
+
     case PPC_INST_BEQ:
         printConditionalBranch(false, "eq");
         break;
@@ -808,6 +831,7 @@ bool Recompiler::Recompile(
         // no op
         break;
 
+    case PPC_INST_DCBST:
     case PPC_INST_DCBTST:
         // no op
         break;
@@ -852,6 +876,13 @@ bool Recompiler::Recompile(
         // no op
         break;
 
+    case PPC_INST_EQV:
+        // rA = ~(rS XOR rB)
+        println("\t{}.u64 = ~({}.u64 ^ {}.u64);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_EXTSB:
         println("\t{}.s64 = {}.s8;", r(insn.operands[0]), r(insn.operands[1]));
         if (strchr(insn.opcode->name, '.'))
@@ -995,6 +1026,12 @@ bool Recompiler::Recompile(
         println("\t{}.f64 = {}.f64 >= 0.0 ? {}.f64 : {}.f64;", f(insn.operands[0]), f(insn.operands[1]), f(insn.operands[2]), f(insn.operands[3]));
         break;
 
+    case PPC_INST_FRSQRTE:
+        // TODO(crack): I sure hope the c++ optimizer can optimize this. Fixme with some simd magic later
+        printSetFlushMode(false);
+        println("\t{}.f64 = double(1.0f / sqrtf(float({}.f64)));", f(insn.operands[0]), f(insn.operands[1]));
+        break;
+
     case PPC_INST_FSQRT:
         printSetFlushMode(false);
         println("\t{}.f64 = sqrt({}.f64);", f(insn.operands[0]), f(insn.operands[1]));
@@ -1133,6 +1170,8 @@ bool Recompiler::Recompile(
         println("\t{}.s64 = {};", r(insn.operands[0]), int32_t(insn.operands[1] << 16));
         break;
 
+    case PPC_INST_LVEBX:
+    case PPC_INST_LVEHX:
     case PPC_INST_LVEWX:
     case PPC_INST_LVEWX128:
     case PPC_INST_LVX:
@@ -1325,6 +1364,22 @@ bool Recompiler::Recompile(
         println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
         break;
 
+    case PPC_INST_MULHD:
+        println("\t{}.s64 = __mulh({}.s64, {}.s64);",
+            r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare({}.s32, 0, {});",
+                cr(0), r(insn.operands[0]), xer());
+        break;
+
+    case PPC_INST_MULHDU:
+        println("\t{}.u64 = __mulhu({}.u64, {}.u64);",
+            r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare({}.s32, 0, {});",
+                cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_NAND:
         println("\t{}.u64 = ~({}.u64 & {}.u64);", r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[2]));
         break;
@@ -1395,6 +1450,14 @@ bool Recompiler::Recompile(
         println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
         break;
 
+    case PPC_INST_RLWNM:
+        println("\t{}.u64 = __builtin_rotateleft64({}.u32 | ({}.u64 << 32), {}.u8 & 0x1F) & 0x{:X};",
+            r(insn.operands[0]), r(insn.operands[1]), r(insn.operands[1]),
+            r(insn.operands[2]), ComputeMask(insn.operands[3] + 32, insn.operands[4] + 32));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_ROTLDI:
         println("\t{}.u64 = __builtin_rotateleft64({}.u64, {});", r(insn.operands[0]), r(insn.operands[1]), insn.operands[2]);
         break;
@@ -1710,28 +1773,235 @@ bool Recompiler::Recompile(
         println("\t{}.s64 = {} - {}.s64;", r(insn.operands[0]), int32_t(insn.operands[2]), r(insn.operands[1]));
         break;
 
+    case PPC_INST_SUBFME:
+        println("\t{}.u64 = ~{}.u64 + {}.ca - 1;", temp(), r(insn.operands[1]), xer());
+        println("\t{}.ca = ({}.u64 < ~{}.u64) || ({}.u64 == ~{}.u64 && {}.ca);", xer(),
+            temp(), r(insn.operands[1]), temp(), r(insn.operands[1]), xer());
+        println("\t{}.u64 = {}.u64;", r(insn.operands[0]), temp());
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare({}.s32, 0, {});",
+                cr(0), r(insn.operands[0]), xer());
+        break;
+
+    case PPC_INST_SUBFZE:
+        println("\t{}.u64 = ~{}.u64 + {}.ca;", temp(), r(insn.operands[1]), xer());
+        println("\t{}.ca = {}.u64 < {}.ca;", xer(), temp(), xer());
+        println("\t{}.u64 = {}.u64;", r(insn.operands[0]), temp());
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.compare({}.s32, 0, {});", cr(0), r(insn.operands[0]), xer());
+        break;
+
     case PPC_INST_SYNC:
         // no op
         break;
 
+    case PPC_INST_TDEQ:
+        println("\tif ({}.u64 == {}.u64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDEQI:
+        println("\tif ({}.u64 == {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TDGE:
+        println("\tif ({}.s64 >= {}.s64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDGEI:
+        println("\tif ({}.s64 >= {}) __builtin_debugtrap();", r(insn.operands[0]), int32_t(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDGT:
+        println("\tif ({}.s64 > {}.s64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDGTI:
+        println("\tif ({}.s64 > {}) __builtin_debugtrap();", r(insn.operands[0]), int32_t(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDLE:
+        println("\tif ({}.s64 <= {}.s64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDLEI:
+        println("\tif ({}.s64 <= {}) __builtin_debugtrap();", r(insn.operands[0]), int32_t(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDLGE:
+        println("\tif ({}.u64 >= {}.u64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
     case PPC_INST_TDLGEI:
-        // no op
+        println("\tif ({}.u64 >= {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TDLGT:
+        println("\tif ({}.u64 > {}.u64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDLGTI:
+        println("\tif ({}.u64 > {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TDLLE:
+        println("\tif ({}.u64 <= {}.u64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
         break;
 
     case PPC_INST_TDLLEI:
-        // no op
+        println("\tif ({}.u64 <= {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TDLLT:
+        println("\tif ({}.u64 < {}.u64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDLLTI:
+        println("\tif ({}.u64 < {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TDLT:
+        println("\tif ({}.s64 < {}.s64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDLTI:
+        println("\tif ({}.s64 < {}) __builtin_debugtrap();", r(insn.operands[0]), int32_t(insn.operands[1]));
+        break;
+
+    case PPC_INST_TDNE:
+        println("\tif ({}.u64 != {}.u64) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
         break;
 
+    case PPC_INST_TDNEI:
+        println("\tif ({}.u64 != {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
     case PPC_INST_TWI:
-        // no op
+    {
+        // TO field specifies trap conditions:
+        // Bit 0 (16): Less than (signed)
+        // Bit 1 (8):  Greater than (signed)
+        // Bit 2 (4):  Equal
+        // Bit 3 (2):  Less than (unsigned)
+        // Bit 4 (1):  Greater than (unsigned)
+
+        bool first = true;
+        print("\tif (");
+
+        if (insn.operands[0] & 16) {
+            print("{}.s32 < {}", r(insn.operands[1]), int32_t(insn.operands[2]));
+            first = false;
+        }
+
+        if (insn.operands[0] & 8) {
+            if (!first) print(" || ");
+            print("{}.s32 > {}", r(insn.operands[1]), int32_t(insn.operands[2]));
+            first = false;
+        }
+
+        if (insn.operands[0] & 4) {
+            if (!first) print(" || ");
+            print("{}.u32 == {}", r(insn.operands[1]), insn.operands[2]);
+            first = false;
+        }
+
+        if (insn.operands[0] & 2) {
+            if (!first) print(" || ");
+            print("{}.u32 < {}", r(insn.operands[1]), insn.operands[2]);
+            first = false;
+        }
+
+        if (insn.operands[0] & 1) {
+            if (!first) print(" || ");
+            print("{}.u32 > {}", r(insn.operands[1]), insn.operands[2]);
+            first = false;
+        }
+
+        if (first) {
+            // TO = 0 means never trap
+            println("false) __builtin_debugtrap();");
+        } else {
+            println(") __builtin_debugtrap();");
+        }
+    }
+    break;
+
+    case PPC_INST_TWEQ:
+        println("\tif ({}.u32 == {}.u32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWEQI:
+        println("\tif ({}.u32 == {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TWGE:
+        println("\tif ({}.s32 >= {}.s32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWGEI:
+        println("\tif ({}.s32 >= {}) __builtin_debugtrap();", r(insn.operands[0]), int32_t(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWGT:
+        println("\tif ({}.s32 > {}.s32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWGTI:
+        println("\tif ({}.s32 > {}) __builtin_debugtrap();", r(insn.operands[0]), int32_t(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWLE:
+        println("\tif ({}.s32 <= {}.s32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWLEI:
+        println("\tif ({}.s32 <= {}) __builtin_debugtrap();", r(insn.operands[0]), int32_t(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWLGE:
+        println("\tif ({}.u32 >= {}.u32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
         break;
 
     case PPC_INST_TWLGEI:
-        // no op
+        println("\tif ({}.u32 >= {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TWLGT:
+        println("\tif ({}.u32 > {}.u32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWLGTI:
+        println("\tif ({}.u32 > {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TWLLE:
+        println("\tif ({}.u32 <= {}.u32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
         break;
 
     case PPC_INST_TWLLEI:
-        // no op
+        println("\tif ({}.u32 <= {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TWLLT:
+        println("\tif ({}.u32 < {}.u32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWLLTI:
+        println("\tif ({}.u32 < {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
+        break;
+
+    case PPC_INST_TWLT:
+        println("\tif ({}.s32 < {}.s32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWLTI:
+        println("\tif ({}.s32 < {}) __builtin_debugtrap();", r(insn.operands[0]), int32_t(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWNE:
+        println("\tif ({}.u32 != {}.u32) __builtin_debugtrap();", r(insn.operands[0]), r(insn.operands[1]));
+        break;
+
+    case PPC_INST_TWNEI:
+        println("\tif ({}.u32 != {}) __builtin_debugtrap();", r(insn.operands[0]), insn.operands[1]);
         break;
 
     case PPC_INST_VADDFP:
@@ -1744,6 +2014,16 @@ bool Recompiler::Recompile(
         println("\t_mm_store_si128((__m128i*){}.s16, _mm_adds_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VADDSWS:
+        // TODO(crack): vectorize - SSE doesn't have _mm_adds_epi32
+        for (size_t i = 0; i < 4; i++)
+        {
+            println("\t{}.s64 = int64_t({}.s32[{}]) + int64_t({}.s32[{}]);", temp(), v(insn.operands[1]), i, v(insn.operands[2]), i);
+            println("\t{}.s32[{}] = {}.s64 > INT_MAX ? INT_MAX : {}.s64 < INT_MIN ? INT_MIN : {}.s64;",
+                v(insn.operands[0]), i, temp(), temp(), temp());
+        }
+        break;
+
     case PPC_INST_VADDUBM:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_add_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
@@ -1769,6 +2049,7 @@ bool Recompiler::Recompile(
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VANDC:
     case PPC_INST_VANDC128:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
         break;
@@ -1785,6 +2066,11 @@ bool Recompiler::Recompile(
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_avg_epu8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VAVGUH:
+        println("\t_mm_store_si128((__m128i*){}.u16, _mm_avg_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));",
+            v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VCTSXS:
     case PPC_INST_VCFPSXWS128:
         printSetFlushMode(true);
@@ -1795,6 +2081,16 @@ bool Recompiler::Recompile(
             println("_mm_load_ps({}.f32)));", v(insn.operands[1]));
         break;
 
+    case PPC_INST_VCTUXS:
+    case PPC_INST_VCFPUXWS128:
+        printSetFlushMode(true);
+        print("\t_mm_store_si128((__m128i*){}.u32, _mm_vctuxs(", v(insn.operands[0]));
+        if (insn.operands[2] != 0)
+            println("_mm_mul_ps(_mm_load_ps({}.f32), _mm_set1_ps({}))));", v(insn.operands[1]), 1u << insn.operands[2]);
+        else
+            println("_mm_load_ps({}.f32)));", v(insn.operands[1]));
+        break;
+
     case PPC_INST_VCFSX:
     case PPC_INST_VCSXWFP128:
     {
@@ -1831,7 +2127,11 @@ bool Recompiler::Recompile(
 
     case PPC_INST_VCMPBFP:
     case PPC_INST_VCMPBFP128:
-        println("\t__builtin_debugtrap();");
+        printSetFlushMode(true);
+        println("\t_mm_store_ps({}.f32, _mm_vcmpbfp(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));",
+            v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        if (strchr(insn.opcode->name, '.'))
+            println("\t{}.setFromMask(_mm_load_ps({}.f32), 0xF);", cr(6), v(insn.operands[0]));
         break;
 
     case PPC_INST_VCMPEQFP:
@@ -1908,16 +2208,36 @@ bool Recompiler::Recompile(
         println("\t_mm_store_ps({}.f32, _mm_max_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VMAXSH:
+        println("\t_mm_store_si128((__m128i*){}.s16, _mm_max_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));",
+            v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VMAXSW:
         println("\t_mm_store_si128((__m128i*){}.u32, _mm_max_epi32(_mm_load_si128((__m128i*){}.u32), _mm_load_si128((__m128i*){}.u32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VMAXUH:
+        println("\t_mm_store_si128((__m128i*){}.u16, _mm_max_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));",
+            v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VMINFP:
     case PPC_INST_VMINFP128:
         printSetFlushMode(true);
         println("\t_mm_store_ps({}.f32, _mm_min_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VMINSH:
+        println("\t_mm_store_si128((__m128i*){}.s16, _mm_min_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));",
+            v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
+    case PPC_INST_VMINUH:
+        println("\t_mm_store_si128((__m128i*){}.u16, _mm_min_epu16(_mm_load_si128((__m128i*){}.u16), _mm_load_si128((__m128i*){}.u16)));",
+            v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VMRGHB:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_unpackhi_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
         break;
@@ -1966,6 +2286,12 @@ bool Recompiler::Recompile(
         println("\t_mm_store_ps({}.f32, _mm_xor_ps(_mm_sub_ps(_mm_mul_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)), _mm_load_ps({}.f32)), _mm_castsi128_ps(_mm_set1_epi32(int(0x80000000)))));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), v(insn.operands[3]));
         break;
 
+    case PPC_INST_VNOR:
+    case PPC_INST_VNOR128:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_xor_si128(_mm_or_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_set1_epi32(-1)));",
+            v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VOR:
     case PPC_INST_VOR128:
         print("\t_mm_store_si128((__m128i*){}.u8, ", v(insn.operands[0]));
@@ -2042,11 +2368,80 @@ bool Recompiler::Recompile(
         }
         break;
 
+    case PPC_INST_VPKSHSS:
+    case PPC_INST_VPKSHSS128:
+        println("\t_mm_store_si128((__m128i*){}.s8, _mm_packs_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));",
+            v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
     case PPC_INST_VPKSHUS:
     case PPC_INST_VPKSHUS128:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));", v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
         break;
 
+    case PPC_INST_VPKSWSS:
+    case PPC_INST_VPKSWSS128:
+        println("\t_mm_store_si128((__m128i*){}.s16, _mm_packs_epi32(_mm_load_si128((__m128i*){}.s32), _mm_load_si128((__m128i*){}.s32)));",
+            v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
+    case PPC_INST_VPKSWUS:
+    case PPC_INST_VPKSWUS128:
+        println("\t_mm_store_si128((__m128i*){}.s32, _mm_load_si128((__m128i*){}.s32));", vTemp(), v(insn.operands[2]));
+        for (int i = 0; i < 4; i++) {
+            println("\t{}.u16[{}] = {}.s32[{}] < 0 ? 0 : ({}.s32[{}] > 0xFFFF ? 0xFFFF : {}.s32[{}]);",
+                v(insn.operands[0]), i, vTemp(), i, vTemp(), i, vTemp(), i);
+        }
+        println("\t_mm_store_si128((__m128i*){}.s32, _mm_load_si128((__m128i*){}.s32));", vTemp(), v(insn.operands[1]));
+        for (int i = 0; i < 4; i++) {
+            println("\t{}.u16[{}] = {}.s32[{}] < 0 ? 0 : ({}.s32[{}] > 0xFFFF ? 0xFFFF : {}.s32[{}]);",
+                v(insn.operands[0]), i + 4, vTemp(), i, vTemp(), i, vTemp(), i);
+        }
+        break;
+
+    case PPC_INST_VPKUHUM:
+        // Pack without saturation - mask each halfword to its low byte, then pack
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16("
+            "_mm_and_si128(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(0xFF)), "
+            "_mm_and_si128(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(0xFF))));",
+            v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
+    case PPC_INST_VPKUHUS:
+    case PPC_INST_VPKUHUS128:
+        // Pack unsigned halfwords to unsigned bytes with saturation (clamp to 0xFF first, since packus treats its input as signed)
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_packus_epi16(_mm_min_epu16(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(0xFF)), _mm_min_epu16(_mm_load_si128((__m128i*){}.u16), _mm_set1_epi16(0xFF))));",
+            v(insn.operands[0]), v(insn.operands[2]), v(insn.operands[1]));
+        break;
+
+    case PPC_INST_VPKUWUM:
+    case PPC_INST_VPKUWUM128:
+        println("\t_mm_store_si128((__m128i*){}.u32, _mm_load_si128((__m128i*){}.u32));", vTemp(), v(insn.operands[2]));
+        for (int i = 0; i < 4; i++) {
+            println("\t{}.u16[{}] = {}.u16[{}];",
+                v(insn.operands[0]), i, vTemp(), i*2);
+        }
+        println("\t_mm_store_si128((__m128i*){}.u32, _mm_load_si128((__m128i*){}.u32));", vTemp(), v(insn.operands[1]));
+        for (int i = 0; i < 4; i++) {
+            println("\t{}.u16[{}] = {}.u16[{}];",
+                v(insn.operands[0]), i + 4, vTemp(), i*2);
+        }
+        break;
+
+    case PPC_INST_VPKUWUS:
+    case PPC_INST_VPKUWUS128:
+        println("\t_mm_store_si128((__m128i*){}.u32, _mm_load_si128((__m128i*){}.u32));", vTemp(), v(insn.operands[2]));
+        for (int i = 0; i < 4; i++) {
+            println("\t{}.u16[{}] = {}.u32[{}] > 0xFFFF ? 0xFFFF : {}.u32[{}];",
+                v(insn.operands[0]), i, vTemp(), i, vTemp(), i);
+        }
+        println("\t_mm_store_si128((__m128i*){}.u32, _mm_load_si128((__m128i*){}.u32));", vTemp(), v(insn.operands[1]));
+        for (int i = 0; i < 4; i++) {
+            println("\t{}.u16[{}] = {}.u32[{}] > 0xFFFF ? 0xFFFF : {}.u32[{}];",
+                v(insn.operands[0]), i + 4, vTemp(), i, vTemp(), i);
+        }
+        break;
+
     case PPC_INST_VREFP:
     case PPC_INST_VREFP128:
         // TODO: see if we can use rcp safely
@@ -2088,15 +2483,51 @@ bool Recompiler::Recompile(
         break;
 
     case PPC_INST_VSEL:
+    case PPC_INST_VSEL128:
        println("\t_mm_store_si128((__m128i*){}.u8, _mm_or_si128(_mm_andnot_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)), _mm_and_si128(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8))));", v(insn.operands[0]), v(insn.operands[3]), v(insn.operands[1]), v(insn.operands[3]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VSL:
+        println("\t_mm_store_si128((__m128i*){}.u8, _mm_vsl(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8)));",
+            v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VSLB:
         // TODO: vectorize
         for (size_t i = 0; i < 16; i++)
             println("\t{}.u8[{}] = {}.u8[{}] << ({}.u8[{}] & 0x7);", v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i);
         break;
 
+    case PPC_INST_VSLH:
+        // Vector shift left halfword
+        for (size_t i = 0; i < 8; i++)
+            println("\t{}.u16[{}] = {}.u16[{}] << ({}.u16[{}] & 0xF);",
+                v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i);
+        break;
+
+    case PPC_INST_VSRAH:
+        // Vector shift right algebraic halfword
+        for (size_t i = 0; i < 8; i++)
+            println("\t{}.s16[{}] = {}.s16[{}] >> ({}.u16[{}] & 0xF);",
+                v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i);
+        break;
+
+    case PPC_INST_VSRH:
+        // Vector shift right halfword
+        for (size_t i = 0; i < 8; i++)
+            println("\t{}.u16[{}] = {}.u16[{}] >> ({}.u16[{}] & 0xF);",
+                v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i);
+        break;
+
+    case PPC_INST_VRLH:
+        // Vector rotate left halfword
+        for (size_t i = 0; i < 8; i++)
+            println("\t{}.u16[{}] = ({}.u16[{}] << ({}.u16[{}] & 0xF)) | "
+                "({}.u16[{}] >> (16 - ({}.u16[{}] & 0xF)));",
+                v(insn.operands[0]), i, v(insn.operands[1]), i, v(insn.operands[2]), i,
+                v(insn.operands[1]), i, v(insn.operands[2]), i);
+        break;
+
     case PPC_INST_VSLDOI:
     case PPC_INST_VSLDOI128:
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_alignr_epi8(_mm_load_si128((__m128i*){}.u8), _mm_load_si128((__m128i*){}.u8), {}));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]), 16 - insn.operands[3]);
@@ -2130,6 +2561,11 @@ bool Recompiler::Recompile(
         println("\t_mm_store_si128((__m128i*){}.u8, _mm_set1_epi8(char(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
         break;
 
+    case PPC_INST_VSPLTISH:
+        println("\t_mm_store_si128((__m128i*){}.s16, _mm_set1_epi16(short({})));",
+            v(insn.operands[0]), int16_t(insn.operands[1]));
+        break;
+
     case PPC_INST_VSPLTISW:
     case PPC_INST_VSPLTISW128:
         println("\t_mm_store_si128((__m128i*){}.u32, _mm_set1_epi32(int(0x{:X})));", v(insn.operands[0]), insn.operands[1]);
@@ -2169,6 +2605,11 @@ bool Recompiler::Recompile(
         println("\t_mm_store_ps({}.f32, _mm_sub_ps(_mm_load_ps({}.f32), _mm_load_ps({}.f32)));", v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
         break;
 
+    case PPC_INST_VSUBSHS:
+        println("\t_mm_store_si128((__m128i*){}.s16, _mm_subs_epi16(_mm_load_si128((__m128i*){}.s16), _mm_load_si128((__m128i*){}.s16)));",
+            v(insn.operands[0]), v(insn.operands[1]), v(insn.operands[2]));
+        break;
+
     case PPC_INST_VSUBSWS:
         // TODO: vectorize
         for (size_t i = 0; i < 4; i++)
diff --git a/XenonUtils/ppc_context.h b/XenonUtils/ppc_context.h
index c1091d17..7b30689a 100644
--- a/XenonUtils/ppc_context.h
+++ b/XenonUtils/ppc_context.h
@@ -651,4 +651,77 @@ inline __m128i _mm_vsr(__m128i a, __m128i b)
     return _mm_castps_si128(_mm_insert_ps(_mm_castsi128_ps(_mm_srl_epi64(a, b)), _mm_castsi128_ps(_mm_srl_epi64(_mm_srli_si128(a, 4), b)), 0x10));
 }
 
+inline __m128 _mm_vcmpbfp(__m128 a, __m128 b)
+{
+    __m128 xmm0 = _mm_and_ps(_mm_cmpgt_ps(a, b), _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
+    __m128 xmm1 = _mm_and_ps(_mm_cmplt_ps(a, _mm_sub_ps(_mm_setzero_ps(), b)), _mm_castsi128_ps(_mm_set1_epi32(0x40000000)));
+    return _mm_or_ps(xmm0, xmm1);
+}
+
+inline uint64_t __mulhu(uint64_t a, uint64_t b) {
+    // Get high/low 32-bit parts
+    uint32_t a_lo = (uint32_t)a;
+    uint32_t a_hi = (uint32_t)(a >> 32);
+    uint32_t b_lo = (uint32_t)b;
+    uint32_t b_hi = (uint32_t)(b >> 32);
+
+    // Compute partial products
+    uint64_t lo_lo = (uint64_t)a_lo * b_lo;
+    uint64_t hi_lo = (uint64_t)a_hi * b_lo;
+    uint64_t lo_hi = (uint64_t)a_lo * b_hi;
+    uint64_t hi_hi = (uint64_t)a_hi * b_hi;
+
+    // Compute high 64 bits of result
+    uint64_t cross = (lo_lo >> 32) + (uint32_t)hi_lo + (uint32_t)lo_hi;
+    return hi_hi + (hi_lo >> 32) + (lo_hi >> 32) + (cross >> 32);
+}
+
+inline __m128i _mm_vctuxs(__m128 src1)
+{
+    // Clamp negative to 0
+    __m128 clamped = _mm_max_ps(src1, _mm_setzero_ps());
+
+    // For values in [2^31, 2^32), subtract 2^31, convert, add 2^31 back
+    __m128i big_result = _mm_add_epi32(
+        _mm_cvttps_epi32(
+            _mm_sub_ps(clamped, _mm_castsi128_ps(_mm_set1_epi32(0x4F000000)))
+        ),
+        _mm_set1_epi32(0x80000000)
+    );
+
+    // Select based on range
+    __m128i result = _mm_blendv_epi8(
+        _mm_cvttps_epi32(clamped),
+        big_result,
+        _mm_castps_si128(
+            _mm_cmpge_ps(clamped, _mm_castsi128_ps(_mm_set1_epi32(0x4F000000)))
+        )
+    );
+
+    // Saturate overflow and NaN to UINT_MAX
+    __m128 saturate_mask = _mm_or_ps(
+        _mm_cmpge_ps(
+            clamped, _mm_castsi128_ps(_mm_set1_epi32(0x4F800000))
+        ),
+        _mm_cmpunord_ps(src1, src1)
+    );
+    return _mm_blendv_epi8(result, _mm_set1_epi32(-1), _mm_castps_si128(saturate_mask));
+}
+
+inline __m128i _mm_vsl(__m128i a, __m128i b)
+{
+    // Extract shift count from last byte of b (accounting for endianness)
+    uint32_t shift = _mm_extract_epi8(b, 15) & 0x7;
+
+    if (shift == 0) return a;
+
+    // Shift left by bits
+    __m128i shifted = _mm_or_si128(
+        _mm_slli_epi64(a, shift),
+        _mm_srli_epi64(_mm_slli_si128(a, 8), 64 - shift)
+    );
+
+    return shifted;
+}
+
 #endif
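Note: PPC_INST_MULHD above emits a call to __mulh, which is an MSVC-specific intrinsic, while only the unsigned helper __mulhu is added to ppc_context.h. If a non-MSVC toolchain were targeted, the signed high doubleword could be derived from __mulhu with the usual sign-correction identity. The sketch below only illustrates that identity under those assumptions; the name __mulh_portable is hypothetical and is not part of this patch.

inline int64_t __mulh_portable(int64_t a, int64_t b)
{
    // mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0), using wrap-around arithmetic.
    uint64_t high = __mulhu((uint64_t)a, (uint64_t)b);
    if (a < 0) high -= (uint64_t)b;
    if (b < 0) high -= (uint64_t)a;
    return (int64_t)high;
}

For example, __mulh_portable(-1, 1) yields -1 (all bits set), which matches the high 64 bits mulhd would produce for that product.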