early-access version 1995

2021-08-16 13:42:12 +02:00
parent 1c11ae4a45
commit 66b92b0ba8
95 changed files with 18941 additions and 675 deletions
@@ -7,7 +7,9 @@ include(DownloadExternals)
 # xbyak
 if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
    add_library(xbyak INTERFACE)
-    target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
+    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/xbyak/include)  # staging directory in the build tree
+    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/xbyak/xbyak DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/xbyak/include)  # copies the xbyak/ directory itself (at configure time), yielding include/xbyak/...
+    target_include_directories(xbyak SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/xbyak/include)  # consumers include headers with the xbyak/ prefix, e.g. <xbyak/xbyak.h>
    target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
 endif()

@@ -19,6 +21,7 @@ target_include_directories(catch-single-include INTERFACE catch/single_include)
 if (ARCHITECTURE_x86_64)
    set(DYNARMIC_TESTS OFF)
    set(DYNARMIC_NO_BUNDLED_FMT ON)
+    set(DYNARMIC_IGNORE_ASSERTS ON CACHE BOOL "" FORCE)
    add_subdirectory(dynarmic)
 endif()

@@ -3,4 +3,4 @@ build/
 build-*/
 docs/Doxygen/
 # Generated files
-src/backend/x64/mig/
+src/dynarmic/backend/x64/mig/
@@ -12,6 +12,7 @@ endif()
 option(DYNARMIC_ENABLE_CPU_FEATURE_DETECTION "Turning this off causes dynarmic to assume the host CPU doesn't support anything later than SSE3" ON)
 option(DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT "Enables support for systems that require W^X" OFF)
 option(DYNARMIC_FATAL_ERRORS "Errors are fatal" OFF)
+option(DYNARMIC_IGNORE_ASSERTS "Ignore asserts" OFF)
 option(DYNARMIC_TESTS "Build tests" ${MASTER_PROJECT})
 option(DYNARMIC_TESTS_USE_UNICORN "Enable fuzzing tests against unicorn" OFF)
 option(DYNARMIC_USE_LLVM "Support disassembly of jitted x86_64 code using LLVM" OFF)
@@ -27,7 +28,7 @@ if (NOT CMAKE_BUILD_TYPE)
 endif()

 # Set hard requirements for C++
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)

@@ -5,14 +5,14 @@ support for other versions of the ARM architecture, having a interpreter mode, a
 for other architectures.

 Users of this library interact with it primarily through the interface provided in
-[`include/dynarmic`](../include/dynarmic). Users specify how dynarmic's CPU core interacts with
+[`src/dynarmic/interface`](../src/dynarmic/interface). Users specify how dynarmic's CPU core interacts with
 the rest of their system providing an implementation of the relevant `UserCallbacks` interface.
 Users set up the CPU state using member functions of `Jit`, then call `Jit::Execute` to start CPU
 execution. The callbacks defined on `UserCallbacks` may be called from dynamically generated code,
 so users of the library should not depend on the stack being in a walkable state for unwinding.

-* A32: [`Jit`](../include/dynarmic/A32/a32.h), [`UserCallbacks`](../include/dynarmic/A32/config.h)
-* A64: [`Jit`](../include/dynarmic/A64/a64.h), [`UserCallbacks`](../include/dynarmic/A64/config.h)
+* A32: [`Jit`](../src/dynarmic/interface/A32/a32.h), [`UserCallbacks`](../src/dynarmic/interface/A32/config.h)
+* A64: [`Jit`](../src/dynarmic/interface/A64/a64.h), [`UserCallbacks`](../src/dynarmic/interface/A64/config.h)

 Dynarmic reads instructions from memory by calling `UserCallbacks::MemoryReadCode`. These
 instructions then pass through several stages:
@@ -26,19 +26,19 @@ instructions then pass through several stages:
 Using the A32 frontend with the x64 backend as an example:

 * Decoding is done by [double dispatch](https://en.wikipedia.org/wiki/Visitor_pattern) in
-  [`src/frontend/A32/decoder/{arm.h,thumb16.h,thumb32.h}`](../src/frontend/A32/decoder/).
-* Translation is done by the visitors in `src/frontend/A32/translate/translate_{arm,thumb}.cpp`.
-  The function [`Translate`](../src/frontend/A32/translate/translate.h) takes a starting memory location,
+  [`src/dynarmic/frontend/A32/decoder/{arm.h,thumb16.h,thumb32.h}`](../src/dynarmic/frontend/A32/decoder/).
+* Translation is done by the visitors in [`src/dynarmic/frontend/A32/translate/translate_{arm,thumb}.cpp`](../src/dynarmic/frontend/A32/translate/).
+  The function [`Translate`](../src/dynarmic/frontend/A32/translate/translate.h) takes a starting memory location,
  some CPU state, and memory reader callback and returns a basic block of IR.
-* The IR can be found under [`src/frontend/ir/`](../src/frontend/ir/).
-* Optimizations can be found under [`src/ir_opt/`](../src/ir_opt/).
-* Emission is done by `EmitX64` which can be found in `src/backend_x64/emit_x64.{h,cpp}`.
-* Execution is performed by calling `BlockOfCode::RunCode` in `src/backend_x64/block_of_code.{h,cpp}`.
+* The IR can be found under [`src/dynarmic/ir/`](../src/dynarmic/ir/).
+* Optimizations can be found under [`src/dynarmic/ir/opt/`](../src/dynarmic/ir/opt/).
+* Emission is done by `EmitX64` which can be found in [`src/dynarmic/backend/x64/emit_x64.{h,cpp}`](../src/dynarmic/backend/x64/).
+* Execution is performed by calling `BlockOfCode::RunCode` in [`src/dynarmic/backend/x64/block_of_code.{h,cpp}`](../src/dynarmic/backend/x64/).

 ## Decoder

 The decoder is a double dispatch decoder. Each instruction is represented by a line in the relevant
-instruction table. Here is an example line from [`arm.h`](../src/frontend/A32/decoder/arm.h):
+instruction table. Here is an example line from [`arm.h`](../src/dynarmic/frontend/A32/decoder/arm.h):

    INST(&V::arm_ADC_imm,     "ADC (imm)",           "cccc0010101Snnnnddddrrrrvvvvvvvv")

@@ -61,7 +61,7 @@ error results.
 ## Translator

 The translator is a visitor that uses the decoder to decode instructions. The translator generates IR code with the
-help of the [`IREmitter` class](../src/frontend/ir/ir_emitter.h). An example of a translation function follows:
+help of the [`IREmitter` class](../src/dynarmic/ir/ir_emitter.h). An example of a translation function follows:

    bool ArmTranslatorVisitor::arm_ADC_imm(Cond cond, bool S, Reg n, Reg d, int rotate, Imm8 imm8) {
        u32 imm32 = ArmExpandImm(rotate, imm8);
@@ -107,7 +107,7 @@ function analyser in the medium-term future.
 Dynarmic's intermediate representation is typed. Each microinstruction may take zero or more arguments and may
 return zero or more arguments. A subset of the microinstructions available is documented below.

-A complete list of microinstructions can be found in [src/frontend/ir/opcodes.inc](../src/frontend/ir/opcodes.inc).
+A complete list of microinstructions can be found in [src/dynarmic/ir/opcodes.inc](../src/dynarmic/ir/opcodes.inc).

 The below lists some commonly used microinstructions.

@@ -5,7 +5,7 @@
 # catch

 add_library(catch INTERFACE)
-target_include_directories(catch INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/catch>)
+target_include_directories(catch INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/catch/include>)

 # fmt

@@ -36,7 +36,9 @@ endif()
 if (NOT TARGET xbyak)  # skip when a parent project has already defined the xbyak target
    if (ARCHITECTURE STREQUAL "x86" OR ARCHITECTURE STREQUAL "x86_64")
        add_library(xbyak INTERFACE)
-        target_include_directories(xbyak SYSTEM INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/xbyak/xbyak)
+        file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/xbyak/include)  # staging directory in the build tree
+        file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/xbyak/xbyak DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/xbyak/include)  # copies the xbyak/ directory itself (at configure time), yielding include/xbyak/...
+        target_include_directories(xbyak SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/xbyak/include)  # consumers include headers with the xbyak/ prefix, e.g. <xbyak/xbyak.h>
        target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
    endif()
 endif()
@@ -103,6 +103,7 @@ Type traits not in the standard library.
 * `mp::parameter_list`: Get a typelist of the parameter types
 * `mp::get_parameter`: Get the type of a parameter by index
 * `mp::equivalent_function_type`: Get an equivalent function type (for MFPs this does not include the class)
+* `mp::equivalent_function_type_with_class`: Get an equivalent function type with explicit `this` argument (MFPs only)
 * `mp::return_type`: Return type of the function
 * `mp::class_type`: Only valid for member function pointer types. Gets the class the member function is associated with.

@@ -36,11 +36,15 @@ struct function_info<R(*)(As...)> : function_info<R(As...)> {};
 template<class C, class R, class... As>
 struct function_info<R(C::*)(As...)> : function_info<R(As...)> {
    using class_type = C;
+
+    using equivalent_function_type_with_class = R(C*, As...);
 };

 template<class C, class R, class... As>
 struct function_info<R(C::*)(As...) const> : function_info<R(As...)> {
    using class_type = C;
+
+    using equivalent_function_type_with_class = R(C*, As...);
 };

 template<class F>
@@ -55,6 +59,9 @@ using get_parameter = typename function_info<F>::template parameter<I>::type;
 template<class F>
 using equivalent_function_type = typename function_info<F>::equivalent_function_type;

+template<class F>
+using equivalent_function_type_with_class = typename function_info<F>::equivalent_function_type_with_class;
+
 template<class F>
 using return_type = typename function_info<F>::return_type;

@@ -381,6 +381,9 @@ endif()
 if (DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT)
    target_compile_definitions(dynarmic PRIVATE DYNARMIC_ENABLE_NO_EXECUTE_SUPPORT=1)
 endif()
+if (DYNARMIC_IGNORE_ASSERTS)
+    target_compile_definitions(dynarmic PRIVATE DYNARMIC_IGNORE_ASSERTS=1)
+endif()
 if (CMAKE_SYSTEM_NAME STREQUAL "Windows")
    target_compile_definitions(dynarmic PRIVATE FMT_USE_WINDOWS_H=0)
 endif()
@@ -629,18 +629,10 @@ static void EmitSetFlag(BlockOfCode& code, A32EmitContext& ctx, IR::Inst* inst,
    }
 }

-void A32EmitX64::EmitA32GetNFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    EmitGetFlag(code, ctx, inst, NZCV::x64_n_flag_bit);
-}
-
 void A32EmitX64::EmitA32SetNFlag(A32EmitContext& ctx, IR::Inst* inst) {
    EmitSetFlag(code, ctx, inst, NZCV::x64_n_flag_bit);
 }

-void A32EmitX64::EmitA32GetZFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    EmitGetFlag(code, ctx, inst, NZCV::x64_z_flag_bit);
-}
-
 void A32EmitX64::EmitA32SetZFlag(A32EmitContext& ctx, IR::Inst* inst) {
    EmitSetFlag(code, ctx, inst, NZCV::x64_z_flag_bit);
 }
@@ -653,10 +645,6 @@ void A32EmitX64::EmitA32SetCFlag(A32EmitContext& ctx, IR::Inst* inst) {
    EmitSetFlag(code, ctx, inst, NZCV::x64_c_flag_bit);
 }

-void A32EmitX64::EmitA32GetVFlag(A32EmitContext& ctx, IR::Inst* inst) {
-    EmitGetFlag(code, ctx, inst, NZCV::x64_v_flag_bit);
-}
-
 void A32EmitX64::EmitA32SetVFlag(A32EmitContext& ctx, IR::Inst* inst) {
    EmitSetFlag(code, ctx, inst, NZCV::x64_v_flag_bit);
 }
@@ -18,8 +18,8 @@
 #include "dynarmic/common/assert.h"
 #include "dynarmic/common/cast_util.h"
 #include "dynarmic/common/common_types.h"
-#include "dynarmic/common/llvm_disassemble.h"
 #include "dynarmic/common/scope_exit.h"
+#include "dynarmic/common/x64_disassemble.h"
 #include "dynarmic/frontend/A32/translate/translate.h"
 #include "dynarmic/interface/A32/a32.h"
 #include "dynarmic/interface/A32/context.h"
@@ -91,13 +91,6 @@ struct Jit::Impl {
        jit_state.exclusive_state = 0;
    }

-    std::string Disassemble(const IR::LocationDescriptor& descriptor) {
-        auto block = GetBasicBlock(descriptor);
-        std::string result = fmt::format("address: {}\nsize: {} bytes\n", block.entrypoint, block.size);
-        result += Common::DisassembleX64(block.entrypoint, reinterpret_cast<const char*>(block.entrypoint) + block.size);
-        return result;
-    }
-
    void PerformCacheInvalidation() {
        if (invalidate_entire_cache) {
            jit_state.ResetRSB();
@@ -324,8 +317,9 @@ void Jit::LoadContext(const Context& ctx) {
    impl->jit_state.TransferJitState(ctx.impl->jit_state, reset_rsb);
 }

-std::string Jit::Disassemble() const {
-    return Common::DisassembleX64(impl->block_of_code.GetCodeBegin(), impl->block_of_code.getCurr());
+void Jit::DumpDisassembly() const {
+    const size_t size = (const char*)impl->block_of_code.getCurr() - (const char*)impl->block_of_code.GetCodeBegin();
+    Common::DumpDisassembledX64(impl->block_of_code.GetCodeBegin(), size);
 }

 }  // namespace Dynarmic::A32
@@ -7,8 +7,6 @@

 #include <array>

-#include <xbyak.h>
-
 #include "dynarmic/common/common_types.h"

 namespace Dynarmic::Backend::X64 {
@@ -14,8 +14,8 @@
 #include "dynarmic/backend/x64/devirtualize.h"
 #include "dynarmic/backend/x64/jitstate_info.h"
 #include "dynarmic/common/assert.h"
-#include "dynarmic/common/llvm_disassemble.h"
 #include "dynarmic/common/scope_exit.h"
+#include "dynarmic/common/x64_disassemble.h"
 #include "dynarmic/frontend/A64/translate/translate.h"
 #include "dynarmic/interface/A64/a64.h"
 #include "dynarmic/ir/basic_block.h"
@@ -199,8 +199,9 @@ public:
        return is_executing;
    }

-    std::string Disassemble() const {
-        return Common::DisassembleX64(block_of_code.GetCodeBegin(), block_of_code.getCurr());
+    void DumpDisassembly() const {
+        const size_t size = (const char*)block_of_code.getCurr() - (const char*)block_of_code.GetCodeBegin();
+        Common::DumpDisassembledX64(block_of_code.GetCodeBegin(), size);
    }

 private:
@@ -397,8 +398,8 @@ bool Jit::IsExecuting() const {
    return impl->IsExecuting();
 }

-std::string Jit::Disassemble() const {
-    return impl->Disassemble();
+void Jit::DumpDisassembly() const {
+    return impl->DumpDisassembly();
 }

 }  // namespace Dynarmic::A64
@@ -7,8 +7,6 @@

 #include <array>

-#include <xbyak.h>
-
 #include "dynarmic/backend/x64/nzcv_util.h"
 #include "dynarmic/common/common_types.h"
 #include "dynarmic/frontend/A64/location_descriptor.h"
@@ -8,7 +8,7 @@
 #include <algorithm>
 #include <vector>

-#include <xbyak.h>
+#include <xbyak/xbyak.h>

 #include "dynarmic/backend/x64/block_of_code.h"
 #include "dynarmic/common/common_types.h"
@@ -15,7 +15,7 @@
 #include <array>
 #include <cstring>

-#include <xbyak.h>
+#include <xbyak/xbyak.h>

 #include "dynarmic/backend/x64/a32_jitstate.h"
 #include "dynarmic/backend/x64/abi.h"
@@ -258,8 +258,6 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
    SwitchMxcsrOnEntry();
    jmp(ABI_PARAM2);

-    align();
-
    // Dispatcher loop

    Xbyak::Label return_to_caller, return_to_caller_mxcsr_already_exited;
@@ -10,8 +10,8 @@
 #include <memory>
 #include <type_traits>

-#include <xbyak.h>
-#include <xbyak_util.h>
+#include <xbyak/xbyak.h>
+#include <xbyak/xbyak_util.h>

 #include "dynarmic/backend/x64/abi.h"
 #include "dynarmic/backend/x64/callback.h"
@@ -8,7 +8,7 @@
 #include <functional>
 #include <vector>

-#include <xbyak.h>
+#include <xbyak/xbyak.h>

 #include "dynarmic/common/common_types.h"

@@ -8,7 +8,7 @@
 #include <map>
 #include <tuple>

-#include <xbyak.h>
+#include <xbyak/xbyak.h>

 #include "dynarmic/common/common_types.h"

@@ -13,7 +13,7 @@

 #include <tsl/robin_map.h>
 #include <tsl/robin_set.h>
-#include <xbyak_util.h>
+#include <xbyak/xbyak_util.h>

 #include "dynarmic/backend/x64/exception_handler.h"
 #include "dynarmic/backend/x64/reg_alloc.h"
@@ -1283,6 +1283,72 @@ void EmitX64::EmitAnd64(EmitContext& ctx, IR::Inst* inst) {
    ctx.reg_alloc.DefineValue(inst, result);
 }

+void EmitX64::EmitAndNot32(EmitContext& ctx, IR::Inst* inst) {  // Emits 32-bit AND-NOT: result = args[0] & ~args[1]
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (!args[0].IsImmediate() && !args[1].IsImmediate() && code.HasHostFeature(HostFeature::BMI1)) {  // fast path: both operands non-immediate and host has BMI1
+        Xbyak::Reg32 op_a = ctx.reg_alloc.UseGpr(args[0]).cvt32();
+        Xbyak::Reg32 op_b = ctx.reg_alloc.UseGpr(args[1]).cvt32();
+        Xbyak::Reg32 result = ctx.reg_alloc.ScratchGpr().cvt32();
+        code.andn(result, op_b, op_a);  // x86 ANDN: result = ~op_b & op_a; inputs are left unmodified
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
+    Xbyak::Reg32 result;  // fallback: materialise ~args[1] first, then AND args[0] into it
+    if (args[1].IsImmediate()) {
+        result = ctx.reg_alloc.ScratchGpr().cvt32();
+        code.mov(result, u32(~args[1].GetImmediateU32()));  // inversion of the immediate is folded at emit time
+    } else {
+        result = ctx.reg_alloc.UseScratchGpr(args[1]).cvt32();
+        code.not_(result);  // result = ~args[1]
+    }
+
+    if (args[0].IsImmediate()) {
+        const u32 op_arg = args[0].GetImmediateU32();
+        code.and_(result, op_arg);
+    } else {
+        OpArg op_arg = ctx.reg_alloc.UseOpArg(args[0]);
+        op_arg.setBit(32);  // presumably sets the operand width to 32 bits to match result -- confirm against OpArg
+        code.and_(result, *op_arg);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
+void EmitX64::EmitAndNot64(EmitContext& ctx, IR::Inst* inst) {  // Emits 64-bit AND-NOT: result = args[0] & ~args[1]
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    if (!args[0].IsImmediate() && !args[1].IsImmediate() && code.HasHostFeature(HostFeature::BMI1)) {  // fast path: both operands non-immediate and host has BMI1
+        Xbyak::Reg64 op_a = ctx.reg_alloc.UseGpr(args[0]);
+        Xbyak::Reg64 op_b = ctx.reg_alloc.UseGpr(args[1]);
+        Xbyak::Reg64 result = ctx.reg_alloc.ScratchGpr();
+        code.andn(result, op_b, op_a);  // x86 ANDN: result = ~op_b & op_a; inputs are left unmodified
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
+    Xbyak::Reg64 result;  // fallback: materialise ~args[1] first, then AND args[0] into it
+    if (args[1].IsImmediate()) {
+        result = ctx.reg_alloc.ScratchGpr();
+        code.mov(result, ~args[1].GetImmediateU64());  // inversion of the immediate is folded at emit time
+    } else {
+        result = ctx.reg_alloc.UseScratchGpr(args[1]);
+        code.not_(result);  // result = ~args[1]
+    }
+
+    if (args[0].FitsInImmediateS32()) {  // immediates that do not pass this check take the UseOpArg path below
+        const u32 op_arg = u32(args[0].GetImmediateS32());
+        code.and_(result, op_arg);  // 64-bit AND with imm32 sign-extends the immediate, restoring the S32 value
+    } else {
+        OpArg op_arg = ctx.reg_alloc.UseOpArg(args[0]);
+        op_arg.setBit(64);  // presumably sets the operand width to 64 bits to match result -- confirm against OpArg
+        code.and_(result, *op_arg);
+    }
+
+    ctx.reg_alloc.DefineValue(inst, result);
+}
+
 void EmitX64::EmitEor32(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

@@ -766,12 +766,16 @@ static void EmitFPRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

-            if constexpr (fsize == 32) {
-                code.rcpss(result, operand);
+            if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                FCODE(vrcp14s)(result, operand, operand);
            } else {
-                code.cvtsd2ss(result, operand);
-                code.rcpss(result, result);
-                code.cvtss2sd(result, result);
+                if constexpr (fsize == 32) {
+                    code.rcpss(result, operand);
+                } else {
+                    code.cvtsd2ss(result, operand);
+                    code.rcpss(result, result);
+                    code.cvtss2sd(result, result);
+                }
            }

            ctx.reg_alloc.DefineValue(inst, result);
@@ -984,20 +988,22 @@ static void EmitFPRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

-            if constexpr (fsize == 32) {
-                code.rsqrtss(result, operand);
+            if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                FCODE(vrsqrt14s)(result, operand, operand);
            } else {
-                code.cvtsd2ss(result, operand);
-                code.rsqrtss(result, result);
-                code.cvtss2sd(result, result);
+                if constexpr (fsize == 32) {
+                    code.rsqrtss(result, operand);
+                } else {
+                    code.cvtsd2ss(result, operand);
+                    code.rsqrtss(result, result);
+                    code.cvtss2sd(result, result);
+                }
            }

            ctx.reg_alloc.DefineValue(inst, result);
            return;
        }

-        // TODO: VRSQRT14SS implementation (AVX512F)
-
        auto args = ctx.reg_alloc.GetArgumentInfo(inst);

        const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
@@ -165,7 +165,7 @@ void EmitX64::EmitVectorGetElement8(EmitContext& ctx, IR::Inst* inst) {
    if (code.HasHostFeature(HostFeature::SSE41)) {
        code.pextrb(dest, source, index);
    } else {
-        code.pextrw(dest, source, index / 2);
+        code.pextrw(dest, source, u8(index / 2));
        if (index % 2 == 1) {
            code.shr(dest, 8);
        } else {
@@ -439,6 +439,17 @@ void EmitX64::EmitVectorAnd(EmitContext& ctx, IR::Inst* inst) {
    EmitVectorOperation(code, ctx, inst, &Xbyak::CodeGenerator::pand);
 }

+void EmitX64::EmitVectorAndNot(EmitContext& ctx, IR::Inst* inst) {  // Emits 128-bit vector AND-NOT: result = args[0] & ~args[1]
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
+    const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseScratchXmm(args[1]);  // scratch because PANDN overwrites its destination
+
+    code.pandn(xmm_b, xmm_a);  // x86 PANDN inverts the destination operand: xmm_b = ~xmm_b & xmm_a
+
+    ctx.reg_alloc.DefineValue(inst, xmm_b);
+}
+
 static void ArithmeticShiftRightByte(EmitContext& ctx, BlockOfCode& code, const Xbyak::Xmm& result, u8 shift_amount) {
    if (code.HasHostFeature(HostFeature::GFNI)) {
        const u64 shift_matrix = shift_amount < 8
@@ -741,6 +752,148 @@ void EmitX64::EmitVectorBroadcast64(EmitContext& ctx, IR::Inst* inst) {
    ctx.reg_alloc.DefineValue(inst, a);
 }

+void EmitX64::EmitVectorBroadcastElementLower8(EmitContext& ctx, IR::Inst* inst) {  // Broadcasts byte element args[1] of args[0] across the low 64 bits
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    ASSERT(args[1].IsImmediate());  // the element index must be known at emit time
+    const u8 index = args[1].GetImmediateU8();
+    ASSERT(index < 16);
+
+    if (index > 0) {
+        code.psrldq(a, index);  // byte-wise right shift moves the selected byte into byte 0
+    }
+
+    if (code.HasHostFeature(HostFeature::AVX2)) {
+        code.vpbroadcastb(a, a);  // replicate byte 0 into every byte lane
+        code.vmovq(a, a);  // keep the low 64 bits, zero the upper 64 bits
+    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code.pxor(tmp, tmp);  // all-zero shuffle control
+        code.pshufb(a, tmp);  // every lane selects source byte 0
+        code.movq(a, a);  // keep the low 64 bits, zero the upper 64 bits
+    } else {
+        code.punpcklbw(a, a);  // duplicate each low byte, so word 0 holds the selected byte twice
+        code.pshuflw(a, a, 0);  // replicate word 0 over the four low words; NOTE(review): upper 64 bits are not zeroed on this path, unlike the two above -- confirm callers ignore them
+    }
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElementLower16(EmitContext& ctx, IR::Inst* inst) {  // Broadcasts 16-bit element args[1] of args[0] across the low 64 bits
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    ASSERT(args[1].IsImmediate());  // the element index must be known at emit time
+    const u8 index = args[1].GetImmediateU8();
+    ASSERT(index < 8);
+
+    if (index > 0) {
+        code.psrldq(a, u8(index * 2));  // shift by index * 2 bytes so the selected word lands in word 0
+    }
+
+    code.pshuflw(a, a, 0);  // replicate word 0 over the four low words; the upper 64 bits are left as shifted
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElementLower32(EmitContext& ctx, IR::Inst* inst) {  // Broadcasts 32-bit element args[1] of args[0] across the low 64 bits
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    ASSERT(args[1].IsImmediate());  // the element index must be known at emit time
+    const u8 index = args[1].GetImmediateU8();
+    ASSERT(index < 4);
+
+    if (index > 0) {
+        code.psrldq(a, u8(index * 4));  // shift by index * 4 bytes so the selected dword lands in dword 0
+    }
+
+    code.pshuflw(a, a, 0b01'00'01'00);  // low words become {w0, w1, w0, w1}, i.e. dword 0 twice; the upper 64 bits are left as shifted
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElement8(EmitContext& ctx, IR::Inst* inst) {  // Broadcasts byte element args[1] of args[0] across all 16 bytes
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    ASSERT(args[1].IsImmediate());  // the element index must be known at emit time
+    const u8 index = args[1].GetImmediateU8();
+    ASSERT(index < 16);
+
+    if (index > 0) {
+        code.psrldq(a, index);  // byte-wise right shift moves the selected byte into byte 0
+    }
+
+    if (code.HasHostFeature(HostFeature::AVX2)) {
+        code.vpbroadcastb(a, a);  // replicate byte 0 into every byte lane
+    } else if (code.HasHostFeature(HostFeature::SSSE3)) {
+        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+
+        code.pxor(tmp, tmp);  // all-zero shuffle control
+        code.pshufb(a, tmp);  // every lane selects source byte 0
+    } else {
+        code.punpcklbw(a, a);  // duplicate each low byte, so word 0 holds the selected byte twice
+        code.pshuflw(a, a, 0);  // replicate word 0 over the four low words
+        code.punpcklqdq(a, a);  // copy the low 64 bits into the high 64 bits
+    }
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElement16(EmitContext& ctx, IR::Inst* inst) {  // Broadcasts 16-bit element args[1] of args[0] across all eight words
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    ASSERT(args[1].IsImmediate());  // the element index must be known at emit time
+    const u8 index = args[1].GetImmediateU8();
+    ASSERT(index < 8);
+
+    if (index == 0 && code.HasHostFeature(HostFeature::AVX2)) {  // the selected word is already word 0, so a single broadcast suffices
+        code.vpbroadcastw(a, a);
+
+        ctx.reg_alloc.DefineValue(inst, a);
+        return;
+    }
+
+    if (index < 4) {  // selected word lives in the low 64 bits
+        code.pshuflw(a, a, Common::Replicate<u8>(index, 2));  // 2-bit selector repeated in all four immediate fields
+        code.punpcklqdq(a, a);  // copy the low 64 bits into the high 64 bits
+    } else {  // selected word lives in the high 64 bits
+        code.pshufhw(a, a, Common::Replicate<u8>(u8(index - 4), 2));  // selector is relative to the high half, hence index - 4
+        code.punpckhqdq(a, a);  // copy the high 64 bits into the low 64 bits
+    }
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElement32(EmitContext& ctx, IR::Inst* inst) {  // Broadcasts 32-bit element args[1] of args[0] across all four dwords
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    ASSERT(args[1].IsImmediate());  // the element index must be known at emit time
+    const u8 index = args[1].GetImmediateU8();
+    ASSERT(index < 4);
+
+    code.pshufd(a, a, Common::Replicate<u8>(index, 2));  // 2-bit selector repeated in all four immediate fields, so every dword picks element `index`
+
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
+void EmitX64::EmitVectorBroadcastElement64(EmitContext& ctx, IR::Inst* inst) {  // Broadcasts 64-bit element args[1] of args[0] into both qwords
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+    const Xbyak::Xmm a = ctx.reg_alloc.UseScratchXmm(args[0]);
+    ASSERT(args[1].IsImmediate());  // the element index must be known at emit time
+    const u8 index = args[1].GetImmediateU8();
+    ASSERT(index < 2);
+
+    if (code.HasHostFeature(HostFeature::AVX)) {
+        code.vpermilpd(a, a, Common::Replicate<u8>(index, 1));  // every selector bit equals index, so both qwords pick the same source qword
+    } else {
+        if (index == 0) {
+            code.punpcklqdq(a, a);  // duplicate the low qword
+        } else {
+            code.punpckhqdq(a, a);  // duplicate the high qword
+        }
+    }
+    ctx.reg_alloc.DefineValue(inst, a);
+}
+
 template<typename T>
 static void EmitVectorCountLeadingZeros(VectorArray<T>& result, const VectorArray<T>& data) {
    for (size_t i = 0; i < result.size(); i++) {
@@ -1288,12 +1288,16 @@ static void EmitRecipEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

-            if constexpr (fsize == 32) {
-                code.rcpps(result, operand);
+            if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                FCODE(vrcp14p)(result, operand);
            } else {
-                code.cvtpd2ps(result, operand);
-                code.rcpps(result, result);
-                code.cvtps2pd(result, result);
+                if constexpr (fsize == 32) {
+                    code.rcpps(result, operand);
+                } else {
+                    code.cvtpd2ps(result, operand);
+                    code.rcpps(result, result);
+                    code.cvtps2pd(result, result);
+                }
            }

            ctx.reg_alloc.DefineValue(inst, result);
@@ -1502,12 +1506,16 @@ static void EmitRSqrtEstimate(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
            const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();

-            if constexpr (fsize == 32) {
-                code.rsqrtps(result, operand);
+            if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                FCODE(vrsqrt14p)(result, operand);
            } else {
-                code.cvtpd2ps(result, operand);
-                code.rsqrtps(result, result);
-                code.cvtps2pd(result, result);
+                if constexpr (fsize == 32) {
+                    code.rsqrtps(result, operand);
+                } else {
+                    code.cvtpd2ps(result, operand);
+                    code.rsqrtps(result, result);
+                    code.cvtps2pd(result, result);
+                }
            }

            ctx.reg_alloc.DefineValue(inst, result);
@@ -1707,8 +1715,6 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(2).GetU8());
    [[maybe_unused]] const bool fpcr_controlled = inst->GetArg(3).GetU1();

-    // TODO: AVX512 implementation
-
    if constexpr (fsize != 16) {
        if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero) {
            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
@@ -1737,17 +1743,21 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                    if constexpr (fsize == 32) {
                        code.cvttps2dq(src, src);
                    } else {
-                        const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
-                        const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();
+                        if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                            code.vcvttpd2qq(src, src);
+                        } else {
+                            const Xbyak::Reg64 hi = ctx.reg_alloc.ScratchGpr();
+                            const Xbyak::Reg64 lo = ctx.reg_alloc.ScratchGpr();

-                        code.cvttsd2si(lo, src);
-                        code.punpckhqdq(src, src);
-                        code.cvttsd2si(hi, src);
-                        code.movq(src, lo);
-                        code.pinsrq(src, hi, 1);
+                            code.cvttsd2si(lo, src);
+                            code.punpckhqdq(src, src);
+                            code.cvttsd2si(hi, src);
+                            code.movq(src, lo);
+                            code.pinsrq(src, hi, 1);

-                        ctx.reg_alloc.Release(hi);
-                        ctx.reg_alloc.Release(lo);
+                            ctx.reg_alloc.Release(hi);
+                            ctx.reg_alloc.Release(lo);
+                        }
                    }
                };

@@ -1765,29 +1775,43 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
                [[maybe_unused]] constexpr u64 float_upper_limit_unsigned = fsize == 32 ? 0x4f800000 : 0x43f0000000000000;

                if constexpr (unsigned_) {
-                    // Zero is minimum
-                    code.xorps(xmm0, xmm0);
-                    FCODE(cmplep)(xmm0, src);
-                    FCODE(andp)(src, xmm0);
+                    if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+                        // Mask positive values
+                        code.xorps(xmm0, xmm0);
+                        FCODE(vcmpp)(k1, src, xmm0, Cmp::GreaterEqual_OQ);

-                    // Will we exceed unsigned range?
-                    const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm();
-                    code.movaps(exceed_unsigned, GetVectorOf<fsize, float_upper_limit_unsigned>(code));
-                    FCODE(cmplep)(exceed_unsigned, src);
+                        // Convert positive values to unsigned integers, write 0 anywhere else
+                        // vcvttp*2u*q already saturates out-of-range values to (0xFFFF...)
+                        if constexpr (fsize == 32) {
+                            code.vcvttps2udq(src | k1 | T_z, src);
+                        } else {
+                            code.vcvttpd2uqq(src | k1 | T_z, src);
+                        }
+                    } else {
+                        // Zero is minimum
+                        code.xorps(xmm0, xmm0);
+                        FCODE(cmplep)(xmm0, src);
+                        FCODE(andp)(src, xmm0);

-                    // Will be exceed signed range?
-                    const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
-                    code.movaps(tmp, GetVectorOf<fsize, float_upper_limit_signed>(code));
-                    code.movaps(xmm0, tmp);
-                    FCODE(cmplep)(xmm0, src);
-                    FCODE(andp)(tmp, xmm0);
-                    FCODE(subp)(src, tmp);
-                    perform_conversion(src);
-                    ICODE(psll)(xmm0, static_cast<u8>(fsize - 1));
-                    FCODE(orp)(src, xmm0);
+                        // Will we exceed unsigned range?
+                        const Xbyak::Xmm exceed_unsigned = ctx.reg_alloc.ScratchXmm();
+                        code.movaps(exceed_unsigned, GetVectorOf<fsize, float_upper_limit_unsigned>(code));
+                        FCODE(cmplep)(exceed_unsigned, src);

-                    // Saturate to max
-                    FCODE(orp)(src, exceed_unsigned);
+                        // Will we exceed signed range?
+                        const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
+                        code.movaps(tmp, GetVectorOf<fsize, float_upper_limit_signed>(code));
+                        code.movaps(xmm0, tmp);
+                        FCODE(cmplep)(xmm0, src);
+                        FCODE(andp)(tmp, xmm0);
+                        FCODE(subp)(src, tmp);
+                        perform_conversion(src);
+                        ICODE(psll)(xmm0, static_cast<u8>(fsize - 1));
+                        FCODE(orp)(src, xmm0);
+
+                        // Saturate to max
+                        FCODE(orp)(src, exceed_unsigned);
+                    }
                } else {
                    constexpr u64 integer_max = static_cast<FPT>(std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max());

@@ -5,7 +5,7 @@

 #include "dynarmic/backend/x64/hostloc.h"

-#include <xbyak.h>
+#include <xbyak/xbyak.h>

 #include "dynarmic/backend/x64/abi.h"
 #include "dynarmic/backend/x64/stack_layout.h"
@@ -4,7 +4,7 @@
 */
 #pragma once

-#include <xbyak.h>
+#include <xbyak/xbyak.h>

 #include "dynarmic/common/assert.h"
 #include "dynarmic/common/common_types.h"
@@ -5,7 +5,7 @@

 #pragma once

-#include <xbyak.h>
+#include <xbyak/xbyak.h>

 #include "dynarmic/common/assert.h"

@@ -10,7 +10,7 @@
 #include <utility>

 #include <fmt/ostream.h>
-#include <xbyak.h>
+#include <xbyak/xbyak.h>

 #include "dynarmic/backend/x64/abi.h"
 #include "dynarmic/backend/x64/stack_layout.h"
@@ -11,7 +11,7 @@
 #include <utility>
 #include <vector>

-#include <xbyak.h>
+#include <xbyak/xbyak.h>

 #include "dynarmic/backend/x64/block_of_code.h"
 #include "dynarmic/backend/x64/hostloc.h"
@@ -203,7 +203,7 @@ constexpr T Replicate(T value, size_t element_size) {
    ASSERT_MSG(BitSize<T>() % element_size == 0, "bitsize of T not divisible by element_size");
    if (element_size == BitSize<T>())
        return value;
-    return Replicate(value | (value << element_size), element_size * 2);
+    return Replicate<T>(T(value | value << element_size), element_size * 2);
 }

 template<typename T>
@@ -13,6 +13,7 @@
 #endif

 #include "dynarmic/common/assert.h"
+#include "dynarmic/common/cast_util.h"
 #include "dynarmic/common/common_types.h"
 #include "dynarmic/common/llvm_disassemble.h"

@@ -48,7 +49,7 @@ std::string DisassembleX64(const void* begin, const void* end) {
    LLVMDisasmDispose(llvm_ctx);
 #else
    result += fmt::format("(recompile with DYNARMIC_USE_LLVM=ON to disassemble the generated x86_64 code)\n");
-    result += fmt::format("start: {:016x}, end: {:016x}\n", begin, end);
+    result += fmt::format("start: {:016x}, end: {:016x}\n", BitCast<u64>(begin), BitCast<u64>(end));
 #endif

    return result;
@@ -1364,12 +1364,12 @@ public:

    std::string vfp_VMOV_from_i16(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<1> i2) {
        const size_t index = concatenate(i1, i2).ZeroExtend();
-        return fmt::format("vmov{}.{}16 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t);
+        return fmt::format("vmov{}.16 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t);
    }

    std::string vfp_VMOV_from_i8(Cond cond, Imm<1> i1, size_t Vd, Reg t, bool D, Imm<2> i2) {
        const size_t index = concatenate(i1, i2).ZeroExtend();
-        return fmt::format("vmov{}.{}8 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t);
+        return fmt::format("vmov{}.8 {}[{}], {}", CondToString(cond), FPRegStr(true, Vd, D), index, t);
    }

    std::string vfp_VMOV_to_i32(Cond cond, Imm<1> i, size_t Vn, Reg t, bool N) {
@@ -32,6 +32,7 @@ bool IsConditionPassed(TranslatorVisitor& v, IR::Cond cond) {

    if (cond == IR::Cond::NV) {
        // NV conditional is obsolete
+        v.cond_state = ConditionalState::Break;
        v.RaiseException(Exception::UnpredictableInstruction);
        return false;
    }
@@ -80,8 +80,7 @@ bool TranslatorVisitor::asimd_VDUP_scalar(bool D, Imm<4> imm4, size_t Vd, bool Q
    const auto m = ToVector(false, Vm, M);

    const auto reg_m = ir.GetVector(m);
-    const auto scalar = ir.VectorGetElement(esize, reg_m, index);
-    const auto result = ir.VectorBroadcast(esize, scalar);
+    const auto result = ir.VectorBroadcastElement(esize, reg_m, index);

    ir.SetVector(d, result);
    return true;
@@ -318,7 +318,7 @@ bool TranslatorVisitor::asimd_VAND_reg(bool D, size_t Vn, size_t Vd, bool N, boo

 bool TranslatorVisitor::asimd_VBIC_reg(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
    return BitwiseInstruction<false>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_n, const auto& reg_m) {
-        return ir.VectorAnd(reg_n, ir.VectorNot(reg_m));
+        return ir.VectorAndNot(reg_n, reg_m);
    });
 }

@@ -342,19 +342,19 @@ bool TranslatorVisitor::asimd_VEOR_reg(bool D, size_t Vn, size_t Vd, bool N, boo

 bool TranslatorVisitor::asimd_VBSL(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
    return BitwiseInstruction<true>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
-        return ir.VectorOr(ir.VectorAnd(reg_n, reg_d), ir.VectorAnd(reg_m, ir.VectorNot(reg_d)));
+        return ir.VectorOr(ir.VectorAnd(reg_n, reg_d), ir.VectorAndNot(reg_m, reg_d));
    });
 }

 bool TranslatorVisitor::asimd_VBIT(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
    return BitwiseInstruction<true>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
-        return ir.VectorOr(ir.VectorAnd(reg_n, reg_m), ir.VectorAnd(reg_d, ir.VectorNot(reg_m)));
+        return ir.VectorOr(ir.VectorAnd(reg_n, reg_m), ir.VectorAndNot(reg_d, reg_m));
    });
 }

 bool TranslatorVisitor::asimd_VBIF(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
    return BitwiseInstruction<true>(*this, D, Vn, Vd, N, Q, M, Vm, [this](const auto& reg_d, const auto& reg_n, const auto& reg_m) {
-        return ir.VectorOr(ir.VectorAnd(reg_d, reg_m), ir.VectorAnd(reg_n, ir.VectorNot(reg_m)));
+        return ir.VectorOr(ir.VectorAnd(reg_d, reg_m), ir.VectorAndNot(reg_n, reg_m));
    });
 }

@@ -46,9 +46,8 @@ bool ScalarMultiply(TranslatorVisitor& v, bool Q, bool D, size_t sz, size_t Vn,
    const auto n = ToVector(Q, Vn, N);
    const auto [m, index] = GetScalarLocation(esize, M, Vm);

-    const auto scalar = v.ir.VectorGetElement(esize, v.ir.GetVector(m), index);
    const auto reg_n = v.ir.GetVector(n);
-    const auto reg_m = v.ir.VectorBroadcast(esize, scalar);
+    const auto reg_m = v.ir.VectorBroadcastElement(esize, v.ir.GetVector(m), index);
    const auto addend = F ? v.ir.FPVectorMul(esize, reg_n, reg_m, false)
                          : v.ir.VectorMultiply(esize, reg_n, reg_m);
    const auto result = [&] {
@@ -125,9 +124,8 @@ bool ScalarMultiplyReturnHigh(TranslatorVisitor& v, bool Q, bool D, size_t sz, s
    const auto n = ToVector(Q, Vn, N);
    const auto [m, index] = GetScalarLocation(esize, M, Vm);

-    const auto scalar = v.ir.VectorGetElement(esize, v.ir.GetVector(m), index);
    const auto reg_n = v.ir.GetVector(n);
-    const auto reg_m = v.ir.VectorBroadcast(esize, scalar);
+    const auto reg_m = v.ir.VectorBroadcastElement(esize, v.ir.GetVector(m), index);
    const auto result = [&] {
        const auto tmp = v.ir.VectorSignedSaturatedDoublingMultiply(esize, reg_n, reg_m);

@@ -177,9 +175,8 @@ bool TranslatorVisitor::asimd_VQDMULL_scalar(bool D, size_t sz, size_t Vn, size_
    const auto n = ToVector(false, Vn, N);
    const auto [m, index] = GetScalarLocation(esize, M, Vm);

-    const auto scalar = ir.VectorGetElement(esize, ir.GetVector(m), index);
    const auto reg_n = ir.GetVector(n);
-    const auto reg_m = ir.VectorBroadcast(esize, scalar);
+    const auto reg_m = ir.VectorBroadcastElement(esize, ir.GetVector(m), index);
    const auto result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, reg_n, reg_m);

    ir.SetVector(d, result);
@@ -177,7 +177,7 @@ bool TranslatorVisitor::asimd_VSRI(bool D, size_t imm6, size_t Vd, bool L, bool

    const auto shifted = ir.VectorLogicalShiftRight(esize, reg_m, static_cast<u8>(shift_amount));
    const auto mask_vec = ir.VectorBroadcast(esize, I(esize, mask));
-    const auto result = ir.VectorOr(ir.VectorAnd(reg_d, ir.VectorNot(mask_vec)), shifted);
+    const auto result = ir.VectorOr(ir.VectorAndNot(reg_d, mask_vec), shifted);

    ir.SetVector(d, result);
    return true;
@@ -203,7 +203,7 @@ bool TranslatorVisitor::asimd_VSLI(bool D, size_t imm6, size_t Vd, bool L, bool

    const auto shifted = ir.VectorLogicalShiftLeft(esize, reg_m, static_cast<u8>(shift_amount));
    const auto mask_vec = ir.VectorBroadcast(esize, I(esize, mask));
-    const auto result = ir.VectorOr(ir.VectorAnd(reg_d, ir.VectorNot(mask_vec)), shifted);
+    const auto result = ir.VectorOr(ir.VectorAndNot(reg_d, mask_vec), shifted);

    ir.SetVector(d, result);
    return true;
@@ -250,7 +250,7 @@ bool TranslatorVisitor::arm_BIC_imm(Cond cond, bool S, Reg n, Reg d, int rotate,
    }

    const auto imm_carry = ArmExpandImm_C(rotate, imm8, ir.GetCFlag());
-    const auto result = ir.And(ir.GetRegister(n), ir.Not(ir.Imm32(imm_carry.imm32)));
+    const auto result = ir.AndNot(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));
    if (d == Reg::PC) {
        if (S) {
            // This is UNPREDICTABLE when in user-mode.
@@ -280,7 +280,7 @@ bool TranslatorVisitor::arm_BIC_reg(Cond cond, bool S, Reg n, Reg d, Imm<5> imm5

    const auto carry_in = ir.GetCFlag();
    const auto shifted = EmitImmShift(ir.GetRegister(m), shift, imm5, carry_in);
-    const auto result = ir.And(ir.GetRegister(n), ir.Not(shifted.result));
+    const auto result = ir.AndNot(ir.GetRegister(n), shifted.result);
    if (d == Reg::PC) {
        if (S) {
            // This is UNPREDICTABLE when in user-mode.
@@ -315,7 +315,7 @@ bool TranslatorVisitor::arm_BIC_rsr(Cond cond, bool S, Reg n, Reg d, Reg s, Shif
    const auto shift_n = ir.LeastSignificantByte(ir.GetRegister(s));
    const auto carry_in = ir.GetCFlag();
    const auto shifted = EmitRegShift(ir.GetRegister(m), shift, shift_n, carry_in);
-    const auto result = ir.And(ir.GetRegister(n), ir.Not(shifted.result));
+    const auto result = ir.AndNot(ir.GetRegister(n), shifted.result);

    ir.SetRegister(d, result);
    if (S) {
@@ -356,7 +356,7 @@ bool TranslatorVisitor::thumb16_MUL_reg(Reg n, Reg d_m) {
 bool TranslatorVisitor::thumb16_BIC_reg(Reg m, Reg d_n) {
    const Reg d = d_n;
    const Reg n = d_n;
-    const auto result = ir.And(ir.GetRegister(n), ir.Not(ir.GetRegister(m)));
+    const auto result = ir.AndNot(ir.GetRegister(n), ir.GetRegister(m));

    ir.SetRegister(d, result);
    if (!ir.current_location.IT().IsInITBlock()) {
@@ -45,7 +45,7 @@ bool TranslatorVisitor::thumb32_BIC_imm(Imm<1> i, bool S, Reg n, Imm<3> imm3, Re
    }

    const auto imm_carry = ThumbExpandImm_C(i, imm3, imm8, ir.GetCFlag());
-    const auto result = ir.And(ir.GetRegister(n), ir.Not(ir.Imm32(imm_carry.imm32)));
+    const auto result = ir.AndNot(ir.GetRegister(n), ir.Imm32(imm_carry.imm32));

    ir.SetRegister(d, result);
    if (S) {
@@ -45,7 +45,7 @@ bool TranslatorVisitor::thumb32_BIC_reg(bool S, Reg n, Imm<3> imm3, Reg d, Imm<2
    }

    const auto shifted = EmitImmShift(ir.GetRegister(m), type, imm3, imm2, ir.GetCFlag());
-    const auto result = ir.And(ir.GetRegister(n), ir.Not(shifted.result));
+    const auto result = ir.AndNot(ir.GetRegister(n), shifted.result);
    ir.SetRegister(d, result);
    if (S) {
        ir.SetNFlag(ir.MostSignificantBit(result));
@@ -128,8 +128,8 @@ bool TranslatorVisitor::BIC_shift(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Re
    const u8 shift_amount = imm6.ZeroExtend<u8>();

    const auto operand1 = X(datasize, Rn);
-    const auto operand2 = ir.Not(ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)));
-    const auto result = ir.And(operand1, operand2);
+    const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+    const auto result = ir.AndNot(operand1, operand2);

    X(datasize, Rd, result);
    return true;
@@ -225,8 +225,8 @@ bool TranslatorVisitor::BICS(bool sf, Imm<2> shift, Reg Rm, Imm<6> imm6, Reg Rn,
    const u8 shift_amount = imm6.ZeroExtend<u8>();

    const auto operand1 = X(datasize, Rn);
-    const auto operand2 = ir.Not(ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount)));
-    const auto result = ir.And(operand1, operand2);
+    const auto operand2 = ShiftReg(datasize, Rm, shift, ir.Imm8(shift_amount));
+    const auto result = ir.AndNot(operand1, operand2);

    ir.SetNZCV(ir.NZCVFrom(result));
    X(datasize, Rd, result);
@@ -41,8 +41,7 @@ bool TranslatorVisitor::DUP_elt_2(bool Q, Imm<5> imm5, Vec Vn, Vec Vd) {
    const size_t datasize = Q ? 128 : 64;

    const IR::U128 operand = V(idxdsize, Vn);
-    const IR::UAny element = ir.VectorGetElement(esize, operand, index);
-    const IR::U128 result = Q ? ir.VectorBroadcast(esize, element) : ir.VectorBroadcastLower(esize, element);
+    const IR::U128 result = Q ? ir.VectorBroadcastElement(esize, operand, index) : ir.VectorBroadcastElementLower(esize, operand, index);
    V(datasize, Vd, result);
    return true;
 }
@@ -23,7 +23,7 @@ bool TranslatorVisitor::BCAX(Vec Vm, Vec Va, Vec Vn, Vec Vd) {
    const IR::U128 m = ir.GetQ(Vm);
    const IR::U128 n = ir.GetQ(Vn);

-    const IR::U128 result = ir.VectorEor(n, ir.VectorAnd(m, ir.VectorNot(a)));
+    const IR::U128 result = ir.VectorEor(n, ir.VectorAndNot(m, a));

    ir.SetQ(Vd, result);
    return true;
@@ -65,7 +65,7 @@ bool SM3TT2(TranslatorVisitor& v, Vec Vm, Imm<2> imm2, Vec Vn, Vec Vd, SM3TTVari
            return v.ir.Eor(after_low_d, v.ir.Eor(top_d, before_top_d));
        }
        const IR::U32 tmp1 = v.ir.And(top_d, before_top_d);
-        const IR::U32 tmp2 = v.ir.And(v.ir.Not(top_d), after_low_d);
+        const IR::U32 tmp2 = v.ir.AndNot(after_low_d, top_d);
        return v.ir.Or(tmp1, tmp2);
    }();
    const IR::U32 final_tt2 = v.ir.Add(tt2, v.ir.Add(low_d, v.ir.Add(top_n, wj)));
@@ -156,7 +156,7 @@ bool ShiftAndInsert(TranslatorVisitor& v, Imm<4> immh, Imm<3> immb, Vec Vn, Vec
        return v.ir.LogicalShiftLeft(operand1, v.ir.Imm8(shift_amount));
    }();

-    const IR::U64 result = v.ir.Or(v.ir.And(operand2, v.ir.Not(v.ir.Imm64(mask))), shifted);
+    const IR::U64 result = v.ir.Or(v.ir.AndNot(operand2, v.ir.Imm64(mask)), shifted);
    v.V_scalar(esize, Vd, result);
    return true;
 }
@@ -143,8 +143,8 @@ bool TranslatorVisitor::SQRDMULH_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> V
    const auto [index, Vm] = Combine(size, H, L, M, Vmlo);

    const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0));
-    const IR::UAny operand2 = ir.VectorGetElement(esize, V(128, Vm), index);
-    const IR::U128 broadcast = ir.VectorBroadcast(esize, operand2);
+    const IR::U128 operand2 = V(128, Vm);
+    const IR::U128 broadcast = ir.VectorBroadcastElement(esize, operand2, index);
    const IR::UpperAndLower multiply = ir.VectorSignedSaturatedDoublingMultiply(esize, operand1, broadcast);
    const IR::U128 result = ir.VectorAdd(esize, multiply.upper, ir.VectorLogicalShiftRight(esize, multiply.lower, static_cast<u8>(esize - 1)));

@@ -161,8 +161,8 @@ bool TranslatorVisitor::SQDMULL_elt_1(Imm<2> size, Imm<1> L, Imm<1> M, Imm<4> Vm
    const auto [index, Vm] = Combine(size, H, L, M, Vmlo);

    const IR::U128 operand1 = ir.ZeroExtendToQuad(ir.VectorGetElement(esize, V(128, Vn), 0));
-    const IR::UAny operand2 = ir.VectorGetElement(esize, V(128, Vm), index);
-    const IR::U128 broadcast = ir.VectorBroadcast(esize, operand2);
+    const IR::U128 operand2 = V(128, Vm);
+    const IR::U128 broadcast = ir.VectorBroadcastElement(esize, operand2, index);
    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, operand1, broadcast);

    V(128, Vd, result);
@@ -50,7 +50,7 @@ IR::U128 SHA512Hash(IREmitter& ir, Vec Vm, Vec Vn, Vec Vd, SHA512HashPart part)
        const IR::U64 tmp1 = ir.And(a, b);

        if (part == SHA512HashPart::Part1) {
-            const IR::U64 tmp2 = ir.And(ir.Not(a), c);
+            const IR::U64 tmp2 = ir.AndNot(c, a);
            return ir.Eor(tmp1, tmp2);
        }

@@ -350,7 +350,7 @@ bool TranslatorVisitor::SRI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd)

    const IR::U128 shifted = ir.VectorLogicalShiftRight(esize, operand1, shift_amount);
    const IR::U128 mask_vec = ir.VectorBroadcast(esize, I(esize, mask));
-    const IR::U128 result = ir.VectorOr(ir.VectorAnd(operand2, ir.VectorNot(mask_vec)), shifted);
+    const IR::U128 result = ir.VectorOr(ir.VectorAndNot(operand2, mask_vec), shifted);

    V(datasize, Vd, result);
    return true;
@@ -376,7 +376,7 @@ bool TranslatorVisitor::SLI_2(bool Q, Imm<4> immh, Imm<3> immb, Vec Vn, Vec Vd)

    const IR::U128 shifted = ir.VectorLogicalShiftLeft(esize, operand1, shift_amount);
    const IR::U128 mask_vec = ir.VectorBroadcast(esize, I(esize, mask));
-    const IR::U128 result = ir.VectorOr(ir.VectorAnd(operand2, ir.VectorNot(mask_vec)), shifted);
+    const IR::U128 result = ir.VectorOr(ir.VectorAndNot(operand2, mask_vec), shifted);

    V(datasize, Vd, result);
    return true;
@@ -773,7 +773,7 @@ bool TranslatorVisitor::BIC_asimd_reg(bool Q, Vec Vm, Vec Vn, Vec Vd) {
    const IR::U128 operand1 = V(datasize, Vn);
    const IR::U128 operand2 = V(datasize, Vm);

-    IR::U128 result = ir.VectorAnd(operand1, ir.VectorNot(operand2));
+    IR::U128 result = ir.VectorAndNot(operand1, operand2);
    if (datasize == 64) {
        result = ir.VectorZeroUpper(result);
    }
@@ -36,7 +36,7 @@ bool MultiplyByElement(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<
    const size_t datasize = Q ? 128 : 64;

    const IR::U128 operand1 = v.V(datasize, Vn);
-    const IR::U128 operand2 = v.ir.VectorBroadcast(esize, v.ir.VectorGetElement(esize, v.V(idxdsize, Vm), index));
+    const IR::U128 operand2 = v.ir.VectorBroadcastElement(esize, v.V(idxdsize, Vm), index);
    const IR::U128 operand3 = v.V(datasize, Vd);

    IR::U128 result = v.ir.VectorMultiply(esize, operand1, operand2);
@@ -64,9 +64,8 @@ bool FPMultiplyByElement(TranslatorVisitor& v, bool Q, bool sz, Imm<1> L, Imm<1>
    const size_t esize = sz ? 64 : 32;
    const size_t datasize = Q ? 128 : 64;

-    const IR::UAny element2 = v.ir.VectorGetElement(esize, v.V(idxdsize, Vm), index);
    const IR::U128 operand1 = v.V(datasize, Vn);
-    const IR::U128 operand2 = Q ? v.ir.VectorBroadcast(esize, element2) : v.ir.VectorBroadcastLower(esize, element2);
+    const IR::U128 operand2 = Q ? v.ir.VectorBroadcastElement(esize, v.V(idxdsize, Vm), index) : v.ir.VectorBroadcastElementLower(esize, v.V(idxdsize, Vm), index);
    const IR::U128 operand3 = v.V(datasize, Vd);

    const IR::U128 result = [&] {
@@ -93,9 +92,8 @@ bool FPMultiplyByElementHalfPrecision(TranslatorVisitor& v, bool Q, Imm<1> L, Im
    const size_t esize = 16;
    const size_t datasize = Q ? 128 : 64;

-    const IR::UAny element2 = v.ir.VectorGetElement(esize, v.V(idxdsize, Vm), index);
    const IR::U128 operand1 = v.V(datasize, Vn);
-    const IR::U128 operand2 = Q ? v.ir.VectorBroadcast(esize, element2) : v.ir.VectorBroadcastLower(esize, element2);
+    const IR::U128 operand2 = Q ? v.ir.VectorBroadcastElement(esize, v.V(idxdsize, Vm), index) : v.ir.VectorBroadcastElementLower(esize, v.V(idxdsize, Vm), index);
    const IR::U128 operand3 = v.V(datasize, Vd);

    // TODO: We currently don't implement half-precision paths for
@@ -179,7 +177,7 @@ bool MultiplyLong(TranslatorVisitor& v, bool Q, Imm<2> size, Imm<1> L, Imm<1> M,

    const IR::U128 operand1 = v.Vpart(datasize, Vn, Q);
    const IR::U128 operand2 = v.V(idxsize, Vm);
-    const IR::U128 index_vector = v.ir.VectorBroadcast(esize, v.ir.VectorGetElement(esize, operand2, index));
+    const IR::U128 index_vector = v.ir.VectorBroadcastElement(esize, operand2, index);

    const IR::U128 result = [&] {
        const auto [extended_op1, extended_index] = extend_operands(operand1, index_vector);
@@ -349,7 +347,7 @@ bool TranslatorVisitor::SQDMULL_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, I

    const IR::U128 operand1 = Vpart(datasize, Vn, part);
    const IR::U128 operand2 = V(idxsize, Vm);
-    const IR::U128 index_vector = ir.VectorBroadcast(esize, ir.VectorGetElement(esize, operand2, index));
+    const IR::U128 index_vector = ir.VectorBroadcastElement(esize, operand2, index);
    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiplyLong(esize, operand1, index_vector);

    V(128, Vd, result);
@@ -368,7 +366,7 @@ bool TranslatorVisitor::SQDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M, I

    const IR::U128 operand1 = V(datasize, Vn);
    const IR::U128 operand2 = V(idxsize, Vm);
-    const IR::U128 index_vector = ir.VectorBroadcast(esize, ir.VectorGetElement(esize, operand2, index));
+    const IR::U128 index_vector = ir.VectorBroadcastElement(esize, operand2, index);
    const IR::U128 result = ir.VectorSignedSaturatedDoublingMultiply(esize, operand1, index_vector).upper;

    V(datasize, Vd, result);
@@ -387,7 +385,7 @@ bool TranslatorVisitor::SQRDMULH_elt_2(bool Q, Imm<2> size, Imm<1> L, Imm<1> M,

    const IR::U128 operand1 = V(datasize, Vn);
    const IR::U128 operand2 = V(idxsize, Vm);
-    const IR::U128 index_vector = ir.VectorBroadcast(esize, ir.VectorGetElement(esize, operand2, index));
+    const IR::U128 index_vector = ir.VectorBroadcastElement(esize, operand2, index);
    const IR::UpperAndLower multiply = ir.VectorSignedSaturatedDoublingMultiply(esize, operand1, index_vector);
    const IR::U128 result = ir.VectorAdd(esize, multiply.upper, ir.VectorLogicalShiftRight(esize, multiply.lower, static_cast<u8>(esize - 1)));

@@ -15,7 +15,7 @@ bool TranslatorVisitor::AXFlag() {
    const IR::U32 v = ir.And(nzcv, ir.Imm32(0x10000000));

    const IR::U32 new_z = ir.Or(ir.LogicalShiftLeft(v, ir.Imm8(2)), z);
-    const IR::U32 new_c = ir.And(ir.And(c, ir.Not(ir.LogicalShiftLeft(v, ir.Imm8(1)))), ir.Imm32(0x20000000));
+    const IR::U32 new_c = ir.And(ir.AndNot(c, ir.LogicalShiftLeft(v, ir.Imm8(1))), ir.Imm32(0x20000000));

    ir.SetNZCVRaw(ir.Or(new_z, new_c));
    return true;
@@ -27,8 +27,8 @@ bool TranslatorVisitor::XAFlag() {
    const IR::U32 z = ir.And(nzcv, ir.Imm32(0x40000000));
    const IR::U32 c = ir.And(nzcv, ir.Imm32(0x20000000));

-    const IR::U32 not_z = ir.And(ir.Not(z), ir.Imm32(0x40000000));
-    const IR::U32 not_c = ir.And(ir.Not(c), ir.Imm32(0x20000000));
+    const IR::U32 not_z = ir.AndNot(ir.Imm32(0x40000000), z);
+    const IR::U32 not_c = ir.AndNot(ir.Imm32(0x20000000), c);

    const IR::U32 new_n = ir.And(ir.LogicalShiftLeft(not_c, ir.Imm8(2)),
                                 ir.LogicalShiftLeft(not_z, ir.Imm8(1)));
@@ -104,7 +104,10 @@ struct detail {
            }
        }

+#ifndef DYNARMIC_IGNORE_ASSERTS
+        // Avoids a MSVC ICE.
        ASSERT(std::all_of(masks.begin(), masks.end(), [](auto m) { return m != 0; }));
+#endif

        return std::make_tuple(masks, shifts);
    }
@@ -88,11 +88,8 @@ public:
        return is_executing;
    }

-    /**
-     * Debugging: Disassemble all of compiled code.
-     * @return A string containing disassembly of all host machine code produced.
-     */
-    std::string Disassemble() const;
+    /// Debugging: Dump a disassembly of all compiled code to the console.
+    void DumpDisassembly() const;

 private:
    bool is_executing = false;
@@ -114,11 +114,8 @@ public:
     */
    bool IsExecuting() const;

-    /**
-     * Debugging: Disassemble all of compiled code.
-     * @return A string containing disassembly of all host machine code produced.
-     */
-    std::string Disassemble() const;
+    /// Debugging: Dump a disassembly of all compiled code to the console.
+    void DumpDisassembly() const;

 private:
    struct Impl;
@@ -317,6 +317,15 @@ U32U64 IREmitter::And(const U32U64& a, const U32U64& b) {
    }
 }

+U32U64 IREmitter::AndNot(const U32U64& a, const U32U64& b) {
+    ASSERT(a.GetType() == b.GetType());
+    if (a.GetType() == Type::U32) {
+        return Inst<U32>(Opcode::AndNot32, a, b);
+    } else {
+        return Inst<U64>(Opcode::AndNot64, a, b);
+    }
+}
+
 U32U64 IREmitter::Eor(const U32U64& a, const U32U64& b) {
    ASSERT(a.GetType() == b.GetType());
    if (a.GetType() == Type::U32) {
@@ -958,6 +967,10 @@ U128 IREmitter::VectorAnd(const U128& a, const U128& b) {
    return Inst<U128>(Opcode::VectorAnd, a, b);
 }

+U128 IREmitter::VectorAndNot(const U128& a, const U128& b) {
+    return Inst<U128>(Opcode::VectorAndNot, a, b);
+}
+
 U128 IREmitter::VectorArithmeticShiftRight(size_t esize, const U128& a, u8 shift_amount) {
    switch (esize) {
    case 8:
@@ -1012,6 +1025,34 @@ U128 IREmitter::VectorBroadcast(size_t esize, const UAny& a) {
    UNREACHABLE();
 }

+U128 IREmitter::VectorBroadcastElementLower(size_t esize, const U128& a, size_t index) {
+    ASSERT_MSG(esize * index < 128, "Invalid index");
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorBroadcastElementLower8, a, u8(index));
+    case 16:
+        return Inst<U128>(Opcode::VectorBroadcastElementLower16, a, u8(index));
+    case 32:
+        return Inst<U128>(Opcode::VectorBroadcastElementLower32, a, u8(index));
+    }
+    UNREACHABLE();
+}
+
+U128 IREmitter::VectorBroadcastElement(size_t esize, const U128& a, size_t index) {
+    ASSERT_MSG(esize * index < 128, "Invalid index");
+    switch (esize) {
+    case 8:
+        return Inst<U128>(Opcode::VectorBroadcastElement8, a, u8(index));
+    case 16:
+        return Inst<U128>(Opcode::VectorBroadcastElement16, a, u8(index));
+    case 32:
+        return Inst<U128>(Opcode::VectorBroadcastElement32, a, u8(index));
+    case 64:
+        return Inst<U128>(Opcode::VectorBroadcastElement64, a, u8(index));
+    }
+    UNREACHABLE();
+}
+
 U128 IREmitter::VectorCountLeadingZeros(size_t esize, const U128& a) {
    switch (esize) {
    case 8:
@@ -144,6 +144,7 @@ public:
    U32U64 UnsignedDiv(const U32U64& a, const U32U64& b);
    U32U64 SignedDiv(const U32U64& a, const U32U64& b);
    U32U64 And(const U32U64& a, const U32U64& b);
+    U32U64 AndNot(const U32U64& a, const U32U64& b);
    U32U64 Eor(const U32U64& a, const U32U64& b);
    U32U64 Or(const U32U64& a, const U32U64& b);
    U32U64 Not(const U32U64& a);
@@ -240,10 +241,13 @@ public:
    U128 VectorAbs(size_t esize, const U128& a);
    U128 VectorAdd(size_t esize, const U128& a, const U128& b);
    U128 VectorAnd(const U128& a, const U128& b);
+    U128 VectorAndNot(const U128& a, const U128& b);
    U128 VectorArithmeticShiftRight(size_t esize, const U128& a, u8 shift_amount);
    U128 VectorArithmeticVShift(size_t esize, const U128& a, const U128& b);
    U128 VectorBroadcast(size_t esize, const UAny& a);
    U128 VectorBroadcastLower(size_t esize, const UAny& a);
+    U128 VectorBroadcastElement(size_t esize, const U128& a, size_t index);
+    U128 VectorBroadcastElementLower(size_t esize, const U128& a, size_t index);
    U128 VectorCountLeadingZeros(size_t esize, const U128& a);
    U128 VectorEor(const U128& a, const U128& b);
    U128 VectorDeinterleaveEven(size_t esize, const U128& a, const U128& b);
@@ -155,10 +155,7 @@ bool Inst::IsMemoryReadOrWrite() const {
 bool Inst::ReadsFromCPSR() const {
    switch (op) {
    case Opcode::A32GetCpsr:
-    case Opcode::A32GetNFlag:
-    case Opcode::A32GetZFlag:
    case Opcode::A32GetCFlag:
-    case Opcode::A32GetVFlag:
    case Opcode::A32GetGEFlags:
    case Opcode::A32UpdateUpperLocationDescriptor:
    case Opcode::A64GetCFlag:
@@ -566,6 +563,8 @@ bool Inst::MayGetNZCVFromOp() const {
    case Opcode::Sub64:
    case Opcode::And32:
    case Opcode::And64:
+    case Opcode::AndNot32:
+    case Opcode::AndNot64:
    case Opcode::Eor32:
    case Opcode::Eor64:
    case Opcode::Or32:
@@ -1,3 +1,5 @@
+// clang-format off
+
 //     opcode name,                                         return type,    arg1 type,      arg2 type,      arg3 type,      arg4 type,      ...

 OPCODE(Void,                                                Void,                                                                           )
@@ -20,13 +22,10 @@ A32OPC(SetCpsr,                                             Void,           U32
 A32OPC(SetCpsrNZCV,                                         Void,           NZCV                                                            )
 A32OPC(SetCpsrNZCVRaw,                                      Void,           U32                                                             )
 A32OPC(SetCpsrNZCVQ,                                        Void,           U32                                                             )
-A32OPC(GetNFlag,                                            U1,                                                                             )
 A32OPC(SetNFlag,                                            Void,           U1                                                              )
-A32OPC(GetZFlag,                                            U1,                                                                             )
 A32OPC(SetZFlag,                                            Void,           U1                                                              )
 A32OPC(GetCFlag,                                            U1,                                                                             )
 A32OPC(SetCFlag,                                            Void,           U1                                                              )
-A32OPC(GetVFlag,                                            U1,                                                                             )
 A32OPC(SetVFlag,                                            Void,           U1                                                              )
 A32OPC(OrQFlag,                                             Void,           U1                                                              )
 A32OPC(GetGEFlags,                                          U32,                                                                            )
@@ -141,6 +140,8 @@ OPCODE(SignedDiv32,                                         U32,            U32,
 OPCODE(SignedDiv64,                                         U64,            U64,            U64                                             )
 OPCODE(And32,                                               U32,            U32,            U32                                             )
 OPCODE(And64,                                               U64,            U64,            U64                                             )
+OPCODE(AndNot32,                                            U32,            U32,            U32                                             )
+OPCODE(AndNot64,                                            U64,            U64,            U64                                             )
 OPCODE(Eor32,                                               U32,            U32,            U32                                             )
 OPCODE(Eor64,                                               U64,            U64,            U64                                             )
 OPCODE(Or32,                                                U32,            U32,            U32                                             )
@@ -289,6 +290,7 @@ OPCODE(VectorAdd16,                                         U128,           U128
 OPCODE(VectorAdd32,                                         U128,           U128,           U128                                            )
 OPCODE(VectorAdd64,                                         U128,           U128,           U128                                            )
 OPCODE(VectorAnd,                                           U128,           U128,           U128                                            )
+OPCODE(VectorAndNot,                                        U128,           U128,           U128                                            )
 OPCODE(VectorArithmeticShiftRight8,                         U128,           U128,           U8                                              )
 OPCODE(VectorArithmeticShiftRight16,                        U128,           U128,           U8                                              )
 OPCODE(VectorArithmeticShiftRight32,                        U128,           U128,           U8                                              )
@@ -304,6 +306,13 @@ OPCODE(VectorBroadcast8,                                    U128,           U8
 OPCODE(VectorBroadcast16,                                   U128,           U16                                                             )
 OPCODE(VectorBroadcast32,                                   U128,           U32                                                             )
 OPCODE(VectorBroadcast64,                                   U128,           U64                                                             )
+OPCODE(VectorBroadcastElementLower8,                        U128,           U128,           U8                                              )
+OPCODE(VectorBroadcastElementLower16,                       U128,           U128,           U8                                              )
+OPCODE(VectorBroadcastElementLower32,                       U128,           U128,           U8                                              )
+OPCODE(VectorBroadcastElement8,                             U128,           U128,           U8                                              )
+OPCODE(VectorBroadcastElement16,                            U128,           U128,           U8                                              )
+OPCODE(VectorBroadcastElement32,                            U128,           U128,           U8                                              )
+OPCODE(VectorBroadcastElement64,                            U128,           U128,           U8                                              )
 OPCODE(VectorCountLeadingZeros8,                            U128,           U128                                                            )
 OPCODE(VectorCountLeadingZeros16,                           U128,           U128                                                            )
 OPCODE(VectorCountLeadingZeros32,                           U128,           U128                                                            )
@@ -718,3 +727,5 @@ A32OPC(CoprocGetOneWord,                                    U32,            Copr
 A32OPC(CoprocGetTwoWords,                                   U64,            CoprocInfo                                                      )
 A32OPC(CoprocLoadWords,                                     Void,           CoprocInfo,     U32                                             )
 A32OPC(CoprocStoreWords,                                    Void,           CoprocInfo,     U32                                             )
+
+// clang-format on
@@ -170,18 +170,10 @@ void A32GetSetElimination(IR::Block& block) {
            do_set(cpsr_info.n, inst->GetArg(0), inst);
            break;
        }
-        case IR::Opcode::A32GetNFlag: {
-            do_get(cpsr_info.n, inst);
-            break;
-        }
        case IR::Opcode::A32SetZFlag: {
            do_set(cpsr_info.z, inst->GetArg(0), inst);
            break;
        }
-        case IR::Opcode::A32GetZFlag: {
-            do_get(cpsr_info.z, inst);
-            break;
-        }
        case IR::Opcode::A32SetCFlag: {
            do_set(cpsr_info.c, inst->GetArg(0), inst);
            break;
@@ -194,10 +186,6 @@ void A32GetSetElimination(IR::Block& block) {
            do_set(cpsr_info.v, inst->GetArg(0), inst);
            break;
        }
-        case IR::Opcode::A32GetVFlag: {
-            do_get(cpsr_info.v, inst);
-            break;
-        }
        case IR::Opcode::A32SetGEFlags: {
            do_set(cpsr_info.ge, inst->GetArg(0), inst);
            break;
@@ -10,7 +10,7 @@
 #include <tuple>
 #include <vector>

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "../fuzz_util.h"
 #include "../rand_int.h"
@@ -372,7 +372,7 @@ static void RunTestInstance(Dynarmic::A32::Jit& jit,
        fmt::print("\n");

        fmt::print("x86_64:\n");
-        fmt::print("{}\n", jit.Disassemble());
+        jit.DumpDisassembly();

        fmt::print("Interrupts:\n");
        for (const auto& i : uni_env.interrupts) {
@@ -12,7 +12,7 @@
 #include <string_view>
 #include <tuple>

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "../rand_int.h"
 #include "../unicorn_emu/a32_unicorn.h"
@@ -183,7 +183,8 @@ static void RunInstance(size_t run_number, ThumbTestEnv& test_env, A32Unicorn<Th
            Optimization::DeadCodeElimination(ir_block);
            Optimization::VerificationPass(ir_block);
            printf("\n\nIR:\n%s", IR::DumpBlock(ir_block).c_str());
-            printf("\n\nx86_64:\n%s", jit.Disassemble().c_str());
+            printf("\n\nx86_64:\n");
+            jit.DumpDisassembly();
            num_insts += ir_block.CycleCount();
        }

@@ -3,7 +3,7 @@
 * SPDX-License-Identifier: 0BSD
 */

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "dynarmic/frontend/A32/disassembler/disassembler.h"

@@ -3,7 +3,7 @@
 * SPDX-License-Identifier: 0BSD
 */

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "./testenv.h"
 #include "dynarmic/frontend/A32/location_descriptor.h"
@@ -3,7 +3,7 @@
 * SPDX-License-Identifier: 0BSD
 */

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "./testenv.h"
 #include "dynarmic/common/common_types.h"
@@ -3,7 +3,7 @@
 * SPDX-License-Identifier: 0BSD
 */

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "./testenv.h"
 #include "dynarmic/common/fp/fpsr.h"
@@ -675,6 +675,57 @@ TEST_CASE("A64: FMADD", "[a64]") {
    REQUIRE(jit.GetVector(10) == Vector{0x3f059921bf0dbfff, 0x0000000000000000});
 }

+TEST_CASE("A64: FMLA.4S(lane)", "[a64]") {  // by-element FMLA: one instruction per lane index 0..3 of V15, each with its own accumulator V0..V3
+    A64TestEnv env;
+    A64::Jit jit{A64::UserConfig{&env}};
+
+    env.code_mem.emplace_back(0x4f8f11c0);  // FMLA.4S V0, V14, V15[0]
+    env.code_mem.emplace_back(0x4faf11c1);  // FMLA.4S V1, V14, V15[1]
+    env.code_mem.emplace_back(0x4f8f19c2);  // FMLA.4S V2, V14, V15[2]
+    env.code_mem.emplace_back(0x4faf19c3);  // FMLA.4S V3, V14, V15[3]
+    env.code_mem.emplace_back(0x14000000);  // B .
+
+    jit.SetPC(0);
+    jit.SetVector(0, {0x3ff00000'3ff00000, 0x00000000'00000000});  // accumulators V0..V3: first u64 holds two 1.875f words (0x3ff00000), second u64 is zero
+    jit.SetVector(1, {0x3ff00000'3ff00000, 0x00000000'00000000});
+    jit.SetVector(2, {0x3ff00000'3ff00000, 0x00000000'00000000});
+    jit.SetVector(3, {0x3ff00000'3ff00000, 0x00000000'00000000});
+
+    jit.SetVector(14, {0x3ff00000'3ff00000, 0x3ff00000'3ff00000});  // multiplicand: all four 32-bit words are 1.875f
+    jit.SetVector(15, {0x3ff00000'40000000, 0x40400000'40800000});  // lane source; words low-to-high are 2.0, 1.875, 4.0, 3.0 (assuming the first element is the low 64 bits - TODO confirm)
+
+    env.ticks_left = 5;  // five instruction words were emitted above; presumably one tick each - verify against A64TestEnv
+    jit.Run();
+
+    REQUIRE(jit.GetVector(0) == Vector{0x40b4000040b40000, 0x4070000040700000});  // 1.875 + 1.875*2.0 = 5.625 | 0 + 3.75 = 3.75
+    REQUIRE(jit.GetVector(1) == Vector{0x40ac800040ac8000, 0x4061000040610000});  // 1.875 + 1.875*1.875 = 5.390625 | 0 + 3.515625 = 3.515625
+    REQUIRE(jit.GetVector(2) == Vector{0x4116000041160000, 0x40f0000040f00000});  // 1.875 + 1.875*4.0 = 9.375 | 0 + 7.5 = 7.5
+    REQUIRE(jit.GetVector(3) == Vector{0x40f0000040f00000, 0x40b4000040b40000});  // 1.875 + 1.875*3.0 = 7.5 | 0 + 5.625 = 5.625
+}
+
+TEST_CASE("A64: FMUL.4S(lane)", "[a64]") {  // by-element FMUL: one instruction per lane index 0..3 of V15, results in V0..V3
+    A64TestEnv env;
+    A64::Jit jit{A64::UserConfig{&env}};
+
+    env.code_mem.emplace_back(0x4f8f91c0);  // FMUL.4S V0, V14, V15[0]
+    env.code_mem.emplace_back(0x4faf91c1);  // FMUL.4S V1, V14, V15[1]
+    env.code_mem.emplace_back(0x4f8f99c2);  // FMUL.4S V2, V14, V15[2]
+    env.code_mem.emplace_back(0x4faf99c3);  // FMUL.4S V3, V14, V15[3]
+    env.code_mem.emplace_back(0x14000000);  // B .
+
+    jit.SetPC(0);
+    jit.SetVector(14, {0x3ff00000'3ff00000, 0x3ff00000'3ff00000});  // multiplicand: all four 32-bit words are 1.875f
+    jit.SetVector(15, {0x3ff00000'40000000, 0x40400000'40800000});  // lane source; words low-to-high are 2.0, 1.875, 4.0, 3.0 (assuming the first element is the low 64 bits - TODO confirm)
+
+    env.ticks_left = 5;  // five instruction words were emitted above; presumably one tick each - verify against A64TestEnv
+    jit.Run();
+
+    REQUIRE(jit.GetVector(0) == Vector{0x4070000040700000, 0x4070000040700000});  // every word: 1.875 * 2.0 = 3.75
+    REQUIRE(jit.GetVector(1) == Vector{0x4061000040610000, 0x4061000040610000});  // every word: 1.875 * 1.875 = 3.515625
+    REQUIRE(jit.GetVector(2) == Vector{0x40f0000040f00000, 0x40f0000040f00000});  // every word: 1.875 * 4.0 = 7.5
+    REQUIRE(jit.GetVector(3) == Vector{0x40b4000040b40000, 0x40b4000040b40000});  // every word: 1.875 * 3.0 = 5.625
+}
+
 TEST_CASE("A64: FMLA.4S (denormal)", "[a64]") {
    A64TestEnv env;
    A64::Jit jit{A64::UserConfig{&env}};
@@ -8,7 +8,7 @@
 #include <string>
 #include <vector>

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "../fuzz_util.h"
 #include "../rand_int.h"
@@ -232,7 +232,7 @@ static void RunTestInstance(Dynarmic::A64::Jit& jit, A64Unicorn& uni, A64TestEnv
        }
        const auto uni_vecs = uni.GetVectors();
        for (size_t i = 0; i < vecs.size(); ++i) {
-            fmt::print("{:3s}: {}{} {}{} {}\n", A64::VecToString(static_cast<A64::Vec>(i)),
+            fmt::print("{:3s}: {:016x}{:016x} {:016x}{:016x} {}\n", A64::VecToString(static_cast<A64::Vec>(i)),
                       uni_vecs[i][1], uni_vecs[i][0],
                       jit.GetVectors()[i][1], jit.GetVectors()[i][0],
                       uni_vecs[i] != jit.GetVectors()[i] ? "*" : "");
@@ -276,7 +276,7 @@ static void RunTestInstance(Dynarmic::A64::Jit& jit, A64Unicorn& uni, A64TestEnv
        fmt::print("{}\n", IR::DumpBlock(ir_block));

        fmt::print("x86_64:\n");
-        fmt::print("{}\n", jit.Disassemble());
+        jit.DumpDisassembly();

        fmt::print("Interrupts:\n");
        for (auto& i : uni_env.interrupts) {
@@ -3,7 +3,7 @@
 * SPDX-License-Identifier: 0BSD
 */

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "./testenv.h"
 #include "dynarmic/interface/A64/a64.h"
@@ -5,7 +5,7 @@

 #include <array>

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "../rand_int.h"
 #include "../unicorn_emu/a64_unicorn.h"
@@ -6,8 +6,8 @@
 #include <array>
 #include <utility>

-#include <catch.hpp>
-#include <xbyak_util.h>
+#include <catch2/catch.hpp>
+#include <xbyak/xbyak_util.h>

 TEST_CASE("Host CPU supports", "[a64]") {
    Xbyak::util::Cpu cpu_info;
@@ -7,7 +7,7 @@
 #include <iomanip>
 #include <iostream>

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "dynarmic/common/assert.h"
 #include "dynarmic/frontend/A32/decoder/asimd.h"
@@ -6,7 +6,7 @@
 #include <tuple>
 #include <vector>

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "../rand_int.h"
 #include "dynarmic/common/common_types.h"
@@ -6,7 +6,7 @@
 #include <tuple>
 #include <vector>

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "../rand_int.h"
 #include "dynarmic/common/common_types.h"
@@ -6,7 +6,7 @@
 #include <tuple>
 #include <vector>

-#include <catch.hpp>
+#include <catch2/catch.hpp>

 #include "../rand_int.h"
 #include "dynarmic/common/common_types.h"
@@ -4,4 +4,4 @@
 */

 #define CATCH_CONFIG_MAIN  // This tells Catch to provide a main() - only do this in one cpp file
-#include <catch.hpp>
+#include <catch2/catch.hpp>
@@ -3,7 +3,7 @@
 * SPDX-License-Identifier: 0BSD
 */

-#include <catch.hpp>
+#include <catch2/catch.hpp>
 #include <fmt/printf.h>

 #include "dynarmic/common/common_types.h"