SASS

class tests.test.sass.test_any.TestAnyMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.instruction.AnyMatcher.

INSTRUCTIONS: Final[dict[str, InstructionMatch]] = {'@!P3 LDG.E.64.CONSTANT R6, desc[UR16][R26.64]': InstructionMatch(opcode='LDG', modifiers=('E', '64', 'CONSTANT'), operands=('R6', 'desc[UR16][R26.64]'), predicate='@!P3', additional=None), '@!PT LDS RZ, [RZ]': InstructionMatch(opcode='LDS', modifiers=(), operands=('RZ', '[RZ]'), predicate='@!PT', additional=None), '@!UP0 UIMAD UR7, UR10, 0xc, URZ': InstructionMatch(opcode='UIMAD', modifiers=(), operands=('UR7', 'UR10', '0xc', 'URZ'), predicate='@!UP0', additional=None), '@UP0 LDCU.64 UR12, c[0x0][0x388]': InstructionMatch(opcode='LDCU', modifiers=('64',), operands=('UR12', 'c[0x0][0x388]'), predicate='@UP0', additional=None), 'ATOMG.E.ADD.STRONG.GPU PT, R4, desc[UR16][R4.64], R7': InstructionMatch(opcode='ATOMG', modifiers=('E', 'ADD', 'STRONG', 'GPU'), operands=('PT', 'R4', 'desc[UR16][R4.64]', 'R7'), predicate=None, additional=None), 'BAR.SYNC.DEFER_BLOCKING 0x0': InstructionMatch(opcode='BAR', modifiers=('SYNC', 'DEFER_BLOCKING'), operands=('0x0',), predicate=None, additional=None), 'BRA 0x240': InstructionMatch(opcode='BRA', modifiers=(), operands=('0x240',), predicate=None, additional=None), 'DADD R12, |R12|, |R14|': InstructionMatch(opcode='DADD', modifiers=(), operands=('R12', '|R12|', '|R14|'), predicate=None, additional=None), 'EXIT': InstructionMatch(opcode='EXIT', modifiers=(), operands=(), predicate=None, additional=None), 'F2FP.BF16.PACK_AB R27, R6, R27': InstructionMatch(opcode='F2FP', modifiers=('BF16', 'PACK_AB'), operands=('R27', 'R6', 'R27'), predicate=None, additional=None), 'FADD.FTZ.RN R0, R1, R2': InstructionMatch(opcode='FADD', modifiers=('FTZ', 'RN'), operands=('R0', 'R1', 'R2'), predicate=None, additional=None), 'FMUL R6, R27.reuse, 0.044714998453855514526': InstructionMatch(opcode='FMUL', modifiers=(), operands=('R6', 'R27.reuse', '0.044714998453855514526'), predicate=None, additional=None), 'FMUL.FTZ R2, R2, R3': InstructionMatch(opcode='FMUL', modifiers=('FTZ',), operands=('R2', 'R2', 'R3'), predicate=None, additional=None), 'HFMA2 R7, -RZ, RZ, 0, 5.9604644775390625e-08': InstructionMatch(opcode='HFMA2', modifiers=(), operands=('R7', '-RZ', 'RZ', '0', '5.9604644775390625e-08'), predicate=None, additional=None), 'HMUL2 R0, R2.H0_H0, R3.H0_H0': InstructionMatch(opcode='HMUL2', modifiers=(), operands=('R0', 'R2.H0_H0', 'R3.H0_H0'), predicate=None, additional=None), 'IADD3 R2, R2, 0x4, RZ': InstructionMatch(opcode='IADD3', modifiers=(), operands=('R2', 'R2', '0x4', 'RZ'), predicate=None, additional=None), 'IMAD.MOV.U32 R4, R4, R5, R6': InstructionMatch(opcode='IMAD', modifiers=('MOV', 'U32'), operands=('R4', 'R4', 'R5', 'R6'), predicate=None, additional=None), 'ISETP.NE.AND P0, PT, R1, RZ, PT': InstructionMatch(opcode='ISETP', modifiers=('NE', 'AND'), operands=('P0', 'PT', 'R1', 'RZ', 'PT'), predicate=None, additional=None), 'LDG.E R0, desc[UR12][R18.64]': InstructionMatch(opcode='LDG', modifiers=('E',), operands=('R0', 'desc[UR12][R18.64]'), predicate=None, additional=None), 'LDG.E.SYS R4, [R2]': InstructionMatch(opcode='LDG', modifiers=('E', 'SYS'), operands=('R4', '[R2]'), predicate=None, additional=None), 'LDGSTS.E.LTC128B [R211+-0x4000], desc[UR24][R182.64+0x20]': InstructionMatch(opcode='LDGSTS', modifiers=('E', 'LTC128B'), operands=('[R211+-0x4000]', 'desc[UR24][R182.64+0x20]'), predicate=None, additional=None), 'LDS.64 R12, [UR7+0x8]': InstructionMatch(opcode='LDS', modifiers=('64',), operands=('R12', '[UR7+0x8]'), predicate=None, additional=None), 'LEA.HI.X R145, R140, R134.reuse, R135, 0x2, P3': InstructionMatch(opcode='LEA', modifiers=('HI', 'X'), operands=('R145', 'R140', 'R134.reuse', 'R135', '0x2', 'P3'), predicate=None, additional=None), 'LOP3.LUT P0, RZ, R8, R94, RZ, 0xfc, !PT': InstructionMatch(opcode='LOP3', modifiers=('LUT',), operands=('P0', 'RZ', 'R8', 'R94', 'RZ', '0xfc', '!PT'), predicate=None, additional=None), 'MEMBAR.SC.GPU': InstructionMatch(opcode='MEMBAR', modifiers=('SC', 'GPU'), operands=(), predicate=None, additional=None), 'MOV R8, c[0x0][0x140]': InstructionMatch(opcode='MOV', modifiers=(), operands=('R8', 'c[0x0][0x140]'), predicate=None, additional=None), 'NOP': InstructionMatch(opcode='NOP', modifiers=(), operands=(), predicate=None, additional=None), 'RET.REL.NODEC R4 0x0': InstructionMatch(opcode='RET', modifiers=('REL', 'NODEC'), operands=('R4', '0x0'), predicate=None, additional=None), 'S2R R0, SR_CTAID.X': InstructionMatch(opcode='S2R', modifiers=(), operands=('R0', 'SR_CTAID.X'), predicate=None, additional=None), 'STG.E [R2], R4': InstructionMatch(opcode='STG', modifiers=('E',), operands=('[R2]', 'R4'), predicate=None, additional=None), 'STG.E.64.SYS [R2.64+UR4], R4': InstructionMatch(opcode='STG', modifiers=('E', '64', 'SYS'), operands=('[R2.64+UR4]', 'R4'), predicate=None, additional=None), 'STL [R1], R2': InstructionMatch(opcode='STL', modifiers=(), operands=('[R1]', 'R2'), predicate=None, additional=None), 'STS.64 [R9.X8], R2': InstructionMatch(opcode='STS', modifiers=('64',), operands=('[R9.X8]', 'R2'), predicate=None, additional=None), 'UIADD3 UR5, UPT, UPT, -UR4, UR9, URZ': InstructionMatch(opcode='UIADD3', modifiers=(), operands=('UR5', 'UPT', 'UPT', '-UR4', 'UR9', 'URZ'), predicate=None, additional=None)}

Zoo of real SASS instructions.

MATCHER: Final[AnyMatcher] = AnyMatcher(pattern=regex.Regex('(?:(?P<predicate>@!?U?P(?:T|[0-9]+)))?\\s*(?P<opcode>[A-Z0-9]+)(?:\\.(?P<modifiers>[A-Z0-9_]+))*\\s*(?:(?P<operands>[\\w!\\.\\[\\]\\+\\-\\|~]+)(?:,?\\s*(?P<operands>[\\w!\\.\\[\\]\\+\\-\\|~]+))*)?', flags=regex.V0))
test(instruction: str, expected: InstructionMatch) NoneView on GitHub
test_no_match() NoneView on GitHub
test_pattern() NoneView on GitHub
class tests.test.sass.test_atomic.TestAtomicMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.instruction.atomic.AtomicMatcher.

CODE_ADD_BASED_ON_CAS = '__global__ void cas({type}* __restrict__ const dst, const {type}* __restrict__ const src)\n{{\n    static_assert(sizeof({type}) == {size});\n\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n\n    {integer}* const dest = reinterpret_cast<{integer}*>(dst + index);\n\n    {integer} old = *dest;\n    {integer} assumed;\n\n    do {{\n        assumed = old;\n        {type} new_val = assumed + src[index];\n        old = atomicCAS(\n            dest,\n            assumed,\n            reinterpret_cast<{integer}&>(new_val)\n        );\n    }} while (old != assumed);\n}}\n'
CODE_ADD_BASED_ON_CAS_128 = 'struct alignas(2 * sizeof(double)) My128Struct\n{\n    double x, y;\n\n    __host__ __device__ friend My128Struct operator+(const My128Struct& a, const My128Struct& b)\n    {\n        return My128Struct{\n            .x = a.x + b.x,\n            .y = a.y + b.y\n        };\n    }\n\n    auto operator<=>(const My128Struct&) const = default;\n};\n\n__global__ void cas(My128Struct* __restrict__ const dst, const My128Struct* __restrict__ const src)\n{\n    static_assert(sizeof (My128Struct) == 16);\n    static_assert(alignof(My128Struct) == 16);\n\n    static_assert(std::is_trivially_copyable_v<My128Struct>);\n\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n\n    auto* const dest = dst + index;\n\n    My128Struct old = *dest;\n    My128Struct assumed;\n\n    do {\n        assumed = old;\n        My128Struct new_val = assumed + src[index];\n        old = atomicCAS(\n            dest,\n            assumed,\n            new_val\n        );\n    } while (old != assumed);\n}\n'
CODE_ADD_RELAXED_BLOCK = '#include "cuda/atomic"\n\n__global__ void add({type}* __restrict__ const dst, const {type}* __restrict__ const src)\n{{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n\n    cuda::atomic_ref<{type}, cuda::thread_scope_block> ref(dst[index]);\n    ref.fetch_add(src[index], cuda::memory_order_relaxed);\n}}\n'
CODE_COMPARE_EXCHANGE = '#include "cuda/atomic"\n\n__global__ void compare_exchange({type}* __restrict__ dst, const {type}* __restrict__ src)\n{{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    cuda::atomic_ref<{type}, cuda::thread_scope_{scope}> ref(dst[index]);\n\n    {type} old_val = ref.load(cuda::memory_order_relaxed);\n    {type} new_val;\n\n    do {{\n        new_val = min(old_val, src[index]);\n    }} while (!ref.compare_exchange_{consistency}(old_val, new_val,\n                                       cuda::memory_order_acquire,\n                                       cuda::memory_order_acquire));\n}}\n'
CODE_EXCH = '__global__ void exch({type}* __restrict__ const dst, const {type}* __restrict__ const src)\n{{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    atomicExch(&dst[index], src[index]);\n}}\n'
CODE_EXCH_DEVICE_PTR: Final[str] = '__device__ __constant__ int32_t* ptr;\n\n__global__ void atomic_exch_kernel() {\n    atomicExch(ptr, 0);\n}\n'
CODE_MIN = '#include "cuda/atomic"\n\n__global__ void add({type}* __restrict__ const dst, const {type}* __restrict__ const src)\n{{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    cuda::atomic_ref<{type}, cuda::thread_scope_{scope}> ref(dst[index]);\n    ref.fetch_min(src[index], cuda::memory_order_relaxed);\n}}\n'
static assert_atomicCAS_ptx(*, output: Path, matcher: AtomicMatcher, cuda_compiler: CMakeToolchainCompiler) NoneView on GitHub

Check for the expected PTX code in output given the reprospect.test.sass.instruction.ThreadScope.

According to:

it should generate reprospect.test.sass.instruction.ThreadScope.DEVICE instructions. However, as noted in:

it may unexpectedly generate reprospect.test.sass.instruction.ThreadScope.SYSTEM instructions.

Note

According to https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces-state-spaces-tab, .global is global memory shared by all threads.

Note

According to https://docs.nvidia.com/cuda/parallel-thread-execution/#id682, .sys is the set of all threads in the current program, including all kernel grids invoked by the host program on all compute devices, and all threads constituting the host program itself.

static get_atomicCAS_thread_scope(*, size: int, arch: NVIDIAArch, cuda_compiler: CMakeToolchainCompiler) ThreadScopeView on GitHub

Get the expected reprospect.test.sass.instruction.ThreadScope for atomicCAS.

static match_one(*, decoder, **kwargs) tuple[AtomicMatcher, Instruction, InstructionMatch]View on GitHub

Match exactly one instruction.

pytestmark = [Mark(name='parametrize', args=('parameters', (Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.VOLTA: 'VOLTA'>, compute_capability=ComputeCapability(major=7, minor=0))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.TURING: 'TURING'>, compute_capability=ComputeCapability(major=7, minor=5))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.AMPERE: 'AMPERE'>, compute_capability=ComputeCapability(major=8, minor=0))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.AMPERE: 'AMPERE'>, compute_capability=ComputeCapability(major=8, minor=6))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.ADA: 'ADA'>, compute_capability=ComputeCapability(major=8, minor=9))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.HOPPER: 'HOPPER'>, compute_capability=ComputeCapability(major=9, minor=0))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.BLACKWELL: 'BLACKWELL'>, compute_capability=ComputeCapability(major=10, minor=0))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.BLACKWELL: 'BLACKWELL'>, compute_capability=ComputeCapability(major=12, minor=0))))), kwargs={'ids': <class 'str'>})]
test_add_relaxed_block_double(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Similar to test_add_relaxed_block_int() for double.

test_add_relaxed_block_float(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Similar to test_add_relaxed_block_int() for float.

test_add_relaxed_block_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

As of CUDA 13.0.0, the generated code still applies the .STRONG modifier, regardless of the .relaxed qualifier shown in the PTX.

test_add_relaxed_block_unsigned_long_long_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Similar to test_add_relaxed_block_int() for unsigned long long int.

test_atomicCAS(request, workdir, word, parameters: Parameters, cmake_file_api: FileAPI, cmake_cuda_compiler: CMakeToolchainCompiler)View on GitHub

Test with CODE_ADD_BASED_ON_CAS.

test_atomicCAS_128(request, workdir, parameters: Parameters, cmake_file_api: FileAPI, cmake_cuda_compiler: CMakeToolchainCompiler)View on GitHub

Supported from compute capability 9.x.

test_compare_exchange_system(request, consistency: str, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_COMPARE_EXCHANGE for unsigned long long int, reprospect.test.sass.instruction.ThreadScope.SYSTEM scope.

Note

Both weak and strong lead to a STRONG consistency.

test_exch_device_ptr(request, workdir: Path, parameters: Parameters, cmake_file_api: FileAPI, cmake_cuda_compiler: CMakeToolchainCompiler) NoneView on GitHub

This test demonstrates that while nvcc emits an ATOMG instruction for an atomic exchange using a device pointer marked with __constant__, clang (as of 21.1.5) is not able to infer that the referenced memory resides in global memory and therefore falls back emitting a generic ATOM instruction.

nvcc appears to generate better code in this case: because the device pointer is declared as __constant__, the compiler can reasonably assume that it cannot point to local or shared memory, and thus must refer to global memory. This allows nvcc to use the more specific global-memory atomic instruction.

test_exch_strong_device_float(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_EXCH for float.

test_exch_strong_device_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_EXCH for int.

test_exch_strong_device_unsigned_long_long_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_EXCH for unsigned long long int.

test_min_relaxed_device_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_MIN for int and reprospect.test.sass.instruction.ThreadScope.DEVICE scope.

test_min_relaxed_device_long_long_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_MIN for long long int and reprospect.test.sass.instruction.ThreadScope.DEVICE scope.

test_min_relaxed_device_unsigned_long_long_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_MIN for unsigned long long int and reprospect.test.sass.instruction.ThreadScope.DEVICE scope.

test_min_relaxed_system_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_MIN for int and reprospect.test.sass.instruction.ThreadScope.SYSTEM scope.

class tests.test.sass.test_branch.TestBranchMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.instruction.branch.BranchMatcher.

INSTRUCTIONS: Final[dict[str, InstructionMatch]] = {'@!UP0 BRA 0xc': InstructionMatch(opcode='BRA', modifiers=(), operands=('0xc',), predicate='@!UP0', additional=None), '@PT BRA 0xf0': InstructionMatch(opcode='BRA', modifiers=(), operands=('0xf0',), predicate='@PT', additional=None), '@UPT BRA 0x170': InstructionMatch(opcode='BRA', modifiers=(), operands=('0x170',), predicate='@UPT', additional=None), 'BRA 0x240': InstructionMatch(opcode='BRA', modifiers=(), operands=('0x240',), predicate=None, additional=None)}

Zoo of real BRA instructions.

MATCHER: Final[BranchMatcher] = BranchMatcher(pattern=regex.Regex('(?:(?P<predicate>@!?U?P(?:T|[0-9]+)))?\\s*(?P<opcode>BRA)\\s*(?P<operands>0x[0-9A-Fa-f]+)', flags=regex.V0))
test(instruction: str, expected: InstructionMatch) NoneView on GitHub
test_no_match() NoneView on GitHub
test_pattern() NoneView on GitHub
test_with_predicate() NoneView on GitHub

Verify that the matcher only matches instructions containing the exact predicate. Instructions without the predicate or with a different predicate should not match.

class tests.test.sass.test_composite.TestAnyOfView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.any_of().

test() NoneView on GitHub
class tests.test.sass.test_composite.TestFindallView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.findall().

test() NoneView on GitHub
class tests.test.sass.test_composite.TestFinduniqueView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.findunique().

test() NoneView on GitHub
class tests.test.sass.test_composite.TestInstructionCountIsView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.instruction_count_is().

test() NoneView on GitHub
class tests.test.sass.test_composite.TestInstructionIsView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.instruction_is().

test() NoneView on GitHub
test_one_or_more_times() NoneView on GitHub
test_times_1() NoneView on GitHub
test_times_2() NoneView on GitHub
test_with_modifier() NoneView on GitHub

Test reprospect.test.sass.composite.Fluentizer.with_modifier().

test_with_operand() NoneView on GitHub

Test reprospect.test.sass.composite.Fluentizer.with_operand().

test_with_operand_composed() NoneView on GitHub

Similar to test_with_operands() but calls reprospect.test.sass.composite.Fluentizer.with_operand() many times.

test_with_operands() NoneView on GitHub

Test reprospect.test.sass.composite.Fluentizer.with_operands().

test_zero_or_more_time() NoneView on GitHub
class tests.test.sass.test_composite.TestInstructionsAreView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.instructions_are().

test() NoneView on GitHub
test_mix() NoneView on GitHub
class tests.test.sass.test_composite.TestInstructionsContainView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.instructions_contain().

test() NoneView on GitHub
class tests.test.sass.test_composite.TestInterleavedInstructionsAreView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.interleaved_instructions_are().

test() NoneView on GitHub
class tests.test.sass.test_composite.TestUnorderedInstructionsAreView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.unordered_instructions_are().

test() NoneView on GitHub
class tests.test.sass.test_composite.TestUnorderedInterleavedInstructionsAreView on GitHub

Bases: object

Tests for reprospect.test.sass.composite.unordered_interleaved_instructions_are().

test() NoneView on GitHub
tests.test.sass.test_composite_impl.DADD_DMUL = (Instruction(offset=0, instruction='DADD R4, R4, c[0x0][0x180]', hex='0x0', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})), Instruction(offset=0, instruction='DMUL R6, R6, c[0x0][0x188]', hex='0x1', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})))

One DADD followed by a DMUL.

tests.test.sass.test_composite_impl.DADD_NOP_DMUL = (Instruction(offset=0, instruction='DADD R4, R4, c[0x0][0x180]', hex='0x0', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})), Instruction(offset=0, instruction='NOP', hex='0x2', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})), Instruction(offset=0, instruction='NOP', hex='0x2', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})), Instruction(offset=0, instruction='NOP', hex='0x2', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})), Instruction(offset=0, instruction='DMUL R6, R6, c[0x0][0x188]', hex='0x1', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})))

One DADD instruction followed by a few NOP, and a DMUL.

tests.test.sass.test_composite_impl.NOP_DMUL_NOP_DADD = (Instruction(offset=0, instruction='NOP', hex='0x2', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})), Instruction(offset=0, instruction='DMUL R6, R6, c[0x0][0x188]', hex='0x1', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})), Instruction(offset=0, instruction='NOP', hex='0x2', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})), Instruction(offset=0, instruction='NOP', hex='0x2', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})), Instruction(offset=0, instruction='DADD R4, R4, c[0x0][0x180]', hex='0x0', control=ControlCode(stall_count=1, yield_flag=True, read=7, write=0, wait=[False, False, False, False, False, False], reuse={'A': False, 'B': False, 'C': False, 'D': False})))

NOP instructions with one DADD and one DMUL.

class tests.test.sass.test_composite_impl.TestAllInSequenceMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.AllInSequenceMatcher.

test_no_match() NoneView on GitHub
test_sequence() NoneView on GitHub

The inner matcher is a reprospect.test.sass.composite_impl.SequenceMatcher.

test_single() NoneView on GitHub

The inner matcher is a reprospect.test.sass.instruction.InstructionMatcher.

class tests.test.sass.test_composite_impl.TestAnyOfMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.AnyOfMatcher.

test_explain() NoneView on GitHub
test_matches() NoneView on GitHub
test_no_match() NoneView on GitHub
class tests.test.sass.test_composite_impl.TestCountInSequenceMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.CountInSequenceMatcher.

INNER: Final[OpcodeModsMatcher] = OpcodeModsMatcher(pattern=regex.Regex('(?P<opcode>YIELD)(?:\\.(?P<modifiers>[A-Z0-9_]+))*', flags=regex.V0))
test_explain() NoneView on GitHub
test_match() NoneView on GitHub
test_no_match() NoneView on GitHub
class tests.test.sass.test_composite_impl.TestInSequenceAtMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.InSequenceAtMatcher.

MATCHER: Final[InSequenceAtMatcher] = <reprospect.test.sass.composite_impl.InSequenceAtMatcher object>
test_explain() NoneView on GitHub
test_match_first_element() NoneView on GitHub

Matches the first element.

test_match_with_start() NoneView on GitHub

Matches the element pointed to by start.

test_no_match() NoneView on GitHub

Raises if the first element does not match.

class tests.test.sass.test_composite_impl.TestInSequenceMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.InSequenceMatcher.

test_explain() NoneView on GitHub
test_matches() NoneView on GitHub
test_no_match() NoneView on GitHub
class tests.test.sass.test_composite_impl.TestOneOrMoreInSequenceMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.OneOrMoreInSequenceMatcher.

test_explain() NoneView on GitHub
test_matches_more() NoneView on GitHub

Matches the sequence DADD_NOP_DMUL many times starting at 1.

test_matches_one() NoneView on GitHub

Matches the sequence DADD_NOP_DMUL.

test_matches_zero() NoneView on GitHub

Does not match sequence DADD_NOP_DMUL.

class tests.test.sass.test_composite_impl.TestOrderedInSequenceMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.OrderedInSequenceMatcher.

MATCHER: Final[OrderedInSequenceMatcher] = <reprospect.test.sass.composite_impl.OrderedInSequenceMatcher object>
test_explain() NoneView on GitHub
test_match() NoneView on GitHub

Matches the sequence DADD_DMUL.

test_match_with_nop() NoneView on GitHub

Matches the sequence DADD_NOP_DMUL.

test_no_match() NoneView on GitHub

Does not match reversed sequence DADD_DMUL.

class tests.test.sass.test_composite_impl.TestOrderedInterleavedInSequenceMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.OrderedInterleavedInSequenceMatcher.

INSTRUCTIONS_DADD: Final[tuple[str, ...]] = ('LDG.E.ENL2.256 R8, R4, desc[UR6][R2.64]', 'DADD R4, R4, UR12', 'NOP', 'NOP', 'NOP', 'NOP', 'DADD R6, R6, UR14', 'NOP', 'NOP', 'NOP', 'NOP', 'DADD R8, R8, UR16', 'NOP', 'NOP', 'NOP', 'NOP', 'DADD R10, R10, UR18', 'STG.E.ENL2.256 desc[UR6][R2.64], R4, R8')
INSTRUCTIONS_LDG: Final[tuple[str, ...]] = ('LDG.E.U16.SYS R2, [R2]', 'LDG.E.U16.SYS R4, [R4]')
MATCHERS_DADD: Final[tuple[OpcodeModsWithOperandsMatcher, ...]] = (OpcodeModsWithOperandsMatcher(pattern=regex.Regex('(?P<opcode>DADD)\\s*(?P<operands>R4),?\\s*(?P<operands>R4),?\\s*(?P<operands>UR[0-9]+)', flags=regex.V0)), OpcodeModsWithOperandsMatcher(pattern=regex.Regex('(?P<opcode>DADD)\\s*(?P<operands>R6),?\\s*(?P<operands>R6),?\\s*(?P<operands>UR[0-9]+)', flags=regex.V0)), OpcodeModsWithOperandsMatcher(pattern=regex.Regex('(?P<opcode>DADD)\\s*(?P<operands>R8),?\\s*(?P<operands>R8),?\\s*(?P<operands>UR[0-9]+)', flags=regex.V0)), OpcodeModsWithOperandsMatcher(pattern=regex.Regex('(?P<opcode>DADD)\\s*(?P<operands>R10),?\\s*(?P<operands>R10),?\\s*(?P<operands>UR[0-9]+)', flags=regex.V0)))
MATCHERS_LDG: Final[tuple[LoadGlobalMatcher, ...]] = (LoadGlobalMatcher(pattern=regex.Regex('(?P<opcode>LDG)\\.(?P<modifiers>E)\\.(?P<modifiers>U16)\\.(?P<modifiers>SYS) (?P<operands>R[0-9]+), (?P<address>(?P<operands>\\[R[0-9]+(?:\\+-?0x[0-9A-Fa-f]+)?\\]))', flags=regex.V0)), LoadGlobalMatcher(pattern=regex.Regex('(?P<opcode>LDG)\\.(?P<modifiers>E)\\.(?P<modifiers>U16)\\.(?P<modifiers>SYS) (?P<operands>R[0-9]+), (?P<address>(?P<operands>\\[R[0-9]+(?:\\+-?0x[0-9A-Fa-f]+)?\\]))', flags=regex.V0)))
test_dadd() NoneView on GitHub
test_ldg() NoneView on GitHub
test_no_match() NoneView on GitHub
class tests.test.sass.test_composite_impl.TestUnorderedInSequenceMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.UnorderedInSequenceMatcher.

test_explain() NoneView on GitHub
test_match_with_nop() NoneView on GitHub

Matches the sequence DADD_NOP_DMUL.

test_no_match() NoneView on GitHub

All permutations fail on sequence NOP_DMUL_NOP_DADD.

test_with_split_nop() NoneView on GitHub

Matches the sequence NOP_DMUL_NOP_DADD.

test_without_nop() NoneView on GitHub

Matches the sequence DADD_DMUL.

class tests.test.sass.test_composite_impl.TestUnorderedInterleavedInSequenceMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.UnorderedInterleavedInSequenceMatcher.

test_dadd() NoneView on GitHub
class tests.test.sass.test_composite_impl.TestZeroOrMoreInSequenceMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.composite_impl.ZeroOrMoreInSequenceMatcher.

test_assert_matches_always_true() NoneView on GitHub

Matching zero time is fine.

test_explain() NoneView on GitHub
test_matches_with_start() NoneView on GitHub

Matches many times, with a start and sequence DADD_NOP_DMUL.

test_matches_zero() NoneView on GitHub

Matches zero time with sequence DADD_NOP_DMUL.

tests.test.sass.test_instruction.CODE_ELEMENTWISE_ADD_RESTRICT = '__global__ void elementwise_add_restrict(int* __restrict__ const dst, const int* __restrict__ const src) {\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    dst[index] += src[index];\n}\n'

Element-wise add with 32-bit int.

tests.test.sass.test_instruction.CODE_ELEMENTWISE_ADD_RESTRICT_128_WIDE = '__global__ void elementwise_add_restrict_128_wide(float4* __restrict__ const dst, const float4* __restrict__ const src) {\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    const float4& a = src[index];\n    const float4& b = dst[index];\n    dst[index] = make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);\n}\n'

Element-wise add with 128-bit float4.

tests.test.sass.test_instruction.CODE_ELEMENTWISE_ADD_RESTRICT_256_WIDE = 'struct alignas(4 * sizeof(double)) Tester {\n    double x, y, z, w;\n};\n\n__global__ void elementwise_add_restrict_256_wide(Tester* __restrict__ const dst, const Tester* __restrict__ const src)\n{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    const Tester& a = src[index];\n    const Tester& b = dst[index];\n    dst[index] = Tester{.x = a.x + b.x, .y = a.y + b.y, .z = a.z + b.z, .w = a.w + b.w};\n}\n'

Element-wise add with 256-bit aligned elements.

class tests.test.sass.test_instruction.TestOpcodeModsMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.instruction.OpcodeModsMatcher.

test_with_descr()View on GitHub
test_with_minus_sign()View on GitHub
test_with_reuse()View on GitHub
test_with_square_brackets()View on GitHub
class tests.test.sass.test_instruction.TestOpcodeModsWithOperandsMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.instruction.OpcodeModsWithOperandsMatcher.

test()View on GitHub
class tests.test.sass.test_instruction.TestReductionMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.instruction.atomic.ReductionMatcher.

CODE_ADD = '__global__ void add({type}* __restrict__ const dst, const {type}* __restrict__ const src)\n{{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    atomicAdd(&dst[index], src[index]);\n}}\n'
CODE_MAX = '__global__ void max({type}* __restrict__ const dst, const {type}* __restrict__ const src)\n{{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    atomicMax(&dst[index], src[index]);\n}}\n'
CODE_SUB = '__global__ void sub(int* __restrict__ const dst, const int* __restrict__ const src)\n{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    atomicSub(&dst[index], src[index]);\n}\n'
pytestmark = [Mark(name='parametrize', args=('parameters', (Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.VOLTA: 'VOLTA'>, compute_capability=ComputeCapability(major=7, minor=0))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.TURING: 'TURING'>, compute_capability=ComputeCapability(major=7, minor=5))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.AMPERE: 'AMPERE'>, compute_capability=ComputeCapability(major=8, minor=0))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.AMPERE: 'AMPERE'>, compute_capability=ComputeCapability(major=8, minor=6))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.ADA: 'ADA'>, compute_capability=ComputeCapability(major=8, minor=9))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.HOPPER: 'HOPPER'>, compute_capability=ComputeCapability(major=9, minor=0))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.BLACKWELL: 'BLACKWELL'>, compute_capability=ComputeCapability(major=10, minor=0))), Parameters(arch=NVIDIAArch(family=<NVIDIAFamily.BLACKWELL: 'BLACKWELL'>, compute_capability=ComputeCapability(major=12, minor=0))))), kwargs={'ids': <class 'str'>})]
test_add_strong_device_double(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_ADD for double.

test_add_strong_device_float(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_ADD for float.

test_add_strong_device_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_ADD for int.

test_add_strong_device_unsigned_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_ADD for unsigned int.

test_add_strong_device_unsigned_long_long_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_ADD for unsigned long long int.

test_max_strong_device_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_MAX for int. The modifier is MAX.S32.

test_max_strong_device_long_long_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_MAX for long long int. The modifier is MAX.S64.

test_max_strong_device_unsigned_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_MAX for unsigned int. The modifier is MAX.

test_max_strong_device_unsigned_long_long_int(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_MAX for unsigned long long int. The modifier is MAX.64.

test_sub_strong_device(request, workdir, parameters: Parameters, cmake_file_api: FileAPI)View on GitHub

Test with CODE_SUB.

tests.test.sass.test_instruction.get_decoder(*, cwd: Path, arch: NVIDIAArch, file: Path, cmake_file_api: FileAPI, **kwargs) tuple[Decoder, Path]View on GitHub

Compile the code in file for arch and return a reprospect.tools.sass.Decoder.

class tests.test.sass.test_load.TestLoadConstantMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.instruction.load.LoadConstantMatcher.

CODE_CONSTANT_ARRAY: Final[str] = '__constant__ {type} data[128];\n__global__ __launch_bounds__(128, 1) void ldc({type}* __restrict__ const out)\n{{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    out[index] = data[index];\n}}\n'
INSTRUCTIONS: Final[dict[str, tuple[LoadConstantMatcher, InstructionMatch]]] = {'LDC R1, c[0x0][0x37c]': (LoadConstantMatcher(pattern=regex.Regex('(?P<opcode>(?:LDC|LDCU)) (?P<operands>(?:R[0-9]+|UR[0-9]+)), (?P<operands>(?:(?:!|\\-\\||\\-|\\~|\\|))?c\\[(?P<bank>0x[0-9]+)\\]\\[(?P<offset>(?:0x[0-9A-Fa-f]+|R(?:Z|\\d+)|UR[0-9]+))\\])', flags=regex.V0)), InstructionMatch(opcode='LDC', modifiers=(), operands=('R1', 'c[0x0][0x37c]'), predicate=None, additional={'bank': ['0x0'], 'offset': ['0x37c']})), 'LDC.64 R6, c[0x1][0x398]': (LoadConstantMatcher(pattern=regex.Regex('(?P<opcode>(?:LDC|LDCU))\\.(?P<modifiers>64) (?P<operands>(?:R[0-9]+|UR[0-9]+)), (?P<operands>(?:(?:!|\\-\\||\\-|\\~|\\|))?c\\[(?P<bank>0x[0-9]+)\\]\\[(?P<offset>(?:0x[0-9A-Fa-f]+|R(?:Z|\\d+)|UR[0-9]+))\\])', flags=regex.V0)), InstructionMatch(opcode='LDC', modifiers=('64',), operands=('R6', 'c[0x1][0x398]'), predicate=None, additional={'bank': ['0x1'], 'offset': ['0x398']})), 'LDCU UR4, c[0x2][0x364]': (LoadConstantMatcher(pattern=regex.Regex('(?P<opcode>LDCU) (?P<operands>UR[0-9]+), (?P<operands>(?:(?:!|\\-\\||\\-|\\~|\\|))?c\\[(?P<bank>0x[0-9]+)\\]\\[(?P<offset>(?:0x[0-9A-Fa-f]+|R(?:Z|\\d+)|UR[0-9]+))\\])', flags=regex.V0)), InstructionMatch(opcode='LDCU', modifiers=(), operands=('UR4', 'c[0x2][0x364]'), predicate=None, additional={'bank': ['0x2'], 'offset': ['0x364']})), 'LDCU UR4, c[0x3][UR0]': (LoadConstantMatcher(pattern=regex.Regex('(?P<opcode>LDCU) (?P<operands>UR[0-9]+), (?P<operands>(?:(?:!|\\-\\||\\-|\\~|\\|))?c\\[(?P<bank>0x[0-9]+)\\]\\[(?P<offset>(?:0x[0-9A-Fa-f]+|R(?:Z|\\d+)|UR[0-9]+))\\])', flags=regex.V0)), InstructionMatch(opcode='LDCU', modifiers=(), operands=('UR4', 'c[0x3][UR0]'), predicate=None, additional={'bank': ['0x3'], 'offset': ['UR0']})), 'LDCU.64 UR6, c[0x3][0x358]': (LoadConstantMatcher(pattern=regex.Regex('(?P<opcode>LDCU)\\.(?P<modifiers>64) (?P<operands>UR[0-9]+), (?P<operands>(?:(?:!|\\-\\||\\-|\\~|\\|))?c\\[(?P<bank>0x[0-9]+)\\]\\[(?P<offset>(?:0x[0-9A-Fa-f]+|R(?:Z|\\d+)|UR[0-9]+))\\])', flags=regex.V0)), InstructionMatch(opcode='LDCU', modifiers=('64',), operands=('UR6', 'c[0x3][0x358]'), predicate=None, additional={'bank': ['0x3'], 'offset': ['0x358']}))}

Zoo of real SASS instructions.

test(instruction: str, matcher: LoadConstantMatcher, expected: InstructionMatch) NoneView on GitHub
test_array_of_64bit_elements(request, workdir: Path, parameters: Parameters, cmake_file_api: FileAPI) NoneView on GitHub

Loads of size 64 with CODE_CONSTANT_ARRAY.

class tests.test.sass.test_load.TestLoadMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.instruction.load.LoadMatcher and reprospect.test.sass.instruction.load.LoadGlobalMatcher.

CODE_ELEMENTWISE_ADD = '__global__ void elementwise_add(int* const dst, const int* const src) {\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    dst[index] += src[index];\n}\n'
CODE_ELEMENTWISE_ADD_LDG = '__global__ void elementwise_add_ldg(int* const dst, const int* const src) {\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    dst[index] += __ldg(&src[index]);\n}\n'
CODE_EXTEND = '#include <cstdint>\n\n__global__ void extend({dst}* {restrict} const dst, {src}* {restrict} const src, const unsigned int size)\n{{\n    const auto index = blockIdx.x * blockDim.x + threadIdx.x;\n    if (index < size) dst[index] = src[index];\n}}\n'
test() NoneView on GitHub
test_constant(request, workdir, parameters: Parameters, cmake_file_api: FileAPI) NoneView on GitHub

If src is declared const __restrict__, the compiler is able to use the .CONSTANT modifier. Otherwise, we need to explicitly use __ldg to end up using .CONSTANT.

test_elementwise_add_restrict(request, workdir, parameters: Parameters, cmake_file_api: FileAPI) NoneView on GitHub

Test loads with tests.test.sass.test_instruction.CODE_ELEMENTWISE_ADD_RESTRICT.

test_elementwise_add_restrict_128_wide(request, workdir, parameters: Parameters, cmake_file_api: FileAPI) NoneView on GitHub

Test 128-bit wide loads with tests.test.sass.test_instruction.CODE_ELEMENTWISE_ADD_RESTRICT_128_WIDE.

test_elementwise_add_restrict_256_wide(request, workdir, parameters: Parameters, cmake_file_api: FileAPI) NoneView on GitHub

Test 256-bit wide loads with tests.test.sass.test_instruction.CODE_ELEMENTWISE_ADD_RESTRICT_256_WIDE.

test_sign_extend_s16(request, workdir: Path, parameters: Parameters, cmake_file_api: FileAPI, cmake_cuda_compiler: CMakeToolchainCompiler) NoneView on GitHub

Check when CODE_EXTEND leads to sign extension.

Uses reprospect.test.features.Memory.sign_extension().

test_zero_extend_u16(request, workdir: Path, parameters: Parameters, cmake_file_api: FileAPI) NoneView on GitHub

Use CODE_EXTEND to enforce zero extension.

class tests.test.sass.test_store.TestStoreMatcherView on GitHub

Bases: object

Tests for reprospect.test.sass.instruction.store.StoreMatcher and reprospect.test.sass.instruction.store.StoreGlobalMatcher.

test() NoneView on GitHub
test_elementwise_add_restrict(request, workdir, parameters: Parameters, cmake_file_api: FileAPI) NoneView on GitHub

Test store with tests.test.sass.test_instruction.CODE_ELEMENTWISE_ADD_RESTRICT.

test_elementwise_add_restrict_128_wide(request, workdir, parameters: Parameters, cmake_file_api: FileAPI) NoneView on GitHub

Test 128-bit wide store with tests.test.sass.test_instruction.CODE_ELEMENTWISE_ADD_RESTRICT_128_WIDE.

test_elementwise_add_restrict_256_wide(request, workdir, parameters: Parameters, cmake_file_api: FileAPI) NoneView on GitHub

Test 256-bit wide store with tests.test.sass.test_instruction.CODE_ELEMENTWISE_ADD_RESTRICT_256_WIDE.