name: Unit Tests
env:
  # increment this when downloads substantially change to avoid the internet
  DOWNLOAD_CACHE_VERSION: '12'
  PYTHON_CACHE_VERSION: '3'
  APT_CACHE_VERSION: '1'
  BUILD_CACHE_VERSION: '1'
  CAPTURE_PROCESS_REPLAY: 1
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

on:
  push:
    branches:
      - master
  pull_request:
  workflow_dispatch:

jobs:
  llvmspeed:
    name: LLVM Speed
    runs-on: ubuntu-24.04
    timeout-minutes: 20
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: llvm-speed
        deps: testing_minimal
        llvm: 'true'
    - name: External Benchmark Schedule
      run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
    - name: Speed Test
      run: LLVM=1 python3 test/speed/external_test_speed_v_torch.py
    - name: Speed Test (BEAM=2)
      run: BEAM=2 LLVM=1 python3 test/speed/external_test_speed_v_torch.py

  docs:
    name: Docs
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        deps: docs
        pydeps: "capstone"
    - name: Build wheel and show size
      run: |
        pip install build
        python -m build --wheel --outdir dist
        ls -lh dist/*.whl
    - name: Use as an external package
      run: |
        mkdir $HOME/test_external_dir
        cd $HOME/test_external_dir
        python -m venv venv
        source venv/bin/activate
        pip install $GITHUB_WORKSPACE
        python -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"
        pip install mypy
        mypy -c "from tinygrad.tensor import Tensor; print(Tensor([1,2,3,4,5]))"
    - name: Run beautiful_mnist with tinygrad only
      run: |
        mkdir $GITHUB_WORKSPACE/test_dir
        cd $GITHUB_WORKSPACE/test_dir
        python -m venv venv
        source venv/bin/activate
        pip install $GITHUB_WORKSPACE
        cp $GITHUB_WORKSPACE/examples/beautiful_mnist.py .
        PYTHONPATH=$GITHUB_WORKSPACE BS=2 STEPS=10 python beautiful_mnist.py
    - name: Test Docs Build
      run: python -m mkdocs build --strict
    - name: Test Docs
      run: |
        python docs/abstractions2.py
        python docs/abstractions3.py
    - name: Test Quickstart
      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py &&  PYTHONPATH=. python quickstart.py
    - name: Test DEBUG
      run: DEBUG=100 python3 -c "from tinygrad import Tensor; N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N); c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2); print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
    - name: Compile EfficientNet to C and test it
      run: |
        CPU=1 PYTHONPATH="." python examples/compile_efficientnet.py > recognize.c
        clang -O2 recognize.c -lm -o recognize
        cat test/models/efficientnet/Chicken.jpg | ./recognize | grep cock

  autogen:
    name: Autogen
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        opencl: 'true'
        amd: 'true'
        cuda: 'true'
        webgpu: 'true'
        llvm: 'true'
    - name: Install autogen support packages
      run: sudo apt-get install -y --no-install-recommends llvm-14-dev libclang-14-dev
    - name: Verify OpenCL autogen
      run: |
        cp tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak
        ./autogen_stubs.sh opencl
        diff /tmp/opencl.py.bak tinygrad/runtime/autogen/opencl.py
    - name: Verify CUDA autogen
      run: |
        cp tinygrad/runtime/autogen/cuda.py /tmp/cuda.py.bak
        cp tinygrad/runtime/autogen/nv_gpu.py /tmp/nv_gpu.py.bak
        ./autogen_stubs.sh cuda
        ./autogen_stubs.sh nv
        diff /tmp/cuda.py.bak tinygrad/runtime/autogen/cuda.py
        diff /tmp/nv_gpu.py.bak tinygrad/runtime/autogen/nv_gpu.py
    - name: Verify AMD autogen
      run: |
        cp tinygrad/runtime/autogen/hsa.py /tmp/hsa.py.bak
        cp tinygrad/runtime/autogen/kfd.py /tmp/kfd.py.bak
        cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
        cp tinygrad/runtime/autogen/amd_gpu.py /tmp/amd_gpu.py.bak
        cp tinygrad/runtime/autogen/sqtt.py /tmp/sqtt.py.bak
        ./autogen_stubs.sh hsa
        ./autogen_stubs.sh kfd
        ./autogen_stubs.sh comgr
        ./autogen_stubs.sh amd
        ./autogen_stubs.sh sqtt
        diff /tmp/hsa.py.bak tinygrad/runtime/autogen/hsa.py
        diff /tmp/kfd.py.bak tinygrad/runtime/autogen/kfd.py
        diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
        diff /tmp/amd_gpu.py.bak tinygrad/runtime/autogen/amd_gpu.py
        diff /tmp/sqtt.py.bak tinygrad/runtime/autogen/sqtt.py
    - name: Verify Linux autogen
      run: |
        cp tinygrad/runtime/autogen/libc.py /tmp/libc.py.bak
        cp tinygrad/runtime/autogen/io_uring.py /tmp/io_uring.py.bak
        cp tinygrad/runtime/autogen/ib.py /tmp/ib.py.bak
        ./autogen_stubs.sh libc
        ./autogen_stubs.sh io_uring
        ./autogen_stubs.sh ib
        diff /tmp/libc.py.bak tinygrad/runtime/autogen/libc.py
        diff /tmp/io_uring.py.bak tinygrad/runtime/autogen/io_uring.py
        diff /tmp/ib.py.bak tinygrad/runtime/autogen/ib.py
    - name: Verify WebGPU autogen
      run: |
        cp tinygrad/runtime/autogen/webgpu.py /tmp/webgpu.py.bak
        ./autogen_stubs.sh webgpu
        diff /tmp/webgpu.py.bak tinygrad/runtime/autogen/webgpu.py
    - name: Verify LLVM autogen
      run: |
        cp tinygrad/runtime/autogen/llvm.py /tmp/llvm.py.bak
        ./autogen_stubs.sh llvm
        diff /tmp/llvm.py.bak tinygrad/runtime/autogen/llvm.py

  torchbackend:
    name: Torch Backend Tests
    runs-on: ubuntu-latest
    timeout-minutes: 15
    env:
      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: torch-backend-pillow-torchvision-et-pt
        deps: testing_minimal
        pydeps: "pillow torchvision expecttest"
        llvm: 'true'
    - name: Install ninja
      run: |
        sudo apt update || true
        sudo apt install -y --no-install-recommends ninja-build
    - name: Lint with ruff
      run: |
        pip3 install --upgrade --force-reinstall ruff==0.11.0
        python3 -m ruff check extra/torch_backend/backend.py
    - name: Test one op
      run: PYTHONPATH=. FORWARD_ONLY=1 TINY_BACKEND=1 python3 test/test_ops.py TestOps.test_add
    - name: Test ResNet-18
      run: PYTHONPATH=. DEBUG=2 python3 extra/torch_backend/example.py
    - name: My (custom) tests
      run: PYTHONPATH=. python3 extra/torch_backend/test.py
    - name: Test one op in torch tests
      run: PYTHONPATH=. DEBUG=2 python3 extra/torch_backend/torch_tests.py TestTinyBackendPRIVATEUSE1.test_unary_log_tiny_float32
    - name: Test Ops with TINY_BACKEND
      run: PYTHONPATH=. LLVM=1 LLVMOPT=0 TINY_BACKEND=1 python3 -m pytest -n auto test/test_ops.py --durations=20
    - name: Test in-place operations on views
      run: PYTHONPATH=. TORCH_DEBUG=1 python3 extra/torch_backend/test_inplace.py
    - name: Test multi-gpu
      run: PYTHONPATH=. LLVM=1 GPUS=4 TORCH_DEBUG=1 python3 extra/torch_backend/test_multigpu.py

  torchbackendmore:
    name: Torch Backend Tests More
    runs-on: ubuntu-latest
    timeout-minutes: 15
    env:
      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: torch-backend-pillow-torchvision-et-pt
        deps: testing_minimal
        llvm: 'true'
    - name: Install ninja
      run: |
        sudo apt update || true
        sudo apt install -y --no-install-recommends ninja-build
    - name: Test beautiful_mnist in torch with TINY_BACKEND
      run: SPLIT_REDUCEOP=0 FUSE_ARANGE=1 PYTHONPATH=. LLVM=1 TARGET_EVAL_ACC_PCT=96.0 TINY_BACKEND=1 python3 examples/other_mnist/beautiful_mnist_torch.py
    - name: Test some torch tests (expect failure)
      run: PYTHONPATH=. python3 -m pytest extra/torch_backend/torch_tests.py -v --tb=no || true

  tc:
    name: Tensor Core tests
    runs-on: ubuntu-latest
    timeout-minutes: 10
    env:
      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: uops-minimal
        deps: testing_minimal
    - name: Test IMAGE=2 support
      run: |
        IMAGE=2 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
        IMAGE=2 PYTHON=1 python3 test/test_ops.py TestOps.test_simple_conv2d
    - name: Test emulated METAL tensor cores
      run: |
        DEBUG=2 EMULATE_METAL=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_big_gemm
        PYTHONPATH=. DEBUG=2 EMULATE_METAL=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 EMULATE_METAL=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
    - name: Test emulated AMX tensor cores
      run: PYTHONPATH=. DEBUG=2 AMX=1 EMULATE_AMX=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
    - name: Test emulated AMD tensor cores
      run: |
        PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
        PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
        PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
        PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
        PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded_amd TestLinearizer.test_tensor_cores_padded_uops
    - name: Test emulated AMD MFMA tensor cores
      run: |
        PYTHONPATH=. DEBUG=2 EMULATE_AMD_MFMA=1 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
        PYTHONPATH=. DEBUG=2 EMULATE_AMD_MFMA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 EMULATE_AMD_MFMA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
    - name: Test emulated AMD RDNA4 tensor cores
      run: |
        PYTHONPATH=. DEBUG=2 EMULATE_AMD_RDNA4=1 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
        PYTHONPATH=. DEBUG=2 EMULATE_AMD_RDNA4=1 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=0 python3 ./extra/gemm/simple_matmul.py
        PYTHONPATH=. DEBUG=2 EMULATE_AMD_RDNA4=1 FORWARD_ONLY=1 PYTHON=1 N=16 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
        PYTHONPATH=. DEBUG=2 EMULATE_AMD_RDNA4=1 FORWARD_ONLY=1 PYTHON=1 N=64 HALF=1 ACC_HALF=1 ATOL=1e-3 python3 ./extra/gemm/simple_matmul.py
        PYTHONPATH=. DEBUG=2 EMULATE_AMD_RDNA4=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 EMULATE_AMD_RDNA4=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
    - name: Test emulated CUDA tensor cores
      run: |
        DEBUG=2 EMULATE_CUDA=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
        DEBUG=2 EMULATE_CUDA=1 ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
        DEBUG=2 EMULATE_CUDA_SM75=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm_fp16
        PYTHONPATH="." DEBUG=2 EMULATE_CUDA=1 ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH="." DEBUG=2 EMULATE_CUDA=1 ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 test/test_linearizer.py TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
    - name: Test emulated INTEL OpenCL tensor cores
      run: DEBUG=2 EMULATE_INTEL=1 FORWARD_ONLY=1 PYTHON=1 HALF=1 N=64 python3 ./extra/gemm/simple_matmul.py
    - name: Full test tensor cores
      run: |
        PYTHONPATH=. DEBUG=2 EMULATE_METAL=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 EMULATE_CUDA=1 ALLOW_TF32=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 EMULATE_INTEL=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
        PYTHONPATH=. DEBUG=2 AMX=1 EMULATE_AMX=1 FORWARD_ONLY=1 PYTHON=1 python3 ./test/test_linearizer.py TestLinearizer.test_tensor_cores
    - name: Test device flop counts
      run: |
        PYTHONPATH=. DEBUG=2 EMULATE_METAL=1 PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
        PYTHONPATH=. DEBUG=2 EMULATE_AMD=1 PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
        PYTHONPATH=. DEBUG=2 EMULATE_CUDA=1 PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
        PYTHONPATH=. DEBUG=2 EMULATE_INTEL=1 PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
        PYTHONPATH=. DEBUG=2 AMX=1 EMULATE_AMX=1 PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStats.test_simple_matmul

  bepython:
    name: Python Backend
    runs-on: ubuntu-latest
    timeout-minutes: 10
    env:
      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: be-minimal
        deps: testing_minimal
    - name: Test dtype with Python emulator
      run: DEBUG=1 PYTHONPATH=. PYTHON=1 python3 -m pytest -n=auto test/test_dtype.py test/test_dtype_alu.py
    - name: Test ops with Python emulator
      run: DEBUG=2 PYTHON=1 python3 -m pytest -n=auto test/test_ops.py -k "not (test_split or test_simple_cumsum or test_cumsum or test_einsum or test_dot or test_dot_1d or test_big_gemm or test_broadcastdot or test_multidot or test_var_axis or test_std_axis or test_broadcast_full or test_broadcast_partial or test_simple_conv3d or test_dilated_conv_transpose2d or test_simple_conv_transpose3d or test_large_input_conv2d or test_max_pool2d or test_max_pool2d_simple or test_max_pool2d_bigger_stride or test_avg_pool2d or test_cat or test_scaled_product_attention or test_scaled_product_attention_causal or test_slice_fancy_indexing_dim_inject_none or test_slice_fancy_indexing_list_indices or test_slice_fancy_indexing_no_dim_collapse or test_slice_fancy_indexing_tuple_indices or test_slice_fancy_indexing_list_with_tensors or test_slice_fancy_indexing_dim_collapse_int or test_interpolate_bilinear or test_interpolate_bilinear_corners_aligned or test_scaled_dot_product_attention or test_cummax or test_simple_cummax or test_logcumsumexp or test_sort or test_cumprod)" --durations=20
    - name: Test uops with Python emulator
      run: PYTHON=1 python3 -m pytest test/test_uops.py --durations=20
    - name: Test symbolic with Python emulator
      run: PYTHONPATH=. PYTHON=1 python3 test/test_symbolic_ops.py
    - name: test_renderer_failures with Python emulator
      run: PYTHONPATH=. PYTHON=1 python3 -m pytest -rA test/test_renderer_failures.py::TestRendererFailures

  linter:
    name: Linters
    runs-on: ubuntu-latest
    timeout-minutes: 10

    # TODO: run the pre-commit hook to replace a lot of this
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: linting-only
        python-version: '3.10'
        deps: linting
    - name: Lint bad-indentation and trailing-whitespace with pylint
      run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string='  ' --recursive=y .
    - name: Lint with ruff
      run: |
        pip3 install --upgrade --force-reinstall ruff==0.11.0
        python3 -m ruff check .
        python3 -m ruff check examples/mlperf/ --ignore E501
    - name: Lint tinygrad with pylint
      run: python -m pylint tinygrad/
    - name: Run mypy
      run: |
        python -m mypy --strict-equality --lineprecision-report .
        cat lineprecision.txt
    - name: Run TYPED=1
      run: TYPED=1 python -c "import tinygrad"

  unittest:
    name: Unit Tests
    runs-on: ubuntu-latest
    timeout-minutes: 15

    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: unittest-12
        pydeps: "pillow"
        deps: testing_unit
    - name: Test README
      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py &&  PYTHONPATH=. python README.py
    - name: Run unit tests
      run: PYTHONPATH="." python -m pytest -n=auto test/unit/ --durations=20
    - name: Run targetted tests on NULL backend
      run: PYTHONPATH="." NULL=1 python3 test/test_multitensor.py TestMultiTensor.test_data_parallel_resnet_train_step
    - name: Run SDXL on NULL backend
      run: MAX_BUFFER_SIZE=0 PYTHONPATH="." NULL=1 DEBUG=1 python3 examples/sdxl.py --seed 0 --noshow --timing --fakeweights
    # TODO: support fake weights
    #- name: Run LLaMA 7B on 4 fake devices
    #  run: NULL=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 3 --temperature 0 --timing
    - name: Run GC tests
      run: PYTHONPATH="." python test/external/external_uop_gc.py
    - name: Run process replay tests
      uses: ./.github/actions/process-replay
    - name: Regen dataset on test_tiny
      run: |
        test/external/process_replay/reset.py
        CAPTURE_PROCESS_REPLAY=1 python test/test_tiny.py TestTiny.test_plus
        PYTHONPATH=. python extra/optimization/extract_dataset.py
        gzip -c /tmp/sops > extra/datasets/sops.gz
        DEBUG=1 MIN_ASTS=1 PYTHONPATH=. python extra/optimization/get_action_space.py
    - name: Repo line count < 17500 lines
      run: MAX_LINE_COUNT=17500 python sz.py

  fuzzing:
    name: Fuzzing
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: fuzzing-unit
        deps: testing_unit
    - name: Fuzz Test symbolic
      run: python test/external/fuzz_symbolic.py
    - name: Fuzz Test fast idiv
      run: python test/external/fuzz_fast_idiv.py
    - name: Fuzz Test shapetracker
      run: |
        PYTHONPATH="." python test/external/fuzz_shapetracker.py
        PYTHONPATH="." python test/external/fuzz_shapetracker_math.py
    - name: Fuzz Test shape ops
      run: python test/external/fuzz_shape_ops.py

  testgpuimage:
    name: 'GPU IMAGE Tests'
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    env:
      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: gpu-image
          deps: testing_minimal
          opencl: 'true'
      - name: Run Kernel Count Test
        run: PYTHONPATH="." GPU=1 python -m pytest -n=auto test/external/external_test_opt.py
      - name: Test WINO=1
        run: GPU=1 DEBUG=2 WINO=1 python3 test/test_ops.py TestOps.test_simple_conv2d
      - name: Test GPU IMAGE=2 ops + training
        run: |
          PYTHONPATH="." GPU=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
          PYTHONPATH="." GPU=1 IMAGE=2 python3 test/models/test_end2end.py TestEnd2End.test_linear_mnist
      - name: Run fused optimizer tests
        run: PYTHONPATH="." GPU=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testgendataset:
    name: 'GPU Generate Kernel Dataset'
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    env:
      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: gen-dataset
          deps: testing_minimal
          opencl: 'true'
      - name: Generate Dataset
        run: PYTHONPATH="." extra/optimization/generate_dataset.sh
      - name: Upload artifact
        uses: actions/upload-artifact@v4
        with:
          name: sops.gz
          path: /tmp/sops.gz

  testopenpilot:
    name: 'openpilot Compile Tests'
    runs-on: ubuntu-22.04
    timeout-minutes: 15
    env:
      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: openpilot-compile
          deps: testing
          opencl: 'true'
          llvm: 'true'
      - name: Test openpilot model kernel count and gate usage
        run: |
          PYTHONPATH="." ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2134 ALLOWED_GATED_READ_IMAGE=13 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
      - name: Test openpilot alt model correctness (float32)
        run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
      - name: Test openpilot fastvits model correctness (float32)
        run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
      # - name: Test openpilot simple_plan vision model correctness (float32)
      #   run: PYTHONPATH="." FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
      - name: Test openpilot LLVM compile
        run: PYTHONPATH="." LLVM=1 LLVMOPT=1 JIT=2 BEAM=0 IMAGE=0 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
      - name: Test openpilot compile4
        run: PYTHONPATH="." NOLOCALS=1 GPU=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testonnxcpu:
    name: 'ONNX (CPU) Tests'
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
      IGNORE_OOB: 0

    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: onnxoptc
          deps: testing
          python-version: '3.11'
          llvm: 'true'
      - name: Test ONNX (CPU)
        run: CPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      - name: Test ONNX (LLVM)
        run: LLVM=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      - name: Test ONNX Runner (CPU)
        run: CPU=1 PYTHONPATH=. python3 test/external/external_test_onnx_runner.py
      - name: Test Additional ONNX Ops (CPU)
        run: CPU=1 PYTHONPATH=. python3 test/external/external_test_onnx_ops.py
      - name: Test Quantize ONNX
        run: CPU=1 PYTHONPATH=. python3 test/test_quantize_onnx.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testopencl:
    name: 'ONNX (GPU)+Optimization Tests'
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
      IGNORE_OOB: 0

    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: onnxoptl
          deps: testing
          pydeps: "tensorflow==2.15.1 tensorflow_addons"
          python-version: '3.11'
          opencl: 'true'
      - name: Test ONNX (GPU)
        run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
      - name: Test Optimization Helpers
        run: PYTHONPATH="." DEBUG=1 python3 extra/optimization/test_helpers.py
      #- name: Test Action Space
      #  run: PYTHONPATH="." DEBUG=1 GPU=1 python3 extra/optimization/get_action_space.py
      - name: Test Beam Search
        run: PYTHONPATH="." GPU=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
      - name: Test MLPerf stuff
        run: GPU=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
      - name: Test llama 3 training
        run: MAX_BUFFER_SIZE=0 PYTHONPATH="." DEV=NULL SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=8 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testllm:
    name: Test LLM
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: apps_llm
      - name: Test 1B LLM
        run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.apps.llm | grep -i rooster

  testmodels:
    name: Models (llvm+cpu+gpu)
    runs-on: ubuntu-22.04
    timeout-minutes: 15
    env:
      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: models
          deps: testing
          opencl: 'true'
          llvm: 'true'
      - name: Test models (llvm)
        run: LLVM=1 python -m pytest -n=auto test/models --durations=20
      - name: Test models (gpu)
        run: GPU=1 python -m pytest -n=auto test/models --durations=20
      - name: Test models (cpu)
        run: CPU=1 python -m pytest -n=auto test/models --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testrangeify:
    name: Linux (rangeify)
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: rangeify-minimal-llvm
        deps: testing_minimal
        llvm: "true"
    - name: Test CPU=1 RANGEIFY=1
      # TODO: add more passing tests here
      # test_symbolic_arange_sym_step is passing now
      # test_threefry_doesnt_use_long is because there's a contig after the long now
      run: |
        CPU=1 RANGEIFY=1 python3 -m pytest -n auto --durations 20 \
          -k "not test_symbolic_arange_sym_step and not test_threefry_doesnt_use_long" \
          test/test_tiny.py test/test_rangeify.py test/test_ops.py test/test_tensor_variable.py \
          test/test_outerworld_range.py test/test_sample.py test/test_randomness.py test/test_tensor_data.py
    - name: Test CPU=1 RANGEIFY=2
      run: CPU=1 RANGEIFY=2 python3 -m pytest -n auto test/test_tiny.py test/test_rangeify.py test/test_ops.py --durations 20
    - name: Test LLVM=1 RANGEIFY=1 (slow tests)
      run: LLVM=1 RANGEIFY=1 python3 -m pytest -n auto test/models/test_mnist.py --durations 20

  testdevectorize:
    name: Linux (devectorize)
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    env:
      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: devectorize-minimal
        deps: testing_minimal
        pydeps: "pillow"
        llvm: "true"
    - name: Test LLVM=1 DEVECTORIZE=0
      run: LLVM=1 DEVECTORIZE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"
    - name: Test LLVM=1 DEVECTORIZE=0 for model
      run: PYTHONPATH="." LLVM=1 DEVECTORIZE=0 python3 test/models/test_efficientnet.py
    - name: Test CPU=1 DEVECTORIZE=0
      run: CPU=1 DEVECTORIZE=0 FUSE_ARANGE=0 python3 -m pytest -n auto test/test_tiny.py test/test_ops.py -k "not test_avg_pool3d_failure"

  testdsp:
    name: Linux (DSP)
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    env:
      IGNORE_OOB: 0
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: dsp-minimal
        deps: testing_minimal
        pydeps: "onnx==1.18.0 onnxruntime pillow"
        llvm: "true"
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Build QEMU Docker with cache
      uses: docker/build-push-action@v4
      with:
        file: extra/dsp/Dockerfile
        push: false
        load: true
        tags: qemu-hexagon:latest
        cache-from: type=gha
        cache-to: type=gha,mode=min
    - name: Set MOCKDSP env
      run: printf "MOCKDSP=1" >> $GITHUB_ENV
    - name: Run test_tiny on DSP
      run: DEBUG=2 DSP=1 python test/test_tiny.py
    - name: Test transcendentals
      run: CC=clang-20 PYTHONPATH="." DEBUG=2 DSP=1 python test/test_transcendental.py TestTranscendentalVectorized
    - name: Test quantize onnx
      run: PYTHONPATH="." DEBUG=2 DSP=1 python3 test/test_quantize_onnx.py

  testwebgpu:
    name: Linux (WebGPU)
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: webgpu-minimal
        deps: testing_minimal
        python-version: '3.11'
        webgpu: 'true'
    - name: Check Device.DEFAULT (WEBGPU) and print some source
      run: |
        WEBGPU=1 python -c "from tinygrad import Device; assert Device.DEFAULT == 'WEBGPU', Device.DEFAULT"
        WEBGPU=1 DEBUG=4 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
    - name: Run selected webgpu tests
      run: |
          WEBGPU=1 WEBGPU_BACKEND="WGPUBackendType_Vulkan" python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit \
          --ignore=test/test_copy_speed.py --ignore=test/test_rearrange_einops.py \
          --ignore=test/test_fuzz_shape_ops.py --durations=20
    - name: Run process replay tests
      uses: ./.github/actions/process-replay

  testamd:
    strategy:
      fail-fast: false
      matrix:
        backend: [amd, amdllvm]

    name: Linux (${{ matrix.backend }})
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
      IGNORE_OOB: 0
      AMD: 1
      MOCKGPU: 1
      FORWARD_ONLY: 1
      AMD_LLVM: ${{ matrix.backend == 'amdllvm' && '1' || matrix.backend != 'amdllvm' && '0' }}
      PYTHONPATH: ${{ github.workspace }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: ${{ matrix.backend }}-minimal
          deps: testing_minimal
          amd: 'true'
          llvm: ${{ matrix.backend == 'amdllvm' && 'true' }}
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['AMD'], Device.DEFAULT"
          DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
      - name: Run LLVM test
        if: matrix.backend=='amdllvm'
        run: python test/device/test_amd_llvm.py
      - name: Run pytest (amd)
        run: python -m pytest -n=auto test/test_ops.py test/test_dtype.py test/test_dtype_alu.py test/test_linearizer.py test/test_randomness.py test/test_jit.py test/test_graph.py test/test_multitensor.py test/device/test_hcq.py --durations=20
      - name: Run pytest (amd)
        run: python -m pytest test/external/external_test_am.py --durations=20
      - name: Run TRANSCENDENTAL math
        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
      - name: Run TestOps.test_add with SQTT
        run: |
          PROFILE=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add
          extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  testnvidia:
    strategy:
      fail-fast: false
      matrix:
        backend: [ptx, nv]

    name: Linux (${{ matrix.backend }})
    runs-on: ubuntu-22.04
    timeout-minutes: 20

    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: ${{ matrix.backend }}-minimal
          deps: testing_minimal
          cuda: 'true'
          ocelot: 'true'
      - name: Set env
        run: printf "${{ matrix.backend == 'PTX' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nPTX=1\nMOCKGPU=1' || matrix.backend == 'nv' && 'NV=1\nMOCKGPU=1\nFORWARD_ONLY=1' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
          DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
      - name: Run pytest (cuda)
        # skip multitensor because it's slow
        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --ignore test/test_gc.py --ignore test/test_multitensor.py --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

  tests:
    strategy:
      fail-fast: false
      matrix:
        backend: [llvm, cpu, gpu]

    name: Linux (${{ matrix.backend }})
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
      IGNORE_OOB: 0

    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: ${{ matrix.backend }}-minimal
          deps: testing_minimal
          opencl: ${{ matrix.backend == 'gpu' && 'true' }}
          llvm: ${{ matrix.backend == 'llvm' && 'true' }}
      - name: Set env
        run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'gpu' && 'GPU=1' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          PYTHONPATH=${{ github.workspace }} python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['LLVM','CPU','GPU'], Device.DEFAULT"
          DEBUG=5 PYTHONPATH=${{ github.workspace }} FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
      - name: Run pytest (not cuda)
        run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
      - name: Run TRANSCENDENTAL math
        run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay

# ****** OSX Tests ******

  testmetal2:
    name: MacOS (unit)
    runs-on: macos-14
    timeout-minutes: 20
    env:
      IGNORE_OOB: 0

    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: metal2
        deps: testing
        python-version: '3.11'
        amd: 'true'
        cuda: 'true'
        ocelot: 'true'
        llvm: 'true'
    - name: Run real world test
      run: METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20
    - name: Test models (Metal)
      run: METAL=1 python -m pytest -n=auto test/models -v --durations=20
    - name: Run ONNX
      run: METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
    - name: Test tensor core ops (fake)
      run: TC=2 METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_gemm
    - name: Test tensor core ops (real)
      run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm
    - name: Test LLaMA compile speed
      run: PYTHONPATH="." METAL=1 python test/external/external_test_speed_llama.py
    - name: Test Beam Search
      run: PYTHONPATH="." METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
    #- name: Fuzz Test linearizer
    #  run: PYTHONPATH="." METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py
    - name: Run TRANSCENDENTAL math
      run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
    - name: Run pytest (amd)
      env:
        MOCKGPU: 1
        AMD: 1
        FORWARD_ONLY: 1
      run: |
        python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
    - name: Run pytest (amd with llvm backend)
      env:
        MOCKGPU: 1
        AMD: 1
        FORWARD_ONLY: 1
      run: |
        python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
    - name: Run pytest (ptx)
      env:
        MOCKGPU: 1
        PTX: 1
        NV: 1
        FORWARD_ONLY: 1
      run: |
        python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
    - name: Run process replay tests
      uses: ./.github/actions/process-replay

  osxwebgpu:
    name: MacOS (WebGPU)
    runs-on: macos-14
    timeout-minutes: 10
    steps:
    - name: Checkout Code
      uses: actions/checkout@v4
    - name: Setup Environment
      uses: ./.github/actions/setup-tinygrad
      with:
        key: osx-webgpu
        deps: testing
        webgpu: 'true'
    - name: Test infinity math in WGSL
      run: WEBGPU=1 python -m pytest -n=auto test/test_renderer_failures.py::TestWGSLFailures::test_multiply_infinity --durations=20
    - name: Build WEBGPU Efficientnet
      run: WEBGPU=1 WEBGPU_BACKEND="WGPUBackendType_Metal" python3 -m examples.compile_efficientnet
    - name: Clean npm cache
      run: npm cache clean --force
    - name: Install Puppeteer
      run: npm install puppeteer
    # this is also flaky
    #- name: Run WEBGPU Efficientnet
    #  run: node test/web/test_webgpu.js
    # this is flaky
    #- name: Run VIZ tests as external package
    #  run: |
    #    mkdir $GITHUB_WORKSPACE/test_dir
    #    cd $GITHUB_WORKSPACE/test_dir
    #    python -m venv venv
    #    source venv/bin/activate
    #    pip install $GITHUB_WORKSPACE
    #    cp $GITHUB_WORKSPACE/test/web/test_viz.js .
    #    node test_viz.js
    - name: Test ONNX Runner (WEBGPU)
      run: WEBGPU=1 PYTHONPATH=. python3 test/external/external_test_onnx_runner.py

  osxremote:
   name: MacOS (remote metal)
   runs-on: macos-15
   timeout-minutes: 10
   env:
     REMOTE: 1
     REMOTEDEV: METAL
   steps:
     - name: Checkout Code
       uses: actions/checkout@v4
     - name: Setup Environment
       uses: ./.github/actions/setup-tinygrad
       with:
         key: macos-remote
         deps: testing_minimal
     - name: Check Device.DEFAULT and print some source
       run: |
         python -c "from tinygrad import Device; assert Device.DEFAULT == 'REMOTE', Device.DEFAULT"
         python -c "from tinygrad import Device; assert Device.default.properties.real_device == 'METAL', Device.default.properties.real_device"
         DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
     - name: Run REMOTE=1 Test
       run: |
         python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_tensor_variable.py

  amdremote:
    name: Linux (remote)
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
      REMOTE: 1
      PYTHONPATH: ${{ github.workspace }}
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: linux-remote
          deps: testing_minimal
          amd: 'true'
          llvm: 'true'
          opencl: 'true'
      - name: Start remote server
        run: |
          start_server() {
            systemd-run --user \
              --unit="$1" \
              --setenv=REMOTEDEV="$2" \
              --setenv=MOCKGPU=1 \
              --setenv=PYTHONPATH=. \
              --setenv=PORT="$3" \
              --working-directory="$(pwd)" \
              python tinygrad/runtime/ops_remote.py
          }

          start_server "remote-server-amd-1" "AMD" 6667
          start_server "remote-server-amd-2" "AMD" 6668
          start_server "remote-server-gpu" "GPU" 7667
          start_server "remote-server-cpu" "CPU" 8667
      - name: Check Device.DEFAULT and print some source
        env:
          HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == 'REMOTE', Device.DEFAULT"
          python -c "from tinygrad import Device; assert Device.default.properties.real_device == 'AMD', Device.default.properties.real_device"
          DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
      - name: Run REMOTE=1 Test (AMD)
        env:
          HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
        run: |
          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
      - name: Run REMOTE=1 Test (GPU)
        env:
          HOST: 127.0.0.1:7667*6
        run: |
          python3 -m pytest test/test_tiny.py test/test_image_dtype.py test/test_jit.py --durations 20
          IMAGE=2 python3 -m pytest test/test_tiny.py test/test_image_dtype.py
      - name: Run REMOTE=1 Test (CPU)
        env:
          HOST: 127.0.0.1:8667*6
        run: |
          python3 -m pytest test/test_tiny.py test/test_jit.py test/test_multitensor.py --durations 20
      - name: Show remote server logs
        if: always()
        run: |
          journalctl --user -u remote-server-amd-1 --no-pager
          journalctl --user -u remote-server-amd-2 --no-pager
          journalctl --user -u remote-server-gpu --no-pager
          journalctl --user -u remote-server-cpu --no-pager

  osxtests:
    strategy:
      fail-fast: false
      matrix:
        backend: [metal, llvm, cpu]
    name: MacOS (${{ matrix.backend }})
    runs-on: macos-15
    timeout-minutes: 20
    env:
      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: macos-${{ matrix.backend }}-minimal
          deps: testing_minimal
          pydeps: "capstone"
          llvm: ${{ matrix.backend == 'llvm' && 'true' }}
      - name: Set env
        run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'metal' && 'METAL=1'}}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == '${{ matrix.backend }}'.upper(), Device.DEFAULT"
          DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
      - name: Run pytest (${{ matrix.backend }})
        run: python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
      - name: Run process replay tests
        uses: ./.github/actions/process-replay
      - name: Run macOS-specific unit test
        if: matrix.backend == 'cpu'
        run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated

# ****** Windows Tests ******

  wintests:
    strategy:
      fail-fast: false
      matrix:
        backend: [llvm, cpu, webgpu]

    name: Windows (${{ matrix.backend }})
    runs-on: windows-latest
    timeout-minutes: 15
    env:
      IGNORE_OOB: 0
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Setup Environment
        uses: ./.github/actions/setup-tinygrad
        with:
          key: windows-${{ matrix.backend }}-minimal
          deps: testing_unit
          pydeps: ${{ matrix.backend == 'webgpu' && 'dawn-python' || '' }}
      - name: Set env
        shell: bash
        run:  printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'webgpu' && 'WEBGPU=1'}}" >> $GITHUB_ENV
      - name: Run unit tests
        if: matrix.backend=='llvm'
        run: python -m pytest -n=auto test/unit/ --ignore=test/unit/test_disk_tensor.py --ignore=test/unit/test_elf.py --ignore=test/unit/test_tar.py
      - name: Run pytest (${{ matrix.backend }})
        shell: bash
        run: |
          python -c "from tinygrad import Device; assert Device.DEFAULT == '${{ matrix.backend }}'.upper(), Device.DEFAULT"
          python -m pytest -n=auto test/test_tiny.py test/test_ops.py --durations=20