openpilot/tinygrad_repo/test/speed/external_test_copy_speed.py
Vehicle Researcher c5d5c5d1f3 openpilot v0.10.1 release
date: 2025-10-24T00:30:59
master commit: 405631baf9685e171a0dd19547cb763f1b163d18
2025-10-24 00:31:03 -07:00

95 lines
3.6 KiB
Python

import unittest, numpy as np
from tinygrad import Tensor, Device, TinyJit
from tinygrad.helpers import Timing, CI, OSX, getenv
import multiprocessing.shared_memory as shared_memory
# Side length of the square tensors used by the benchmarks in this file (each one is N x N).
# Obtained from tinygrad's getenv helper with key "NSZ" and fallback 256 -- presumably this
# reads the NSZ environment variable; confirm against tinygrad.helpers.
N = getenv("NSZ", 256)
class TestCopySpeed(unittest.TestCase):
  """Timing benchmarks for copying N x N tensors between devices.

  Every test prints its measurements through `Timing`. Only the two JIT tests
  additionally check that the copied data equals the source data.

  In the nested timers, "queue" covers just the `.realize()` call and "sync"
  additionally covers the device synchronize that follows it. Throughput is
  printed as bytes divided by the value `Timing` hands to `on_exit` (bytes per
  nanosecond equals GB/s, assuming that value is elapsed nanoseconds as the
  parameter name `ns` suggests).
  """

  @classmethod
  def setUpClass(cls):
    """Synchronize the default device once before the benchmarks start."""
    Device[Device.DEFAULT].synchronize()

  def testCopySHMtoDefault(self):
    """Time copies from a shared-memory-backed disk tensor to the default device."""
    # N*N*4 bytes: presumably 4 bytes per element of the default dtype -- TODO confirm.
    s = shared_memory.SharedMemory(name="test_X", create=True, size=N*N*4)
    try:
      # Only the named segment is needed; the tensor below opens it by name/path.
      s.close()
      if CI and not OSX:
        t = Tensor.empty(N, N, device="disk:/dev/shm/test_X").realize()
      else:
        t = Tensor.empty(N, N, device="disk:shm:test_X").realize()
      for _ in range(3):
        with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
          with Timing("queue: "):
            t.to(Device.DEFAULT).realize()
          Device[Device.DEFAULT].synchronize()
    finally:
      # Always remove the named segment: a leftover "test_X" makes the next create=True call fail.
      s.unlink()

  def testCopyCPUtoDefault(self):
    """Time repeated copies of one CPU tensor to the default device."""
    t = Tensor.ones(N, N, device="CPU").contiguous().realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
        Device[Device.DEFAULT].synchronize()

  def testCopyCPUtoDefaultFresh(self):
    """Time CPU -> default device copies, building a new source tensor for every iteration."""
    print("fresh copy")
    for _ in range(3):
      t = Tensor.ones(N, N, device="CPU").contiguous().realize()
      # The lambda runs when the outer timer exits, i.e. before `del t` below.
      with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"): # noqa: F821
        with Timing("queue: "):
          t.to(Device.DEFAULT).realize()
        Device[Device.DEFAULT].synchronize()
      # Drop the source tensor so the next iteration starts from a new one.
      del t

  def testCopyDefaulttoCPU(self):
    """Time repeated copies of one default-device tensor to the CPU."""
    t = Tensor.ones(N, N).contiguous().realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        t.to('CPU').realize()

  def testCopyDefaulttoCPUJit(self):
    """Time a TinyJit-wrapped default device -> CPU copy and verify the copied values."""
    # skipTest raises SkipTest so the runner records a skip; the previous
    # `return unittest.skip(...)` only built an unused decorator and the test counted as passed.
    if Device.DEFAULT == "CPU": self.skipTest("CPU to CPU copy is a no-op")

    @TinyJit
    def _do_copy(t): return t.to('CPU').realize()

    t = Tensor.randn(N, N).contiguous().realize()
    Device[Device.DEFAULT].synchronize()
    for _ in range(5):
      with Timing(f"copy {Device.DEFAULT} -> CPU {t.nbytes()/(1024**2)}M: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        x = _do_copy(t)
        Device[Device.DEFAULT].synchronize()
      np.testing.assert_equal(t.numpy(), x.numpy())

  def testCopyCPUtoDefaultJit(self):
    """Time a TinyJit-wrapped CPU -> default device copy and verify the copied values."""
    # See testCopyDefaulttoCPUJit: skipTest is what actually marks the test as skipped.
    if Device.DEFAULT == "CPU": self.skipTest("CPU to CPU copy is a no-op")

    @TinyJit
    def _do_copy(x): return x.to(Device.DEFAULT).realize()

    for _ in range(5):
      # A new random source every iteration; it is realized and the CPU device
      # synchronized before the timer starts.
      t = Tensor.randn(N, N, device="CPU").contiguous().realize()
      Device["CPU"].synchronize()
      with Timing(f"copy CPU -> {Device.DEFAULT} {t.nbytes()/(1024**2)}M: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s"):
        x = _do_copy(t)
        Device[Device.DEFAULT].synchronize()
      np.testing.assert_equal(t.numpy(), x.numpy())

  @unittest.skipIf(CI, "CI doesn't have 6 GPUs")
  @unittest.skipIf(Device.DEFAULT != "CL", "only test this on CL")
  def testCopyCPUto6GPUs(self):
    """Time copying one CPU tensor to each of six CL devices.

    Skipped unless CLDevice.device_ids has exactly six entries.
    """
    from tinygrad.runtime.ops_cl import CLDevice
    if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
    t = Tensor.ones(N, N, device="CPU").contiguous().realize()
    print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
    for _ in range(3):
      with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"):
        with Timing("queue: "):
          for g in range(6):
            t.to(f"CL:{g}").realize()
        # Wait on every target device, not only "CL", so the outer timer covers all six copies.
        # NOTE(review): harmless if a single synchronize already covers all devices -- confirm.
        for g in range(6):
          Device[f"CL:{g}"].synchronize()
# Script entry point: when executed directly, hand control to unittest's runner.
if __name__ == "__main__":
  unittest.main()