Hello everyone! Did anybody try to install AMD ROCm on the FW16 and train/run local small models? I’m planning to use Ubuntu and run ROCm+PyTorch to get started with my Uni projects. Any advice?
Hey there
I didn't actually try it, because I don't work with ML at all. But if you don't have a FW16 yet, I could run some tests for you.
Here is a useful thread:
I am using it on EndeavourOS (Arch). I had to install rocm-hip-sdk and set the flag: HSA_OVERRIDE_GFX_VERSION=11.0.0
(more on this requirement here: RuntimeError: HIP error: invalid device function - if there is a solution already existed against this issue. · Issue #2536 · ROCm/ROCm · GitHub)
Keep in mind that the FW16 dGPU, the Radeon™ RX 7700S, is not officially supported:
Also, here are some useful scripts to test your ROCm/PyTorch installation. I used Python 3.11:
Check Rocm:
import torch, grp, pwd, os, subprocess

# ROCm sanity-check script.
# 1. Parses `rocminfo` output to list the HSA agents (GPUs/CPUs).
# 2. Smoke-tests PyTorch tensor creation.
# 3. Verifies the user is in the `render` and `video` groups (needed for
#    HIP device access on most distros).
# 4. Checks that PyTorch's ROCm backend (exposed under the "cuda" API) sees a GPU.
devices = []
try:
    print("\n\nChecking ROCM support...")
    result = subprocess.run(["rocminfo"], stdout=subprocess.PIPE)
    cmd_str = result.stdout.decode("utf-8")
    cmd_split = cmd_str.split("Agent ")
    for part in cmd_split:
        # Agent ids can be one or two digits wide.
        item_single = part[0:1]
        item_double = part[0:2]
        if item_single.isnumeric() or item_double.isnumeric():
            new_split = cmd_str.split("Agent " + item_double)
            # Extract the agent name between "Name:" and "Marketing Name:".
            device = (
                new_split[1]
                .split("Marketing Name:")[0]
                .replace(" Name: ", "")
                .replace("\n", "")
                .replace(" ", "")
                .split("Uuid:")[0]
                .split("*******")[1]
            )
            devices.append(device)
    if devices:
        print("GOOD: ROCM devices found: ", len(devices))
    else:
        print("BAD: No ROCM devices found.")

    print("Checking PyTorch...")
    x = torch.rand(5, 3)
    # Fix: the original row-by-row loop only kept the result of the LAST
    # row check (`else` overwrote earlier True values); compare the full
    # shape instead.
    has_torch = tuple(x.shape) == (5, 3)
    if has_torch:
        print("GOOD: PyTorch is working fine.")
    else:
        print("BAD: PyTorch is NOT working.")

    print("Checking user groups...")
    # NOTE(review): os.getlogin() raises OSError without a controlling
    # terminal (cron/containers); the surrounding except handles that.
    user = os.getlogin()
    groups = [g.gr_name for g in grp.getgrall() if user in g.gr_mem]
    gid = pwd.getpwnam(user).pw_gid
    groups.append(grp.getgrgid(gid).gr_name)
    if "render" in groups and "video" in groups:
        print("GOOD: The user", user, "is in RENDER and VIDEO groups.")
    else:
        print(
            "BAD: The user",
            user,
            "is NOT in RENDER and VIDEO groups. This is necessary in order to PyTorch use HIP resources",
        )

    if torch.cuda.is_available():
        print("GOOD: PyTorch ROCM support found.")
        t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda")
        print("Testing PyTorch ROCM support...")
        # Fix: check values and device directly instead of comparing
        # str(t) against a hard-coded repr (fragile across versions).
        if t.tolist() == [5, 5, 5] and t.device.type == "cuda":
            print("Everything fine! You can run PyTorch code inside of: ")
            for idx, device in enumerate(devices):
                print(f"---> {device}")
    else:
        print("BAD: PyTorch ROCM support NOT found.")
except Exception as err:
    # Fix: was a bare `except:` that also swallowed SystemExit and
    # KeyboardInterrupt and hid the actual failure.
    print(
        "Cannot find rocminfo command information. Unable to determine if AMDGPU drivers with ROCM support were installed."
    )
    print(f"(underlying error: {err})")
Check Cuda GPU:
import random
from time import time
from os import putenv
import torch
class SmallModel(torch.nn.Module):
    """A deep MLP used to stress-test the GPU.

    Generalized from the original hard-coded stack: the defaults
    (``hidden_size=10000``, ``num_hidden=9``, ``out_f=100``) reproduce
    the original architecture exactly, so existing callers
    (``SmallModel(100)``) are unaffected.

    Args:
        in_f: number of input features.
        hidden_size: width of each hidden layer.
        num_hidden: number of Linear+ReLU hidden layers (must be >= 1).
        out_f: number of output features.
    """

    def __init__(self, in_f, hidden_size: int = 10000,
                 num_hidden: int = 9, out_f: int = 100) -> None:
        super().__init__()
        # Build in_f -> hidden -> ... -> hidden -> out_f with ReLU
        # after every hidden layer (none after the output layer).
        layers = [torch.nn.Linear(in_f, hidden_size), torch.nn.ReLU()]
        for _ in range(num_hidden - 1):
            layers.append(torch.nn.Linear(hidden_size, hidden_size))
            layers.append(torch.nn.ReLU())
        layers.append(torch.nn.Linear(hidden_size, out_f))
        # Attribute name kept as `cnn` for backward compatibility,
        # although the stack is a plain MLP.
        self.cnn = torch.nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the input through the linear stack."""
        return self.cnn(x)
# putenv("HSA_OVERRIDE_GFX_VERSION", "11.0.0")
# Benchmark: 100 forward passes through SmallModel, timed in ms.
# Fix: fall back to CPU instead of crashing when no ROCm/CUDA device
# is visible (original hard-coded "cuda:0").
device = "cuda:0" if torch.cuda.is_available() else "cpu"
a = torch.randn((100, 100))
a = a.to(device)
m = SmallModel(100)
m = m.to(device)
start_time = time()
# Inference only: no_grad skips autograd bookkeeping.
with torch.no_grad():
    for k in range(100):
        b = m(a)
# Fix: GPU kernel launches are asynchronous — wait for them to finish
# before reading the clock, or the measured time is meaningless.
if device.startswith("cuda"):
    torch.cuda.synchronize()
print(f"Total time: {(time() - start_time)*1000} ms with device {device}")
1 Like