OK - I think what might be going on is that, with a “surprise hotplug”, we never get a chance to clean up the kfd software nodes.
See if this helps:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d4c8b03b6bf5..f40a83be2cac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5263,6 +5263,9 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
if (drm_dev_is_unplugged(adev_to_drm(adev)))
amdgpu_device_unmap_mmio(adev);
+ /* surprise hotplug */
+ if (pci_dev_is_disconnected(adev->pdev))
+ amdgpu_amdkfd_device_fini_sw(adev);
}
void amdgpu_device_fini_sw(struct amdgpu_device *adev)