aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
authorEllen Pan <[email protected]>2025-04-29 21:18:44 +0000
committerAlex Deucher <[email protected]>2025-05-07 21:43:13 +0000
commit086809c82c96116aed78f762e4f1d61a4e0210a7 (patch)
tree3cf9ef1bb98475b4432599191ffab84a23f494da /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parentdrm/amdgpu: Add unrecoverable error message definitions for VFs (diff)
downloadkernel-086809c82c96116aed78f762e4f1d61a4e0210a7.tar.gz
kernel-086809c82c96116aed78f762e4f1d61a4e0210a7.zip
drm/amdgpu: Implement unrecoverable error message handling for VFs
This notification may arrive in VF mailbox while polling for response from another event. This patches covers the following scenarios: - If VF is already in RMA state, then do not attempt to contact the host. Host will ignore the VF after sending the notification. - If the notification is detected during polling, then set the RMA status, and return error to caller. - If the notification arrives by interrupt, then set the RMA status and queue a reset. This reset will fail and VF will stop runtime services. Reviewed-by: Shravan Kumar Gande <[email protected]> Signed-off-by: Victor Skvortsov <[email protected]> Signed-off-by: Ellen Pan <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c5
1 files changed, 5 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ccc6217761bf..a2997ededd0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6150,6 +6150,11 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
/* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
+
+ /* Bail out of reset early */
+ if (amdgpu_ras_is_rma(adev))
+ return -ENODEV;
+
if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
amdgpu_ras_set_fed(adev, true);