aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
diff options
context:
space:
mode:
authorDave Airlie <[email protected]>2025-05-11 21:14:27 +0000
committerDave Airlie <[email protected]>2025-05-11 21:14:34 +0000
commit1faeeb315fdbd005bbc1bc74214e39087971dda9 (patch)
treed390ef6948a417cd63a252e70cef96bbb0bd5434 /drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
parentMerge tag 'drm-misc-next-2025-05-08' of https://gitlab.freedesktop.org/drm/mi... (diff)
parentReapply: drm/amdgpu: Use generic hdp flush function (diff)
downloadkernel-1faeeb315fdbd005bbc1bc74214e39087971dda9.tar.gz
kernel-1faeeb315fdbd005bbc1bc74214e39087971dda9.zip
Merge tag 'amd-drm-next-6.16-2025-05-09' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.16-2025-05-09: amdgpu: - IPS fixes - DSC cleanup - DC Scaling updates - DC FP fixes - Fused I2C-over-AUX updates - SubVP fixes - Freesync fix - DMUB AUX fixes - VCN fix - Hibernation fixes - HDP fixes - DCN 2.1 fixes - DPIA fixes - DMUB updates - Use drm_file_err in amdgpu - Enforce isolation updates - Use new dma_fence helpers - USERQ fixes - Documentation updates - Misc code cleanups - SR-IOV updates - RAS updates - PSP 12 cleanups amdkfd: - Update error messages for SDMA - Userptr updates drm: - Add drm_file_err function dma-buf: - Add a helper to sort and deduplicate dma_fence arrays From: Alex Deucher <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Dave Airlie <[email protected]>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c63
1 files changed, 46 insertions, 17 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index f5411b798e11..48101a34e049 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -274,6 +274,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
{
struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+ struct amdgpu_reset_context reset_context = { 0 };
amdgpu_virt_fini_data_exchange(adev);
@@ -281,8 +282,6 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
if (amdgpu_device_should_recover_gpu(adev)
&& (!amdgpu_device_has_job_running(adev) ||
adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT)) {
- struct amdgpu_reset_context reset_context;
- memset(&reset_context, 0, sizeof(reset_context));
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
@@ -293,6 +292,19 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
}
}
+static void xgpu_ai_mailbox_bad_pages_work(struct work_struct *work)
+{
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, bad_pages_work);
+ struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+
+ if (down_read_trylock(&adev->reset_domain->sem)) {
+ amdgpu_virt_fini_data_exchange(adev);
+ amdgpu_virt_request_bad_pages(adev);
+ amdgpu_virt_init_data_exchange(adev);
+ up_read(&adev->reset_domain->sem);
+ }
+}
+
static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *src,
unsigned type,
@@ -312,26 +324,42 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
enum idh_event event = xgpu_ai_mailbox_peek_msg(adev);
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
switch (event) {
- case IDH_FLR_NOTIFICATION:
+ case IDH_RAS_BAD_PAGES_NOTIFICATION:
+ xgpu_ai_mailbox_send_ack(adev);
+ if (amdgpu_sriov_runtime(adev))
+ schedule_work(&adev->virt.bad_pages_work);
+ break;
+ case IDH_UNRECOV_ERR_NOTIFICATION:
+ xgpu_ai_mailbox_send_ack(adev);
+ ras->is_rma = true;
+ dev_err(adev->dev, "VF is in an unrecoverable state. Runtime Services are halted.\n");
if (amdgpu_sriov_runtime(adev))
WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
- &adev->virt.flr_work),
- "Failed to queue work! at %s",
- __func__);
+ &adev->virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
break;
- case IDH_QUERY_ALIVE:
- xgpu_ai_mailbox_send_ack(adev);
- break;
- /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
- * it byfar since that polling thread will handle it,
- * other msg like flr complete is not handled here.
- */
- case IDH_CLR_MSG_BUF:
- case IDH_FLR_NOTIFICATION_CMPL:
- case IDH_READY_TO_ACCESS_GPU:
- default:
+ case IDH_FLR_NOTIFICATION:
+ if (amdgpu_sriov_runtime(adev))
+ WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->virt.flr_work),
+ "Failed to queue work! at %s",
+ __func__);
+ break;
+ case IDH_QUERY_ALIVE:
+ xgpu_ai_mailbox_send_ack(adev);
+ break;
+ /* READY_TO_ACCESS_GPU is fetched by kernel polling, IRQ can ignore
+ * it byfar since that polling thread will handle it,
+ * other msg like flr complete is not handled here.
+ */
+ case IDH_CLR_MSG_BUF:
+ case IDH_FLR_NOTIFICATION_CMPL:
+ case IDH_READY_TO_ACCESS_GPU:
+ default:
break;
}
@@ -387,6 +415,7 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
}
INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
+ INIT_WORK(&adev->virt.bad_pages_work, xgpu_ai_mailbox_bad_pages_work);
return 0;
}