From 11bb33766f66d0cfe997601d6d974d0158a0e7b8 Mon Sep 17 00:00:00 2001 From: Ce Sun Date: Thu, 20 Mar 2025 18:12:40 +0800 Subject: drm/amdgpu: refactor amdgpu_device_gpu_recover Split amdgpu_device_gpu_recover into the following stages: halt activities,asic reset,schedule resume and amdgpu resume. The reason is that the subsequent addition of dpc recover code will have a high similarity with gpu reset Signed-off-by: Ce Sun Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 234 ++++++++++++++++++----------- 1 file changed, 144 insertions(+), 90 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6ebf6179064b..1866d07db133 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5930,94 +5930,40 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle) return ret; } -/** - * amdgpu_device_gpu_recover - reset the asic and recover scheduler - * - * @adev: amdgpu_device pointer - * @job: which job trigger hang - * @reset_context: amdgpu reset context pointer - * - * Attempt to reset the GPU if it has hung (all asics). - * Attempt to do soft-reset or full-reset and reinitialize Asic - * Returns 0 for success or an error on failure. - */ - -int amdgpu_device_gpu_recover(struct amdgpu_device *adev, +static int amdgpu_device_halt_activities(struct amdgpu_device *adev, struct amdgpu_job *job, - struct amdgpu_reset_context *reset_context) + struct amdgpu_reset_context *reset_context, + struct list_head *device_list, + struct amdgpu_hive_info *hive, + bool need_emergency_restart) { - struct list_head device_list, *device_list_handle = NULL; - bool job_signaled = false; - struct amdgpu_hive_info *hive = NULL; + struct list_head *device_list_handle = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; - bool need_emergency_restart = false; - bool audio_suspended = false; - int retry_limit = AMDGPU_MAX_RETRY_LIMIT; - /* - * If it reaches here because of hang/timeout and a RAS error is - * detected at the same time, let RAS recovery take care of it. - */ - if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && - !amdgpu_sriov_vf(adev) && - reset_context->src != AMDGPU_RESET_SRC_RAS) { - dev_dbg(adev->dev, - "Gpu recovery from source: %d yielding to RAS error recovery handling", - reset_context->src); - return 0; - } - /* - * Special case: RAS triggered and full reset isn't supported - */ - need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); - - /* - * Flush RAM to disk so that after reboot - * the user can read log and see why the system rebooted. - */ - if (need_emergency_restart && amdgpu_ras_get_context(adev) && - amdgpu_ras_get_context(adev)->reboot) { - DRM_WARN("Emergency reboot."); - - ksys_sync_helper(); - emergency_restart(); - } - - dev_info(adev->dev, "GPU %s begin!\n", - need_emergency_restart ? "jobs stop":"reset"); - - if (!amdgpu_sriov_vf(adev)) - hive = amdgpu_get_xgmi_hive(adev); - if (hive) - mutex_lock(&hive->hive_lock); - - reset_context->job = job; - reset_context->hive = hive; /* * Build list of devices to reset. * In case we are in XGMI hive mode, resort the device list * to put adev in the 1st position. */ - INIT_LIST_HEAD(&device_list); if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - list_add_tail(&tmp_adev->reset_list, &device_list); + list_add_tail(&tmp_adev->reset_list, device_list); if (adev->shutdown) tmp_adev->shutdown = true; } - if (!list_is_first(&adev->reset_list, &device_list)) - list_rotate_to_front(&adev->reset_list, &device_list); - device_list_handle = &device_list; + if (!list_is_first(&adev->reset_list, device_list)) + list_rotate_to_front(&adev->reset_list, device_list); + device_list_handle = device_list; } else { - list_add_tail(&adev->reset_list, &device_list); - device_list_handle = &device_list; + list_add_tail(&adev->reset_list, device_list); + device_list_handle = device_list; } if (!amdgpu_sriov_vf(adev)) { r = amdgpu_device_health_check(device_list_handle); if (r) - goto end_reset; + return r; } /* We need to lock reset domain only once both for XGMI and single device */ @@ -6041,7 +5987,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * some audio codec errors. */ if (!amdgpu_device_suspend_display_audio(tmp_adev)) - audio_suspended = true; + tmp_adev->pcie_reset_ctx.audio_suspended = true; amdgpu_ras_set_error_query_ready(tmp_adev, false); @@ -6076,23 +6022,19 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, atomic_inc(&tmp_adev->gpu_reset_counter); } - if (need_emergency_restart) - goto skip_sched_resume; + return r; +} - /* - * Must check guilty signal here since after this point all old - * HW fences are force signaled. - * - * job->base holds a reference to parent fence - */ - if (job && dma_fence_is_signaled(&job->hw_fence)) { - job_signaled = true; - dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); - goto skip_hw_reset; - } +static int amdgpu_device_asic_reset(struct amdgpu_device *adev, + struct list_head *device_list, + struct amdgpu_reset_context *reset_context) +{ + struct amdgpu_device *tmp_adev = NULL; + int retry_limit = AMDGPU_MAX_RETRY_LIMIT; + int r = 0; retry: /* Rest of adevs pre asic reset from XGMI hive. */ - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + list_for_each_entry(tmp_adev, device_list, reset_list) { r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { @@ -6119,12 +6061,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(device_list_handle, reset_context); + r = amdgpu_do_asic_reset(device_list, reset_context); if (r && r == -EAGAIN) goto retry; } - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + list_for_each_entry(tmp_adev, device_list, reset_list) { /* * Drop any pending non scheduler resets queued before reset is done. * Any reset scheduled after this point would be valid. Scheduler resets @@ -6134,10 +6076,18 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ amdgpu_device_stop_pending_resets(tmp_adev); } -skip_hw_reset: + return r; +} + +static int amdgpu_device_sched_resume(struct list_head *device_list, + struct amdgpu_reset_context *reset_context, + bool job_signaled) +{ + struct amdgpu_device *tmp_adev = NULL; + int i, r = 0; /* Post ASIC reset for all devs .*/ - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + list_for_each_entry(tmp_adev, device_list, reset_list) { for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -6173,8 +6123,16 @@ skip_hw_reset: } } -skip_sched_resume: - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + return r; +} + +static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, + struct list_head *device_list, + bool need_emergency_restart) +{ + struct amdgpu_device *tmp_adev = NULL; + + list_for_each_entry(tmp_adev, device_list, reset_list) { /* unlock kfd: SRIOV would do it separately */ if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) amdgpu_amdkfd_post_reset(tmp_adev); @@ -6185,18 +6143,114 @@ skip_sched_resume: if (!adev->kfd.init_complete) amdgpu_amdkfd_device_init(adev); - if (audio_suspended) + if (tmp_adev->pcie_reset_ctx.audio_suspended) amdgpu_device_resume_display_audio(tmp_adev); amdgpu_device_unset_mp1_state(tmp_adev); amdgpu_ras_set_error_query_ready(tmp_adev, true); + } - tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, + tmp_adev = list_first_entry(device_list, struct amdgpu_device, reset_list); amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); +} + + +/** + * amdgpu_device_gpu_recover - reset the asic and recover scheduler + * + * @adev: amdgpu_device pointer + * @job: which job trigger hang + * @reset_context: amdgpu reset context pointer + * + * Attempt to reset the GPU if it has hung (all asics). + * Attempt to do soft-reset or full-reset and reinitialize Asic + * Returns 0 for success or an error on failure. + */ + +int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + struct amdgpu_job *job, + struct amdgpu_reset_context *reset_context) +{ + struct list_head device_list; + bool job_signaled = false; + struct amdgpu_hive_info *hive = NULL; + int r = 0; + bool need_emergency_restart = false; + + /* + * If it reaches here because of hang/timeout and a RAS error is + * detected at the same time, let RAS recovery take care of it. + */ + if (amdgpu_ras_is_err_state(adev, AMDGPU_RAS_BLOCK__ANY) && + !amdgpu_sriov_vf(adev) && + reset_context->src != AMDGPU_RESET_SRC_RAS) { + dev_dbg(adev->dev, + "Gpu recovery from source: %d yielding to RAS error recovery handling", + reset_context->src); + return 0; + } + + /* + * Special case: RAS triggered and full reset isn't supported + */ + need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); + + /* + * Flush RAM to disk so that after reboot + * the user can read log and see why the system rebooted. + */ + if (need_emergency_restart && amdgpu_ras_get_context(adev) && + amdgpu_ras_get_context(adev)->reboot) { + DRM_WARN("Emergency reboot."); + + ksys_sync_helper(); + emergency_restart(); + } + + dev_info(adev->dev, "GPU %s begin!\n", + need_emergency_restart ? "jobs stop":"reset"); + + if (!amdgpu_sriov_vf(adev)) + hive = amdgpu_get_xgmi_hive(adev); + if (hive) + mutex_lock(&hive->hive_lock); + + reset_context->job = job; + reset_context->hive = hive; + INIT_LIST_HEAD(&device_list); + + r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, + hive, need_emergency_restart); + if (r) + goto end_reset; + + if (need_emergency_restart) + goto skip_sched_resume; + /* + * Must check guilty signal here since after this point all old + * HW fences are force signaled. + * + * job->base holds a reference to parent fence + */ + if (job && dma_fence_is_signaled(&job->hw_fence)) { + job_signaled = true; + dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); + goto skip_hw_reset; + } + + r = amdgpu_device_asic_reset(adev, &device_list, reset_context); + if (r) + goto end_reset; +skip_hw_reset: + r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); + if (r) + goto end_reset; +skip_sched_resume: + amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); end_reset: if (hive) { mutex_unlock(&hive->hive_lock); -- cgit From 8ba904f54148d2a093650b13a229d2ff18142f7d Mon Sep 17 00:00:00 2001 From: Ce Sun Date: Fri, 21 Mar 2025 10:11:18 +0800 Subject: drm/amdgpu: Multi-GPU DPC recovery support Add support for DPC recover based on refactored code Signed-off-by: Ce Sun Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 5 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 172 +++++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/soc15.c | 5 + 3 files changed, 125 insertions(+), 57 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 2db233a9f972..9a9e5249c63f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -551,6 +551,7 @@ struct amdgpu_allowed_register_entry { * are reset depends on the ASIC. Notably doesn't reset IPs * shared with the CPU on APUs or the memory controllers (so * VRAM is not lost). Not available on all ASICs. + * @AMD_RESET_LINK: Triggers SW-UP link reset on other GPUs * @AMD_RESET_BACO: BACO (Bus Alive, Chip Off) method powers off and on the card * but without powering off the PCI bus. Suitable only for * discrete GPUs. @@ -568,6 +569,7 @@ enum amd_reset_method { AMD_RESET_METHOD_MODE0, AMD_RESET_METHOD_MODE1, AMD_RESET_METHOD_MODE2, + AMD_RESET_METHOD_LINK, AMD_RESET_METHOD_BACO, AMD_RESET_METHOD_PCI, AMD_RESET_METHOD_ON_INIT, @@ -830,6 +832,8 @@ struct amdgpu_mqd { }; struct amdgpu_pcie_reset_ctx { + bool in_link_reset; + bool occurs_dpc; bool audio_suspended; }; @@ -1469,6 +1473,7 @@ void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, const u32 array_size); int amdgpu_device_mode1_reset(struct amdgpu_device *adev); +int amdgpu_device_link_reset(struct amdgpu_device *adev); bool amdgpu_device_supports_atpx(struct drm_device *dev); bool amdgpu_device_supports_px(struct drm_device *dev); bool amdgpu_device_supports_boco(struct drm_device *dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1866d07db133..7525128f971f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3172,6 +3172,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) * always assumed to be lost. */ switch (amdgpu_asic_reset_method(adev)) { + case AMD_RESET_METHOD_LINK: case AMD_RESET_METHOD_BACO: case AMD_RESET_METHOD_MODE1: return true; @@ -5510,6 +5511,29 @@ mode1_reset_failed: return ret; } +int amdgpu_device_link_reset(struct amdgpu_device *adev) +{ + int ret = 0; + + dev_info(adev->dev, "GPU link reset\n"); + + if (!adev->pcie_reset_ctx.occurs_dpc) + ret = amdgpu_dpm_link_reset(adev); + + if (ret) + goto link_reset_failed; + + ret = amdgpu_psp_wait_for_bootloader(adev); + if (ret) + goto link_reset_failed; + + return 0; + +link_reset_failed: + dev_err(adev->dev, "GPU link reset failed\n"); + return ret; +} + int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { @@ -5814,6 +5838,7 @@ static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev) switch (amdgpu_asic_reset_method(adev)) { case AMD_RESET_METHOD_MODE1: + case AMD_RESET_METHOD_LINK: adev->mp1_state = PP_MP1_STATE_SHUTDOWN; break; case AMD_RESET_METHOD_MODE2: @@ -5951,6 +5976,8 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev, list_add_tail(&tmp_adev->reset_list, device_list); if (adev->shutdown) tmp_adev->shutdown = true; + if (adev->pcie_reset_ctx.occurs_dpc) + tmp_adev->pcie_reset_ctx.in_link_reset = true; } if (!list_is_first(&adev->reset_list, device_list)) list_rotate_to_front(&adev->reset_list, device_list); @@ -5960,7 +5987,7 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev, device_list_handle = device_list; } - if (!amdgpu_sriov_vf(adev)) { + if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { r = amdgpu_device_health_check(device_list_handle); if (r) return r; @@ -6005,6 +6032,7 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev, /* disable ras on ALL IPs */ if (!need_emergency_restart && + (!adev->pcie_reset_ctx.occurs_dpc) && amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); @@ -6035,7 +6063,11 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list, reset_list) { + if (adev->pcie_reset_ctx.occurs_dpc) + tmp_adev->no_hw_access = true; r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); + if (adev->pcie_reset_ctx.occurs_dpc) + tmp_adev->no_hw_access = false; /*TODO Should we stop ?*/ if (r) { dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", @@ -6634,12 +6666,15 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int i; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + struct amdgpu_reset_context reset_context; + struct list_head device_list; + int r = 0; - DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); + dev_info(adev->dev, "PCI error: detected callback!!\n"); - if (adev->gmc.xgmi.num_physical_nodes > 1) { - DRM_WARN("No support for XGMI hive yet..."); + if (!amdgpu_dpm_is_link_reset_supported(adev)) { + dev_warn(adev->dev, "No support for XGMI hive yet...\n"); return PCI_ERS_RESULT_DISCONNECT; } @@ -6647,32 +6682,30 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta switch (state) { case pci_channel_io_normal: + dev_info(adev->dev, "pci_channel_io_normal: state(%d)!!\n", state); return PCI_ERS_RESULT_CAN_RECOVER; - /* Fatal error, prepare for slot reset */ case pci_channel_io_frozen: - /* - * Locking adev->reset_domain->sem will prevent any external access - * to GPU during PCI error recovery - */ - amdgpu_device_lock_reset_domain(adev->reset_domain); - amdgpu_device_set_mp1_state(adev); - - /* - * Block any work scheduling as we do for regular GPU reset - * for the duration of the recovery - */ - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - struct amdgpu_ring *ring = adev->rings[i]; - - if (!amdgpu_ring_sched_ready(ring)) - continue; - - drm_sched_stop(&ring->sched, NULL); + /* Fatal error, prepare for slot reset */ + dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state); + + if (hive) + mutex_lock(&hive->hive_lock); + adev->pcie_reset_ctx.occurs_dpc = true; + memset(&reset_context, 0, sizeof(reset_context)); + INIT_LIST_HEAD(&device_list); + + r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, + hive, false); + if (hive) { + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); } - atomic_inc(&adev->gpu_reset_counter); + if (r) + return PCI_ERS_RESULT_DISCONNECT; return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: /* Permanent error, prepare for device removal */ + dev_info(adev->dev, "pci_channel_io_perm_failure: state(%d)!!\n", state); return PCI_ERS_RESULT_DISCONNECT; } @@ -6685,8 +6718,10 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta */ pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) { + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); - DRM_INFO("PCI error: mmio enabled callback!!\n"); + dev_info(adev->dev, "PCI error: mmio enabled callback!!\n"); /* TODO - dump whatever for debugging purposes */ @@ -6710,10 +6745,12 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int r, i; struct amdgpu_reset_context reset_context; - u32 memsize; + struct amdgpu_device *tmp_adev = NULL; + struct amdgpu_hive_info *hive = NULL; struct list_head device_list; + int r = 0, i; + u32 memsize; /* PCI error slot reset should be skipped During RAS recovery */ if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || @@ -6721,15 +6758,12 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) amdgpu_ras_in_recovery(adev)) return PCI_ERS_RESULT_RECOVERED; - DRM_INFO("PCI error: slot reset callback!!\n"); + dev_info(adev->dev, "PCI error: slot reset callback!!\n"); memset(&reset_context, 0, sizeof(reset_context)); - INIT_LIST_HEAD(&device_list); - list_add_tail(&adev->reset_list, &device_list); - /* wait for asic to come out of reset */ - msleep(500); + msleep(700); /* Restore PCI confspace */ amdgpu_device_load_pci_state(pdev); @@ -6750,26 +6784,40 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); - set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); - - adev->no_hw_access = true; - r = amdgpu_device_pre_asic_reset(adev, &reset_context); - adev->no_hw_access = false; - if (r) - goto out; + set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); + INIT_LIST_HEAD(&device_list); - r = amdgpu_do_asic_reset(&device_list, &reset_context); + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + mutex_lock(&hive->hive_lock); + reset_context.hive = hive; + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + tmp_adev->pcie_reset_ctx.in_link_reset = true; + list_add_tail(&tmp_adev->reset_list, &device_list); + } + } else { + set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); + list_add_tail(&adev->reset_list, &device_list); + } + r = amdgpu_device_asic_reset(adev, &device_list, &reset_context); out: if (!r) { if (amdgpu_device_cache_pci_state(adev->pdev)) pci_restore_state(adev->pdev); - - DRM_INFO("PCIe error recovery succeeded\n"); + dev_info(adev->dev, "PCIe error recovery succeeded\n"); } else { - DRM_ERROR("PCIe error recovery failed, err:%d", r); - amdgpu_device_unset_mp1_state(adev); - amdgpu_device_unlock_reset_domain(adev->reset_domain); + dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); + if (tmp_adev) { + list_for_each_entry(tmp_adev, &device_list, reset_list) + amdgpu_device_unset_mp1_state(tmp_adev); + amdgpu_device_unlock_reset_domain(adev->reset_domain); + } + } + + if (hive) { + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); } return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; @@ -6786,26 +6834,36 @@ void amdgpu_pci_resume(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - int i; - + struct list_head device_list; + struct amdgpu_hive_info *hive = NULL; + struct amdgpu_device *tmp_adev = NULL; - DRM_INFO("PCI error: resume callback!!\n"); + dev_info(adev->dev, "PCI error: resume callback!!\n"); /* Only continue execution for the case of pci_channel_io_frozen */ if (adev->pci_channel_state != pci_channel_io_frozen) return; - for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { - struct amdgpu_ring *ring = adev->rings[i]; + INIT_LIST_HEAD(&device_list); - if (!amdgpu_ring_sched_ready(ring)) - continue; + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + mutex_lock(&hive->hive_lock); + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { + tmp_adev->pcie_reset_ctx.in_link_reset = false; + list_add_tail(&tmp_adev->reset_list, &device_list); + } + } else + list_add_tail(&adev->reset_list, &device_list); - drm_sched_start(&ring->sched, 0); - } + amdgpu_device_sched_resume(&device_list, NULL, NULL); + amdgpu_device_gpu_resume(adev, &device_list, false); + adev->pcie_reset_ctx.occurs_dpc = false; - amdgpu_device_unset_mp1_state(adev); - amdgpu_device_unlock_reset_domain(adev->reset_domain); + if (hive) { + mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); + } } bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 659eab9b90be..c457be3a3c56 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -584,6 +584,8 @@ soc15_asic_reset_method(struct amdgpu_device *adev) * Enable triggering of GPU reset only if specified * by module parameter. */ + if (adev->pcie_reset_ctx.in_link_reset) + return AMD_RESET_METHOD_LINK; if (amdgpu_gpu_recovery == 4 || amdgpu_gpu_recovery == 5) return AMD_RESET_METHOD_MODE2; else if (!(adev->flags & AMD_IS_APU)) @@ -640,6 +642,9 @@ asic_reset: case AMD_RESET_METHOD_MODE2: dev_info(adev->dev, "MODE2 reset\n"); return amdgpu_dpm_mode2_reset(adev); + case AMD_RESET_METHOD_LINK: + dev_info(adev->dev, "Link reset\n"); + return amdgpu_device_link_reset(adev); default: dev_info(adev->dev, "MODE1 reset\n"); return amdgpu_device_mode1_reset(adev); -- cgit From 48b733d99b0d8db01fcf55d7ae31f69510fb1a4a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 27 Feb 2025 12:31:28 -0500 Subject: drm/amdgpu: add rebar parameter Add a new parameter to disable BAR resizing. Note that this only disables the driver from attempting to resize the BAR, The BIOS may have resized the BAR at boot. Some teams have found this useful in debugging P2P DMA issues on systems where the available MMIO space did not allow for all of the GPUs present to resize their BARs. Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 +++++++++++ 3 files changed, 15 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9a9e5249c63f..8316f93c1cce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -266,6 +266,7 @@ extern int amdgpu_umsch_mm_fwlog; extern int amdgpu_user_partt_mode; extern int amdgpu_agp; +extern int amdgpu_rebar; extern int amdgpu_wbrf; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7525128f971f..f4a01704effc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1680,6 +1680,9 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return 0; + if (!amdgpu_rebar) + return 0; + /* resizing on Dell G5 SE platforms causes problems with runtime pm */ if ((amdgpu_runtime_pm != 0) && adev->pdev->vendor == PCI_VENDOR_ID_ATI && diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 26bf896f1444..2ca04d6aaa82 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -238,6 +238,7 @@ int amdgpu_agp = -1; /* auto */ int amdgpu_wbrf = -1; int amdgpu_damage_clips = -1; /* auto */ int amdgpu_umsch_mm_fwlog; +int amdgpu_rebar = -1; /* auto */ DECLARE_DYNDBG_CLASSMAP(drm_debug_classes, DD_CLASS_TYPE_DISJOINT_BITS, 0, "DRM_UT_CORE", @@ -1096,6 +1097,16 @@ MODULE_PARM_DESC(wbrf, "Enable Wifi RFI interference mitigation (0 = disabled, 1 = enabled, -1 = auto(default)"); module_param_named(wbrf, amdgpu_wbrf, int, 0444); +/** + * DOC: rebar (int) + * Allow BAR resizing. Disable this to prevent the driver from attempting + * to resize the BAR if the GPU supports it and there is available MMIO space. + * Note that this just prevents the driver from resizing the BAR. The BIOS + * may have already resized the BAR at boot time. + */ +MODULE_PARM_DESC(rebar, "Resizable BAR (-1 = auto (default), 0 = disable, 1 = enable)"); +module_param_named(rebar, amdgpu_rebar, int, 0444); + /* These devices are not supported by amdgpu. * They are supported by the mach64, r128, radeon drivers */ -- cgit From daafa303d19f5522e4c24fbf5c1c981a16df2c2f Mon Sep 17 00:00:00 2001 From: Apurv Mishra Date: Mon, 17 Mar 2025 14:00:38 -0400 Subject: drm/amdkfd: Drop workaround for GC v9.4.3 revID 0 Remove workaround code for the early engineering samples GC v9.4.3 SOCs with revID 0 Reviewed-by: Amber Lin Signed-off-by: Apurv Mishra Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 +++++++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 14 ++------------ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 5 ----- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 4 ++-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 3 +-- 5 files changed, 12 insertions(+), 22 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f4a01704effc..1878bbacf0a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2692,6 +2692,13 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) break; } + /* Check for IP version 9.4.3 with A0 hardware */ + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && + !amdgpu_device_get_rev_id(adev)) { + dev_err(adev->dev, "Unsupported A0 hardware\n"); + return -ENODEV; /* device unsupported - no device error */ + } + if (amdgpu_has_atpx() && (amdgpu_is_atpx_hybrid() || amdgpu_has_atpx_dgpu_power_cntl()) && @@ -2704,7 +2711,6 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) adev->has_pr3 = parent ? pci_pr3_present(parent) : false; } - adev->pm.pp_feature = amdgpu_pp_feature_mask; if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 783e0c3b86b4..8d3560314e5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1213,10 +1213,7 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev, if (uncached) { mtype = MTYPE_UC; } else if (ext_coherent) { - if (gc_ip_version == IP_VERSION(9, 5, 0) || adev->rev_id) - mtype = is_local ? MTYPE_CC : MTYPE_UC; - else - mtype = MTYPE_UC; + mtype = is_local ? MTYPE_CC : MTYPE_UC; } else if (adev->flags & AMD_IS_APU) { mtype = is_local ? mtype_local : MTYPE_NC; } else { @@ -1336,7 +1333,7 @@ static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev, mtype_local = MTYPE_CC; *flags = AMDGPU_PTE_MTYPE_VG10(*flags, mtype_local); - } else if (adev->rev_id) { + } else { /* MTYPE_UC case */ *flags = AMDGPU_PTE_MTYPE_VG10(*flags, MTYPE_CC); } @@ -2411,13 +2408,6 @@ static int gmc_v9_0_hw_init(struct amdgpu_ip_block *ip_block) adev->gmc.flush_tlb_needs_extra_type_2 = amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 0) && adev->gmc.xgmi.num_physical_nodes; - /* - * TODO: This workaround is badly documented and had a buggy - * implementation. We should probably verify what we do here. - */ - adev->gmc.flush_tlb_needs_extra_type_0 = - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) && - adev->rev_id == 0; /* The sequence of these two function calls matters.*/ gmc_v9_0_init_golden_registers(adev); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index b9c82be6ce13..bf0854bd5555 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -352,11 +352,6 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) f2g = &aldebaran_kfd2kgd; break; case IP_VERSION(9, 4, 3): - gfx_target_version = adev->rev_id >= 1 ? 90402 - : adev->flags & AMD_IS_APU ? 90400 - : 90401; - f2g = &gc_9_4_3_kfd2kgd; - break; case IP_VERSION(9, 4, 4): gfx_target_version = 90402; f2g = &gc_9_4_3_kfd2kgd; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 4afff7094caf..a65c67cf56ff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -402,7 +402,7 @@ static u32 kfd_get_vgpr_size_per_cu(u32 gfxv) { u32 vgpr_size = 0x40000; - if ((gfxv / 100 * 100) == 90400 || /* GFX_VERSION_AQUA_VANJARAM */ + if (gfxv == 90402 || /* GFX_VERSION_AQUA_VANJARAM */ gfxv == 90010 || /* GFX_VERSION_ALDEBARAN */ gfxv == 90008 || /* GFX_VERSION_ARCTURUS */ gfxv == 90500) @@ -462,7 +462,7 @@ void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev) if (gfxv == 80002) /* GFX_VERSION_TONGA */ props->eop_buffer_size = 0x8000; - else if ((gfxv / 100 * 100) == 90400) /* GFX_VERSION_AQUA_VANJARAM */ + else if (gfxv == 90402) /* GFX_VERSION_AQUA_VANJARAM */ props->eop_buffer_size = 4096; else if (gfxv >= 80000) props->eop_buffer_size = 4096; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 100717a98ec1..72be6e152e88 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -1245,8 +1245,7 @@ svm_range_get_pte_flags(struct kfd_node *node, case IP_VERSION(9, 4, 4): case IP_VERSION(9, 5, 0): if (ext_coherent) - mtype_local = (gc_ip_version < IP_VERSION(9, 5, 0) && !node->adev->rev_id) ? - AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_CC; + mtype_local = AMDGPU_VM_MTYPE_CC; else mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; -- cgit From 906ad451675155380c1dc1881a244ebde8e8df0a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sun, 6 Apr 2025 17:27:24 -0400 Subject: drm/amdgpu: cancel gfx idle work in device suspend for s0ix This is normally handled in the gfx IP suspend callbacks, but for S0ix, those are skipped because we don't want to touch gfx. So handle it in device suspend. Fixes: b9467983b774 ("drm/amdgpu: add dynamic workload profile switching for gfx10") Fixes: 963537ca2325 ("drm/amdgpu: add dynamic workload profile switching for gfx11") Fixes: 5f95a1549555 ("drm/amdgpu: add dynamic workload profile switching for gfx12") Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1878bbacf0a2..979942f26dda 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3653,6 +3653,13 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) adev, adev->ip_blocks[i].version->type)) continue; + /* Since we skip suspend for S0i3, we need to cancel the delayed + * idle work here as the suspend callback never gets called. + */ + if (adev->in_s0ix && + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX && + amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 0, 0)) + cancel_delayed_work_sync(&adev->gfx.idle_work); /* skip suspend of gfx/mes and psp for S0ix * gfx is in gfxoff state, so on resume it will exit gfxoff just * like at runtime. PSP is also part of the always on hardware -- cgit From bb00bf17328d80f519df93bebc41f6c9171547d6 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Tue, 1 Apr 2025 15:56:41 +0800 Subject: drm/amd/amdgpu: decouple ASPM with pcie dpm ASPM doesn't need to be disabled if pcie dpm is disabled. So ASPM can be independantly enabled. Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 979942f26dda..cfaa5d77b20a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1897,8 +1897,6 @@ bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) } if (adev->flags & AMD_IS_APU) return false; - if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) - return false; return pcie_aspm_enabled(adev->pdev); } -- cgit From 8949843762631d9d0fc526dfb61a272dca29fc6f Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 10:38:46 +0530 Subject: drm/amdgpu: Enable userq fence interrupt support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to handle the userqueue protected fence signal hardware interrupt. Create a xarray which maps the doorbell index to the fence driver address. This would help to retrieve the fence driver information when an userq fence interrupt is triggered. Firmware sends the doorbell offset value and this info is compared with the queue's mqd doorbell offset value. If they are same, we process the userq fence interrupt. v1:(Christian): - use xa_load to extract the fence driver. - move the amdgpu_userq_fence_driver_process call within the xa_lock as there is a chance that fence_drv might be freed. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 25 ++++++++++++------------- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index cfaa5d77b20a..fdd271edb0b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4336,6 +4336,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, spin_lock_init(&adev->virt.rlcg_reg_lock); spin_lock_init(&adev->wb.lock); + xa_init_flags(&adev->userq_xa, XA_FLAGS_LOCK_IRQ); + INIT_LIST_HEAD(&adev->reset_list); INIT_LIST_HEAD(&adev->ras_list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index 1a9565b61266..cd473c985e36 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -71,6 +71,7 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, struct amdgpu_usermode_queue *userq) { struct amdgpu_userq_fence_driver *fence_drv; + unsigned long flags; int r; fence_drv = kzalloc(sizeof(*fence_drv), GFP_KERNEL); @@ -98,6 +99,11 @@ int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev, fence_drv->context = dma_fence_context_alloc(1); get_task_comm(fence_drv->timeline_name, current); + xa_lock_irqsave(&adev->userq_xa, flags); + __xa_store(&adev->userq_xa, userq->doorbell_index, + fence_drv, GFP_KERNEL); + xa_unlock_irqrestore(&adev->userq_xa, flags); + userq->fence_drv = fence_drv; return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 62a6da083a8f..b061b654d1ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -49,6 +49,7 @@ #include "nbio_v4_3.h" #include "mes_v11_0.h" #include "mes_v11_0_userqueue.h" +#include "amdgpu_userq_fence.h" #define GFX11_NUM_GFX_RINGS 1 #define GFX11_MEC_HPD_SIZE 2048 @@ -6334,25 +6335,23 @@ static int gfx_v11_0_eop_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { - int i; + u32 doorbell_offset = entry->src_data[0]; u8 me_id, pipe_id, queue_id; struct amdgpu_ring *ring; - uint32_t mes_queue_id = entry->src_data[0]; + int i; DRM_DEBUG("IH: CP EOP\n"); - if (adev->enable_mes && (mes_queue_id & AMDGPU_FENCE_MES_QUEUE_FLAG)) { - struct amdgpu_mes_queue *queue; + if (adev->enable_mes && doorbell_offset) { + struct amdgpu_userq_fence_driver *fence_drv = NULL; + struct xarray *xa = &adev->userq_xa; + unsigned long flags; - mes_queue_id &= AMDGPU_FENCE_MES_QUEUE_ID_MASK; - - spin_lock(&adev->mes.queue_id_lock); - queue = idr_find(&adev->mes.queue_id_idr, mes_queue_id); - if (queue) { - DRM_DEBUG("process mes queue id = %d\n", mes_queue_id); - amdgpu_fence_process(queue->ring); - } - spin_unlock(&adev->mes.queue_id_lock); + xa_lock_irqsave(xa, flags); + fence_drv = xa_load(xa, doorbell_offset); + if (fence_drv) + amdgpu_userq_fence_driver_process(fence_drv); + xa_unlock_irqrestore(xa, flags); } else { me_id = (entry->ring_id & 0x0c) >> 2; pipe_id = (entry->ring_id & 0x03) >> 0; -- cgit From ac4a1f7f13309f8235a0c4719c34094de3303dfd Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Wed, 30 Oct 2024 10:46:49 +0530 Subject: drm/amdgpu: Remove the MES self test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove MES self test as this conflicts the userqueue fence interrupts. v2:(Christian) - remove the amdgpu_mes_self_test() function and any now unused code. Signed-off-by: Arunpravin Paneer Selvam Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 - drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 169 ----------------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 2 - drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 14 +-- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 13 +-- 5 files changed, 2 insertions(+), 199 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index fdd271edb0b9..4da18ed0beea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5144,9 +5144,6 @@ exit: } adev->in_suspend = false; - if (adev->enable_mes) - amdgpu_mes_self_test(adev); - if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0)) DRM_WARN("smart shift update failed\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index f8202e6ed4b1..acdca3110c24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -1405,175 +1405,6 @@ out_unlock: return r; } -static int amdgpu_mes_test_create_gang_and_queues(struct amdgpu_device *adev, - int pasid, int *gang_id, - int queue_type, int num_queue, - struct amdgpu_ring **added_rings, - struct amdgpu_mes_ctx_data *ctx_data) -{ - struct amdgpu_ring *ring; - struct amdgpu_mes_gang_properties gprops = {0}; - int r, j; - - /* create a gang for the process */ - gprops.priority = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; - gprops.gang_quantum = adev->mes.default_gang_quantum; - gprops.inprocess_gang_priority = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; - gprops.priority_level = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; - gprops.global_priority_level = AMDGPU_MES_PRIORITY_LEVEL_NORMAL; - - r = amdgpu_mes_add_gang(adev, pasid, &gprops, gang_id); - if (r) { - DRM_ERROR("failed to add gang\n"); - return r; - } - - /* create queues for the gang */ - for (j = 0; j < num_queue; j++) { - r = amdgpu_mes_add_ring(adev, *gang_id, queue_type, j, - ctx_data, &ring); - if (r) { - DRM_ERROR("failed to add ring\n"); - break; - } - - DRM_INFO("ring %s was added\n", ring->name); - added_rings[j] = ring; - } - - return 0; -} - -static int amdgpu_mes_test_queues(struct amdgpu_ring **added_rings) -{ - struct amdgpu_ring *ring; - int i, r; - - for (i = 0; i < AMDGPU_MES_CTX_MAX_RINGS; i++) { - ring = added_rings[i]; - if (!ring) - continue; - - r = amdgpu_ring_test_helper(ring); - if (r) - return r; - - r = amdgpu_ring_test_ib(ring, 1000 * 10); - if (r) { - DRM_DEV_ERROR(ring->adev->dev, - "ring %s ib test failed (%d)\n", - ring->name, r); - return r; - } else - DRM_INFO("ring %s ib test pass\n", ring->name); - } - - return 0; -} - -int amdgpu_mes_self_test(struct amdgpu_device *adev) -{ - struct amdgpu_vm *vm = NULL; - struct amdgpu_mes_ctx_data ctx_data = {0}; - struct amdgpu_ring *added_rings[AMDGPU_MES_CTX_MAX_RINGS] = { NULL }; - int gang_ids[3] = {0}; - int queue_types[][2] = { { AMDGPU_RING_TYPE_GFX, 1 }, - { AMDGPU_RING_TYPE_COMPUTE, 1 }, - { AMDGPU_RING_TYPE_SDMA, 1} }; - int i, r, pasid, k = 0; - - pasid = amdgpu_pasid_alloc(16); - if (pasid < 0) { - dev_warn(adev->dev, "No more PASIDs available!"); - pasid = 0; - } - - vm = kzalloc(sizeof(*vm), GFP_KERNEL); - if (!vm) { - r = -ENOMEM; - goto error_pasid; - } - - r = amdgpu_vm_init(adev, vm, -1); - if (r) { - DRM_ERROR("failed to initialize vm\n"); - goto error_pasid; - } - - r = amdgpu_mes_ctx_alloc_meta_data(adev, &ctx_data); - if (r) { - DRM_ERROR("failed to alloc ctx meta data\n"); - goto error_fini; - } - - ctx_data.meta_data_gpu_addr = AMDGPU_VA_RESERVED_BOTTOM; - r = amdgpu_mes_ctx_map_meta_data(adev, vm, &ctx_data); - if (r) { - DRM_ERROR("failed to map ctx meta data\n"); - goto error_vm; - } - - r = amdgpu_mes_create_process(adev, pasid, vm); - if (r) { - DRM_ERROR("failed to create MES process\n"); - goto error_vm; - } - - for (i = 0; i < ARRAY_SIZE(queue_types); i++) { - /* On GFX v10.3, fw hasn't supported to map sdma queue. */ - if (amdgpu_ip_version(adev, GC_HWIP, 0) >= - IP_VERSION(10, 3, 0) && - amdgpu_ip_version(adev, GC_HWIP, 0) < - IP_VERSION(11, 0, 0) && - queue_types[i][0] == AMDGPU_RING_TYPE_SDMA) - continue; - - r = amdgpu_mes_test_create_gang_and_queues(adev, pasid, - &gang_ids[i], - queue_types[i][0], - queue_types[i][1], - &added_rings[k], - &ctx_data); - if (r) - goto error_queues; - - k += queue_types[i][1]; - } - - /* start ring test and ib test for MES queues */ - amdgpu_mes_test_queues(added_rings); - -error_queues: - /* remove all queues */ - for (i = 0; i < ARRAY_SIZE(added_rings); i++) { - if (!added_rings[i]) - continue; - amdgpu_mes_remove_ring(adev, added_rings[i]); - } - - for (i = 0; i < ARRAY_SIZE(gang_ids); i++) { - if (!gang_ids[i]) - continue; - amdgpu_mes_remove_gang(adev, gang_ids[i]); - } - - amdgpu_mes_destroy_process(adev, pasid); - -error_vm: - amdgpu_mes_ctx_unmap_meta_data(adev, &ctx_data); - -error_fini: - amdgpu_vm_fini(adev, vm); - -error_pasid: - if (pasid) - amdgpu_pasid_free(pasid); - - amdgpu_mes_ctx_free_meta_data(&ctx_data); - kfree(vm); - return 0; -} - int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe) { const struct mes_firmware_header_v1_0 *mes_hdr; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index 52dd54a32fb4..78362a838212 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -470,8 +470,6 @@ int amdgpu_mes_ctx_map_meta_data(struct amdgpu_device *adev, int amdgpu_mes_ctx_unmap_meta_data(struct amdgpu_device *adev, struct amdgpu_mes_ctx_data *ctx_data); -int amdgpu_mes_self_test(struct amdgpu_device *adev); - int amdgpu_mes_doorbell_process_slice(struct amdgpu_device *adev); /* diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 4cfd86aa2ea3..ccc19a40f03d 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1712,22 +1712,10 @@ static int mes_v11_0_early_init(struct amdgpu_ip_block *ip_block) return 0; } -static int mes_v11_0_late_init(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - /* it's only intended for use in mes_self_test case, not for s0ix and reset */ - if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend && - (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(11, 0, 3))) - amdgpu_mes_self_test(adev); - - return 0; -} - static const struct amd_ip_funcs mes_v11_0_ip_funcs = { .name = "mes_v11_0", .early_init = mes_v11_0_early_init, - .late_init = mes_v11_0_late_init, + .late_init = NULL, .sw_init = mes_v11_0_sw_init, .sw_fini = mes_v11_0_sw_fini, .hw_init = mes_v11_0_hw_init, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index 62aba0b5dbe2..801928555eff 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -1820,21 +1820,10 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block) return 0; } -static int mes_v12_0_late_init(struct amdgpu_ip_block *ip_block) -{ - struct amdgpu_device *adev = ip_block->adev; - - /* it's only intended for use in mes_self_test case, not for s0ix and reset */ - if (!amdgpu_in_reset(adev) && !adev->in_s0ix && !adev->in_suspend) - amdgpu_mes_self_test(adev); - - return 0; -} - static const struct amd_ip_funcs mes_v12_0_ip_funcs = { .name = "mes_v12_0", .early_init = mes_v12_0_early_init, - .late_init = mes_v12_0_late_init, + .late_init = NULL, .sw_init = mes_v12_0_sw_init, .sw_fini = mes_v12_0_sw_fini, .hw_init = mes_v12_0_hw_init, -- cgit From 4ce60dbada9639e385f2f4be2dacf10d8ba22378 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 20 Feb 2025 15:56:24 -0500 Subject: drm/amdgpu: store userq_managers in a list in adev So we can iterate across them when we need to manage all user queues. v2: add uq_mgr to adev list in amdgpu_userq_mgr_init Reviewed-by: Arunpravin Paneer Selvam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 16 +++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h | 1 + 4 files changed, 22 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 3cdb5f8325aa..f88a04b1c4ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1237,6 +1237,9 @@ struct amdgpu_device { * in KFD: VRAM or GTT. */ bool apu_prefer_gtt; + + struct list_head userq_mgr_list; + struct mutex userq_mutex; }; static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4da18ed0beea..6b38b0511330 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4317,6 +4317,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->gfx.kfd_sch_mutex); mutex_init(&adev->gfx.workload_profile_mutex); mutex_init(&adev->vcn.workload_profile_mutex); + mutex_init(&adev->userq_mutex); amdgpu_device_init_apu_flags(adev); @@ -4344,6 +4345,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_LIST_HEAD(&adev->pm.od_kobj_list); + INIT_LIST_HEAD(&adev->userq_mgr_list); + INIT_DELAYED_WORK(&adev->delayed_init_work, amdgpu_device_delayed_init_work_handler); INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c index beae931152a3..ecd49cf15b2a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c @@ -658,20 +658,34 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_devi idr_init_base(&userq_mgr->userq_idr, 1); userq_mgr->adev = adev; + mutex_lock(&adev->userq_mutex); + list_add(&userq_mgr->list, &adev->userq_mgr_list); + mutex_unlock(&adev->userq_mutex); + INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userqueue_resume_worker); return 0; } void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr) { - uint32_t queue_id; + struct amdgpu_device *adev = userq_mgr->adev; struct amdgpu_usermode_queue *queue; + struct amdgpu_userq_mgr *uqm, *tmp; + uint32_t queue_id; cancel_delayed_work(&userq_mgr->resume_work); mutex_lock(&userq_mgr->userq_mutex); idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) amdgpu_userqueue_cleanup(userq_mgr, queue, queue_id); + mutex_lock(&adev->userq_mutex); + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { + if (uqm == userq_mgr) { + list_del(&uqm->list); + break; + } + } + mutex_unlock(&adev->userq_mutex); idr_destroy(&userq_mgr->userq_idr); mutex_unlock(&userq_mgr->userq_mutex); mutex_destroy(&userq_mgr->userq_mutex); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h index 0f358f77f2d9..ec1a4ca6f632 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.h @@ -76,6 +76,7 @@ struct amdgpu_userq_mgr { struct mutex userq_mutex; struct amdgpu_device *adev; struct delayed_work resume_work; + struct list_head list; }; struct amdgpu_db_info { -- cgit From c770ef19673fb1defcbde2ee2b91c3c89bfcf164 Mon Sep 17 00:00:00 2001 From: Kenneth Feng Date: Tue, 1 Apr 2025 16:04:41 +0800 Subject: drm/amd/amdgpu: disable ASPM in some situations disable ASPM with some ASICs on some specific platforms. required from PCIe controller owner. Signed-off-by: Kenneth Feng Reviewed-by: Yang Wang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6b38b0511330..84ddd02579bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -85,6 +85,7 @@ #if IS_ENABLED(CONFIG_X86) #include +#include #endif MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); @@ -1873,6 +1874,35 @@ static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device return true; } +static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev) +{ +#if IS_ENABLED(CONFIG_X86) + struct cpuinfo_x86 *c = &cpu_data(0); + + if (!(amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 0) || + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 0, 1))) + return false; + + if (c->x86 == 6 && + adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5) { + switch (c->x86_model) { + case VFM_MODEL(INTEL_ALDERLAKE): + case VFM_MODEL(INTEL_ALDERLAKE_L): + case VFM_MODEL(INTEL_RAPTORLAKE): + case VFM_MODEL(INTEL_RAPTORLAKE_P): + case VFM_MODEL(INTEL_RAPTORLAKE_S): + return true; + default: + return false; + } + } else { + return false; + } +#else + return false; +#endif +} + /** * amdgpu_device_should_use_aspm - check if the device should program ASPM * @@ -1897,6 +1927,8 @@ bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) } if (adev->flags & AMD_IS_APU) return false; + if (amdgpu_device_aspm_support_quirk(adev)) + return false; return pcie_aspm_enabled(adev->pdev); } -- cgit From 39938a8ed979e398faa3791a47e282c82bcc6f04 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Tue, 8 Apr 2025 16:18:28 +0800 Subject: drm/amdgpu: fix warning of drm_mm_clean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel doorbell BOs needs to be freed before ttm_fini. Fixes: 54c30d2a8def ("drm/amdgpu: create kernel doorbell pages") Acked-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: ZhenGuo Yin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 84ddd02579bb..66ed6a7924ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3550,6 +3550,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) amdgpu_device_mem_scratch_fini(adev); amdgpu_ib_pool_fini(adev); amdgpu_seq64_fini(adev); + amdgpu_doorbell_fini(adev); } if (adev->ip_blocks[i].version->funcs->sw_fini) { r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); @@ -4903,7 +4904,6 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) iounmap(adev->rmmio); adev->rmmio = NULL; - amdgpu_doorbell_fini(adev); drm_dev_exit(idx); } -- cgit From 732c6cefc1ecfc8de5d7a2029480798655d979d8 Mon Sep 17 00:00:00 2001 From: Ce Sun Date: Wed, 9 Apr 2025 19:53:11 +0800 Subject: drm/amdgpu: Replace tmp_adev with hive in amdgpu_pci_slot_reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checking hive is more readable. The following smatch warning: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:6820 amdgpu_pci_slot_reset() warn: iterator used outside loop: 'tmp_adev' Reported-by: Dan Carpenter Signed-off-by: Ce Sun Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 66ed6a7924ef..8e150e9393c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6794,8 +6794,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_reset_context reset_context; - struct amdgpu_device *tmp_adev = NULL; - struct amdgpu_hive_info *hive = NULL; + struct amdgpu_device *tmp_adev; + struct amdgpu_hive_info *hive; struct list_head device_list; int r = 0, i; u32 memsize; @@ -6856,7 +6856,7 @@ out: dev_info(adev->dev, "PCIe error recovery succeeded\n"); } else { dev_err(adev->dev, "PCIe error recovery failed, err:%d\n", r); - if (tmp_adev) { + if (hive) { list_for_each_entry(tmp_adev, &device_list, reset_list) amdgpu_device_unset_mp1_state(tmp_adev); amdgpu_device_unlock_reset_domain(adev->reset_domain); -- cgit From 2e0454b730648e9349ca54eb4a8142d77e8e7008 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 8 Apr 2025 10:39:06 -0400 Subject: drm/amdgpu: adjust enforce_isolation handling Switch from a bool to an enum and allow more options for enforce isolation. There are now 3 modes of operation: - Disabled (0) - Enabled (serialization and cleaner shader) (1) - Enabled in legacy mode (no serialization or cleaner shader) (2) This provides better flexibility for more use cases. Acked-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 16 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 12 ++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 39 ++++++++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 +- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/mes_v12_0.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c | 11 +++--- 12 files changed, 93 insertions(+), 30 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bb5df7831308..b156e31ac86a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -230,7 +230,7 @@ extern int amdgpu_force_asic_type; extern int amdgpu_smartshift_bias; extern int amdgpu_use_xgmi_p2p; extern int amdgpu_mtype_local; -extern bool enforce_isolation; +extern int amdgpu_enforce_isolation; #ifdef CONFIG_HSA_AMD extern int sched_policy; extern bool debug_evictions; @@ -873,6 +873,13 @@ struct amdgpu_init_level { struct amdgpu_reset_domain; struct amdgpu_fru_info; +enum amdgpu_enforce_isolation_mode { + AMDGPU_ENFORCE_ISOLATION_DISABLE = 0, + AMDGPU_ENFORCE_ISOLATION_ENABLE = 1, + AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY = 2, +}; + + /* * Non-zero (true) if the GPU has VRAM. Zero (false) otherwise. */ @@ -1225,7 +1232,7 @@ struct amdgpu_device { /* Protection for the following isolation structure */ struct mutex enforce_isolation_mutex; - bool enforce_isolation[MAX_XCP]; + enum amdgpu_enforce_isolation_mode enforce_isolation[MAX_XCP]; struct amdgpu_isolation { void *owner; struct dma_fence *spearhead; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index ea047305eb64..0941b3495b2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -296,7 +296,21 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, num_ibs[i], &p->jobs[i]); if (ret) goto free_all_kdata; - p->jobs[i]->enforce_isolation = p->adev->enforce_isolation[fpriv->xcp_id]; + switch (p->adev->enforce_isolation[fpriv->xcp_id]) { + case AMDGPU_ENFORCE_ISOLATION_DISABLE: + default: + p->jobs[i]->enforce_isolation = false; + p->jobs[i]->run_cleaner_shader = false; + break; + case AMDGPU_ENFORCE_ISOLATION_ENABLE: + p->jobs[i]->enforce_isolation = true; + p->jobs[i]->run_cleaner_shader = true; + break; + case AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY: + p->jobs[i]->enforce_isolation = true; + p->jobs[i]->run_cleaner_shader = false; + break; + } } p->gang_leader = p->jobs[p->gang_leader_idx]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8e150e9393c7..475bcd2a8a31 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2145,8 +2145,26 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); - for (i = 0; i < MAX_XCP; i++) - adev->enforce_isolation[i] = !!enforce_isolation; + for (i = 0; i < MAX_XCP; i++) { + switch (amdgpu_enforce_isolation) { + case -1: + case 0: + default: + /* disable */ + adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; + break; + case 1: + /* enable */ + adev->enforce_isolation[i] = + AMDGPU_ENFORCE_ISOLATION_ENABLE; + break; + case 2: + /* enable legacy mode */ + adev->enforce_isolation[i] = + AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; + break; + } + } return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index f5e83acb6169..a117cd95b9dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -179,7 +179,7 @@ uint amdgpu_pg_mask = 0xffffffff; uint amdgpu_sdma_phase_quantum = 32; char *amdgpu_disable_cu; char *amdgpu_virtual_display; -bool enforce_isolation; +int amdgpu_enforce_isolation = -1; int amdgpu_modeset = -1; /* Specifies the default granularity for SVM, used in buffer @@ -1038,11 +1038,13 @@ module_param_named(user_partt_mode, amdgpu_user_partt_mode, uint, 0444); /** - * DOC: enforce_isolation (bool) - * enforce process isolation between graphics and compute via using the same reserved vmid. + * DOC: enforce_isolation (int) + * enforce process isolation between graphics and compute. + * (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode) */ -module_param(enforce_isolation, bool, 0444); -MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics and compute . enforce_isolation = on"); +module_param_named(enforce_isolation, amdgpu_enforce_isolation, int, 0444); +MODULE_PARM_DESC(enforce_isolation, +"enforce process isolation between graphics and compute. (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)"); /** * DOC: modeset (int) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 663830c6c73b..2c933d436e56 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1468,6 +1468,8 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct amdgpu_ring *ring) goto err; job->enforce_isolation = true; + /* always run the cleaner shader */ + job->run_cleaner_shader = true; ib = &job->ibs[0]; for (i = 0; i <= ring->funcs->align_mask; ++i) @@ -1599,7 +1601,7 @@ static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev, * Provides the sysfs read interface to get the current settings of the 'enforce_isolation' * feature for each GPU partition. Reading from the 'enforce_isolation' * sysfs file returns the isolation settings for all partitions, where '0' - * indicates disabled and '1' indicates enabled. + * indicates disabled, '1' indicates enabled, and '2' indicates enabled in legacy mode. * * Return: The number of bytes read from the sysfs file. */ @@ -1634,9 +1636,10 @@ static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev, * @count: The size of the input data * * This function allows control over the 'enforce_isolation' feature, which - * serializes access to the graphics engine. Writing '1' or '0' to the - * 'enforce_isolation' sysfs file enables or disables process isolation for - * each partition. The input should specify the setting for all partitions. + * serializes access to the graphics engine. Writing '1', '2', or '0' to the + * 'enforce_isolation' sysfs file enables (full or legacy) or disables process + * isolation for each partition. The input should specify the setting for all + * partitions. * * Return: The number of bytes written to the sysfs file. */ @@ -1673,13 +1676,29 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, return -EINVAL; for (i = 0; i < num_partitions; i++) { - if (partition_values[i] != 0 && partition_values[i] != 1) + if (partition_values[i] != 0 && + partition_values[i] != 1 && + partition_values[i] != 2) return -EINVAL; } mutex_lock(&adev->enforce_isolation_mutex); - for (i = 0; i < num_partitions; i++) - adev->enforce_isolation[i] = partition_values[i]; + for (i = 0; i < num_partitions; i++) { + switch (partition_values[i]) { + case 0: + default: + adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_DISABLE; + break; + case 1: + adev->enforce_isolation[i] = + AMDGPU_ENFORCE_ISOLATION_ENABLE; + break; + case 2: + adev->enforce_isolation[i] = + AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; + break; + } + } mutex_unlock(&adev->enforce_isolation_mutex); amdgpu_mes_update_enforce_isolation(adev); @@ -2034,7 +2053,7 @@ amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev, bool wait = false; mutex_lock(&adev->enforce_isolation_mutex); - if (adev->enforce_isolation[idx]) { + if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) { /* set the initial values if nothing is set */ if (!adev->gfx.enforce_isolation_jiffies[idx]) { adev->gfx.enforce_isolation_jiffies[idx] = jiffies; @@ -2101,7 +2120,7 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx); mutex_lock(&adev->enforce_isolation_mutex); - if (adev->enforce_isolation[idx]) { + if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) { if (adev->kfd.init_complete) sched_work = true; } @@ -2138,7 +2157,7 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring) return; mutex_lock(&adev->enforce_isolation_mutex); - if (adev->enforce_isolation[idx]) { + if (adev->enforce_isolation[idx] == AMDGPU_ENFORCE_ISOLATION_ENABLE) { if (adev->kfd.init_complete) sched_work = true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 4c4e087230ac..359c19de9a5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -588,7 +588,7 @@ void amdgpu_vmid_mgr_init(struct amdgpu_device *adev) } /* alloc a default reserved vmid to enforce isolation */ for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) { - if (adev->enforce_isolation[i]) + if (adev->enforce_isolation[i] != AMDGPU_ENFORCE_ISOLATION_DISABLE) amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(i)); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index ce6b9ba967ff..f2c049129661 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -78,6 +78,7 @@ struct amdgpu_job { /* enforce isolation */ bool enforce_isolation; + bool run_cleaner_shader; uint32_t num_ibs; struct amdgpu_ib ibs[]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 36f2e8716126..38ea64d87a0a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -768,7 +768,7 @@ int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev) if (adev->enable_mes && adev->gfx.enable_cleaner_shader) { mutex_lock(&adev->enforce_isolation_mutex); for (i = 0; i < (adev->xcp_mgr ? adev->xcp_mgr->num_xcps : 1); i++) { - if (adev->enforce_isolation[i]) + if (adev->enforce_isolation[i] == AMDGPU_ENFORCE_ISOLATION_ENABLE) r |= amdgpu_mes_set_enforce_isolation(adev, i, true); else r |= amdgpu_mes_set_enforce_isolation(adev, i, false); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index aa65d64fb15c..3911c78f8282 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -787,7 +787,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping && ring->funcs->emit_wreg; - cleaner_shader_needed = adev->gfx.enable_cleaner_shader && + cleaner_shader_needed = job->run_cleaner_shader && + adev->gfx.enable_cleaner_shader && ring->funcs->emit_cleaner_shader && job->base.s_fence && &job->base.s_fence->scheduled == isolation->spearhead; diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 344d32268c3c..f7aa45775ead 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -724,7 +724,7 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes) mes->event_log_gpu_addr; } - if (adev->enforce_isolation[0]) + if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE) mes_set_hw_res_pkt.limit_single_process = 1; return mes_v11_0_submit_pkt_and_poll_completion(mes, diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c index be43e19b7b7f..b0e042a4cea1 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c @@ -762,7 +762,7 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe) pipe * (AMDGPU_MES_LOG_BUFFER_SIZE + AMDGPU_MES_MSCRATCH_SIZE); } - if (adev->enforce_isolation[0]) + if (adev->enforce_isolation[0] == AMDGPU_ENFORCE_ISOLATION_ENABLE) mes_set_hw_res_pkt.limit_single_process = 1; return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c index 2893fd5e5d00..fa28c57692b8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c @@ -43,7 +43,7 @@ static int pm_map_process_v9(struct packet_manager *pm, memset(buffer, 0, sizeof(struct pm4_mes_map_process)); packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process)); - if (adev->enforce_isolation[kfd->node_id]) + if (adev->enforce_isolation[kfd->node_id] == AMDGPU_ENFORCE_ISOLATION_ENABLE) packet->bitfields2.exec_cleaner_shader = 1; packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; @@ -102,7 +102,8 @@ static int pm_map_process_aldebaran(struct packet_manager *pm, memset(buffer, 0, sizeof(struct pm4_mes_map_process_aldebaran)); packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process_aldebaran)); - if (adev->enforce_isolation[knode->node_id]) + if (adev->enforce_isolation[knode->node_id] == + AMDGPU_ENFORCE_ISOLATION_ENABLE) packet->bitfields2.exec_cleaner_shader = 1; packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 10; @@ -165,9 +166,9 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, * hws_max_conc_proc has been done in * kgd2kfd_device_init(). */ - concurrent_proc_cnt = adev->enforce_isolation[kfd->node_id] ? - 1 : min(pm->dqm->processes_count, - kfd->max_proc_per_quantum); + concurrent_proc_cnt = (adev->enforce_isolation[kfd->node_id] == + AMDGPU_ENFORCE_ISOLATION_ENABLE) ? + 1 : min(pm->dqm->processes_count, kfd->max_proc_per_quantum); packet = (struct pm4_mes_runlist *)buffer; -- cgit From c2c722217af4d2d17de8665968975a70a08046ea Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 21 Feb 2025 12:16:34 -0500 Subject: drm/amdgpu/userq: handle system suspend and resume Unmap user queues on suspend and map them on resume. Reviewed-by: Sunil Khatri Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 475bcd2a8a31..e966aefc2b0f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3513,6 +3513,9 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); amdgpu_amdkfd_suspend(adev, false); +#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ + amdgpu_userq_suspend(adev); +#endif /* Workaround for ASICs need to disable SMC first */ amdgpu_device_smu_fini_early(adev); @@ -5081,8 +5084,12 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) amdgpu_device_ip_suspend_phase1(adev); - if (!adev->in_s0ix) + if (!adev->in_s0ix) { amdgpu_amdkfd_suspend(adev, adev->in_runpm); +#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ + amdgpu_userq_suspend(adev); +#endif + } r = amdgpu_device_evict_resources(adev); if (r) @@ -5149,6 +5156,11 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) r = amdgpu_amdkfd_resume(adev, adev->in_runpm); if (r) goto exit; +#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ + r = amdgpu_userq_resume(adev); + if (r) + goto exit; +#endif } r = amdgpu_device_ip_late_init(adev); -- cgit From 28fc3172e4204c8cdd8c70226ea02b0ae9930b69 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 21 Feb 2025 15:20:45 -0500 Subject: drm/amdgpu: rename enforce isolation variables Since they will be used for both KFD and KGD user queues, rename them from kfd to userq. No intended functional change. Acked-by: Sunil Khatri Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 32 +++++++++++++++--------------- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 6 +++--- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e966aefc2b0f..b96e0613ea7e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4368,7 +4368,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, amdgpu_sync_create(&adev->isolation[i].active); amdgpu_sync_create(&adev->isolation[i].prev); } - mutex_init(&adev->gfx.kfd_sch_mutex); + mutex_init(&adev->gfx.userq_sch_mutex); mutex_init(&adev->gfx.workload_profile_mutex); mutex_init(&adev->vcn.workload_profile_mutex); mutex_init(&adev->userq_mutex); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 2c933d436e56..c58d32983c45 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1947,39 +1947,40 @@ void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev, static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx, bool enable) { - mutex_lock(&adev->gfx.kfd_sch_mutex); + mutex_lock(&adev->gfx.userq_sch_mutex); if (enable) { /* If the count is already 0, it means there's an imbalance bug somewhere. * Note that the bug may be in a different caller than the one which triggers the * WARN_ON_ONCE. */ - if (WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx] == 0)) { + if (WARN_ON_ONCE(adev->gfx.userq_sch_req_count[idx] == 0)) { dev_err(adev->dev, "Attempted to enable KFD scheduler when reference count is already zero\n"); goto unlock; } - adev->gfx.kfd_sch_req_count[idx]--; + adev->gfx.userq_sch_req_count[idx]--; - if (adev->gfx.kfd_sch_req_count[idx] == 0 && - adev->gfx.kfd_sch_inactive[idx]) { + if (adev->gfx.userq_sch_req_count[idx] == 0 && + adev->gfx.userq_sch_inactive[idx]) { schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work, msecs_to_jiffies(adev->gfx.enforce_isolation_time[idx])); } } else { - if (adev->gfx.kfd_sch_req_count[idx] == 0) { + if (adev->gfx.userq_sch_req_count[idx] == 0) { cancel_delayed_work_sync(&adev->gfx.enforce_isolation[idx].work); - if (!adev->gfx.kfd_sch_inactive[idx]) { - amdgpu_amdkfd_stop_sched(adev, idx); - adev->gfx.kfd_sch_inactive[idx] = true; + if (!adev->gfx.userq_sch_inactive[idx]) { + if (adev->kfd.init_complete) + amdgpu_amdkfd_stop_sched(adev, idx); + adev->gfx.userq_sch_inactive[idx] = true; } } - adev->gfx.kfd_sch_req_count[idx]++; + adev->gfx.userq_sch_req_count[idx]++; } unlock: - mutex_unlock(&adev->gfx.kfd_sch_mutex); + mutex_unlock(&adev->gfx.userq_sch_mutex); } /** @@ -2024,12 +2025,11 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work) msecs_to_jiffies(1)); } else { /* Tell KFD to resume the runqueue */ - if (adev->kfd.init_complete) { - WARN_ON_ONCE(!adev->gfx.kfd_sch_inactive[idx]); - WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx]); + WARN_ON_ONCE(!adev->gfx.userq_sch_inactive[idx]); + WARN_ON_ONCE(adev->gfx.userq_sch_req_count[idx]); + if (adev->kfd.init_complete) amdgpu_amdkfd_start_sched(adev, idx); - adev->gfx.kfd_sch_inactive[idx] = false; - } + adev->gfx.userq_sch_inactive[idx] = false; } mutex_unlock(&adev->enforce_isolation_mutex); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 91dd365cb1e6..ed54095e6ad6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -475,9 +475,9 @@ struct amdgpu_gfx { bool enable_cleaner_shader; struct amdgpu_isolation_work enforce_isolation[MAX_XCP]; /* Mutex for synchronizing KFD scheduler operations */ - struct mutex kfd_sch_mutex; - u64 kfd_sch_req_count[MAX_XCP]; - bool kfd_sch_inactive[MAX_XCP]; + struct mutex userq_sch_mutex; + u64 userq_sch_req_count[MAX_XCP]; + bool userq_sch_inactive[MAX_XCP]; unsigned long enforce_isolation_jiffies[MAX_XCP]; unsigned long enforce_isolation_time[MAX_XCP]; -- cgit From 56801cb83c8c95ae23ea576570c75a01c1f07774 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 22 Apr 2025 19:29:03 +0530 Subject: drm/amdgpu: remove DRM_AMDGPU_NAVI3X_USERQ config for UQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DRM_AMDGPU_NAVI3X_USERQ config support is not required for usermode queue. v2: rebase. Cc: Arunpravin Paneer Selvam Reviewed-by: Sunil Khatri Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Arvind Yadav Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/Kconfig | 8 -------- drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +------ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 5 +---- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 8 -------- drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 18 ------------------ drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 4 ---- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 -- drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 3 +-- drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 3 --- 10 files changed, 4 insertions(+), 56 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig b/drivers/gpu/drm/amd/amdgpu/Kconfig index 7b95221d2f3d..1a11cab741ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/Kconfig +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig @@ -96,14 +96,6 @@ config DRM_AMDGPU_WERROR Add -Werror to the build flags for amdgpu.ko. Only enable this if you are warning code for amdgpu.ko. -config DRM_AMDGPU_NAVI3X_USERQ - bool "Enable amdgpu usermode queues" - depends on DRM_AMDGPU - default n - help - Choose this option to enable GFX usermode queue support for GFX/SDMA/Compute - workload submission. This feature is experimental and supported on GFX11+. - source "drivers/gpu/drm/amd/acp/Kconfig" source "drivers/gpu/drm/amd/display/Kconfig" source "drivers/gpu/drm/amd/amdkfd/Kconfig" diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 8595e05c691b..87080c06e5fc 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -177,7 +177,7 @@ amdgpu-y += \ mes_v12_0.o \ # add GFX userqueue support -amdgpu-$(CONFIG_DRM_AMDGPU_NAVI3X_USERQ) += mes_userqueue.o +amdgpu-y += mes_userqueue.o # add UVD block amdgpu-y += \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b96e0613ea7e..fe68ba9997ae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3513,9 +3513,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); amdgpu_amdkfd_suspend(adev, false); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ amdgpu_userq_suspend(adev); -#endif /* Workaround for ASICs need to disable SMC first */ amdgpu_device_smu_fini_early(adev); @@ -5086,9 +5084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients) if (!adev->in_s0ix) { amdgpu_amdkfd_suspend(adev, adev->in_runpm); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ amdgpu_userq_suspend(adev); -#endif } r = amdgpu_device_evict_resources(adev); @@ -5156,11 +5152,10 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients) r = amdgpu_amdkfd_resume(adev, adev->in_runpm); if (r) goto exit; -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ + r = amdgpu_userq_resume(adev); if (r) goto exit; -#endif } r = amdgpu_device_ip_late_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index e0cc2bb083cb..8f1a2f7b03c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1981,9 +1981,7 @@ static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx, if (adev->gfx.userq_sch_req_count[idx] == 0) { cancel_delayed_work_sync(&adev->gfx.enforce_isolation[idx].work); if (!adev->gfx.userq_sch_inactive[idx]) { -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ amdgpu_userq_stop_sched_for_enforce_isolation(adev, idx); -#endif if (adev->kfd.init_complete) amdgpu_amdkfd_stop_sched(adev, idx); adev->gfx.userq_sch_inactive[idx] = true; @@ -2041,9 +2039,8 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work) /* Tell KFD to resume the runqueue */ WARN_ON_ONCE(!adev->gfx.userq_sch_inactive[idx]); WARN_ON_ONCE(adev->gfx.userq_sch_req_count[idx]); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ + amdgpu_userq_start_sched_for_enforce_isolation(adev, idx); -#endif if (adev->kfd.init_complete) amdgpu_amdkfd_start_sched(adev, idx); adev->gfx.userq_sch_inactive[idx] = false; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index b0e8098a3988..451890ee3fb7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -129,7 +129,6 @@ amdgpu_userq_active(struct amdgpu_userq_mgr *uq_mgr) return ret; } -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ static struct amdgpu_usermode_queue * amdgpu_userq_find(struct amdgpu_userq_mgr *uq_mgr, int qid) { @@ -520,13 +519,6 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, return r; } -#else -int amdgpu_userq_ioctl(struct drm_device *dev, void *data, - struct drm_file *filp) -{ - return -ENOTSUPP; -} -#endif static int amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index be068e8e37d1..3288c2ff692e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -216,7 +216,6 @@ void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv) kref_put(&fence_drv->refcount, amdgpu_userq_fence_driver_destroy); } -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ static int amdgpu_userq_fence_alloc(struct amdgpu_userq_fence **userq_fence) { *userq_fence = kmem_cache_alloc(amdgpu_userq_fence_slab, GFP_ATOMIC); @@ -288,7 +287,6 @@ static int amdgpu_userq_fence_create(struct amdgpu_usermode_queue *userq, return 0; } -#endif static const char *amdgpu_userq_fence_get_driver_name(struct dma_fence *f) { @@ -343,7 +341,6 @@ static const struct dma_fence_ops amdgpu_userq_fence_ops = { .release = amdgpu_userq_fence_release, }; -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ /** * amdgpu_userq_fence_read_wptr - Read the userq wptr value * @@ -594,15 +591,7 @@ free_syncobj_handles: return r; } -#else -int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data, - struct drm_file *filp) -{ - return -ENOTSUPP; -} -#endif -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { @@ -968,10 +957,3 @@ free_bo_handles_read: return r; } -#else -int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data, - struct drm_file *filp) -{ - return -ENOTSUPP; -} -#endif diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 2df11f4127cc..3f4ee4b3b0a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1606,7 +1606,6 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) case IP_VERSION(11, 0, 0): case IP_VERSION(11, 0, 2): case IP_VERSION(11, 0, 3): -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ if (!adev->gfx.disable_uq && adev->gfx.me_fw_version >= 2390 && adev->gfx.pfp_fw_version >= 2530 && @@ -1615,7 +1614,6 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) adev->userq_funcs[AMDGPU_HW_IP_GFX] = &userq_mes_funcs; adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] = &userq_mes_funcs; } -#endif break; case IP_VERSION(11, 0, 1): case IP_VERSION(11, 0, 4): @@ -1623,13 +1621,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) case IP_VERSION(11, 5, 1): case IP_VERSION(11, 5, 2): case IP_VERSION(11, 5, 3): -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ /* add firmware version checks here */ if (0 && !adev->gfx.disable_uq) { adev->userq_funcs[AMDGPU_HW_IP_GFX] = &userq_mes_funcs; adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] = &userq_mes_funcs; } -#endif break; default: break; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index dfa0830a4eb1..f09d96bfee16 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -1416,7 +1416,6 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ if (!adev->gfx.disable_uq && adev->gfx.me_fw_version >= 2780 && adev->gfx.pfp_fw_version >= 2840 && @@ -1425,7 +1424,6 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) adev->userq_funcs[AMDGPU_HW_IP_GFX] = &userq_mes_funcs; adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] = &userq_mes_funcs; } -#endif break; default: break; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index 6bb36187a53d..da5b5d64f137 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -1363,11 +1363,10 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ /* add firmware version checks here */ if (0 && !adev->sdma.disable_uq) adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; -#endif + r = amdgpu_sdma_sysfs_reset_mask_init(adev); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index 943c6446a0a7..befe013b11a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1338,12 +1338,9 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block) else DRM_ERROR("Failed to allocated memory for SDMA IP Dump\n"); -#ifdef CONFIG_DRM_AMDGPU_NAVI3X_USERQ /* add firmware version checks here */ if (0 && !adev->sdma.disable_uq) adev->userq_funcs[AMDGPU_HW_IP_DMA] = &userq_mes_funcs; -#endif - return r; } -- cgit From 68071eb0ae64c076ed93897be644ebbd1c89a278 Mon Sep 17 00:00:00 2001 From: Srinivasan Shanmugam Date: Mon, 28 Apr 2025 16:16:34 +0530 Subject: drm/amdgpu: Add Support for enforcing isolation without Cleaner Shader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adjusted the enforce isolation setting handling to include the ability to disable the cleaner shader without affecting isolation between tasks. v2: Updated enforce isolation documentation and parameters. (Alex) Cc: Christian König Cc: Alex Deucher Signed-off-by: Srinivasan Shanmugam Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 18 +++++++++++++----- 5 files changed, 25 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index cc26cf1bd843..7e5ae8f1f0a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -878,6 +878,7 @@ enum amdgpu_enforce_isolation_mode { AMDGPU_ENFORCE_ISOLATION_DISABLE = 0, AMDGPU_ENFORCE_ISOLATION_ENABLE = 1, AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY = 2, + AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER = 3, }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 0941b3495b2c..9ea0d9b71f48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -310,6 +310,10 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, p->jobs[i]->enforce_isolation = true; p->jobs[i]->run_cleaner_shader = false; break; + case AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER: + p->jobs[i]->enforce_isolation = true; + p->jobs[i]->run_cleaner_shader = false; + break; } } p->gang_leader = p->jobs[p->gang_leader_idx]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index fe68ba9997ae..8330e30f0caf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2163,6 +2163,11 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; break; + case 3: + /* enable only process isolation without submitting cleaner shader */ + adev->enforce_isolation[i] = + AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; + break; } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index ec8057597c5a..8e45f18ad470 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1040,11 +1040,11 @@ module_param_named(user_partt_mode, amdgpu_user_partt_mode, uint, 0444); /** * DOC: enforce_isolation (int) * enforce process isolation between graphics and compute. - * (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode) + * (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode, 3 = enable without cleaner shader) */ module_param_named(enforce_isolation, amdgpu_enforce_isolation, int, 0444); MODULE_PARM_DESC(enforce_isolation, -"enforce process isolation between graphics and compute. (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode)"); +"enforce process isolation between graphics and compute. (-1 = auto, 0 = disable, 1 = enable, 2 = enable legacy mode, 3 = enable without cleaner shader)"); /** * DOC: modeset (int) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 8f1a2f7b03c1..1db1e6ec0184 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1612,7 +1612,8 @@ static ssize_t amdgpu_gfx_set_run_cleaner_shader(struct device *dev, * Provides the sysfs read interface to get the current settings of the 'enforce_isolation' * feature for each GPU partition. Reading from the 'enforce_isolation' * sysfs file returns the isolation settings for all partitions, where '0' - * indicates disabled, '1' indicates enabled, and '2' indicates enabled in legacy mode. + * indicates disabled, '1' indicates enabled, and '2' indicates enabled in legacy mode, + * and '3' indicates enabled without cleaner shader. * * Return: The number of bytes read from the sysfs file. */ @@ -1647,9 +1648,11 @@ static ssize_t amdgpu_gfx_get_enforce_isolation(struct device *dev, * @count: The size of the input data * * This function allows control over the 'enforce_isolation' feature, which - * serializes access to the graphics engine. Writing '1', '2', or '0' to the - * 'enforce_isolation' sysfs file enables (full or legacy) or disables process - * isolation for each partition. The input should specify the setting for all + * serializes access to the graphics engine. Writing '0' to disable, '1' to + * enable isolation with cleaner shader, '2' to enable legacy isolation without + * cleaner shader, or '3' to enable process isolation without submitting the + * cleaner shader to the 'enforce_isolation' sysfs file sets the isolation mode + * for each partition. The input should specify the setting for all * partitions. * * Return: The number of bytes written to the sysfs file. @@ -1689,7 +1692,8 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, for (i = 0; i < num_partitions; i++) { if (partition_values[i] != 0 && partition_values[i] != 1 && - partition_values[i] != 2) + partition_values[i] != 2 && + partition_values[i] != 3) return -EINVAL; } @@ -1708,6 +1712,10 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, adev->enforce_isolation[i] = AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY; break; + case 3: + adev->enforce_isolation[i] = + AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER; + break; } } mutex_unlock(&adev->enforce_isolation_mutex); -- cgit From ce8f7d95899c2869b47ea6ce0b3e5bf304b2fff4 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 1 May 2025 13:00:16 -0400 Subject: Revert "drm/amd: Stop evicting resources on APUs in suspend" This reverts commit 3a9626c816db901def438dc2513622e281186d39. This breaks S4 because we end up setting the s3/s0ix flags even when we are entering s4 since prepare is used by both flows. The causes both the S3/s0ix and s4 flags to be set which breaks several checks in the driver which assume they are mutually exclusive. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3634 Cc: Mario Limonciello Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 18 ------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++--------- 3 files changed, 2 insertions(+), 29 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index de57ab032cc7..f59ba0960a77 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1707,11 +1707,9 @@ static inline void amdgpu_acpi_get_backlight_caps(struct amdgpu_dm_backlight_cap #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev); bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); -void amdgpu_choose_low_power_state(struct amdgpu_device *adev); #else static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return false; } -static inline void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { } #endif void amdgpu_register_gpu_instance(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index b7f8f2ff143d..707e131f89d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1533,22 +1533,4 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) #endif /* CONFIG_AMD_PMC */ } -/** - * amdgpu_choose_low_power_state - * - * @adev: amdgpu_device_pointer - * - * Choose the target low power state for the GPU - */ -void amdgpu_choose_low_power_state(struct amdgpu_device *adev) -{ - if (adev->in_runpm) - return; - - if (amdgpu_acpi_is_s0ix_active(adev)) - adev->in_s0ix = true; - else if (amdgpu_acpi_is_s3_active(adev)) - adev->in_s3 = true; -} - #endif /* CONFIG_SUSPEND */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8330e30f0caf..ccc6217761bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5018,15 +5018,13 @@ int amdgpu_device_prepare(struct drm_device *dev) struct amdgpu_device *adev = drm_to_adev(dev); int i, r; - amdgpu_choose_low_power_state(adev); - if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; /* Evict the majority of BOs before starting suspend sequence */ r = amdgpu_device_evict_resources(adev); if (r) - goto unprepare; + return r; flush_delayed_work(&adev->gfx.gfx_off_delay_work); @@ -5037,15 +5035,10 @@ int amdgpu_device_prepare(struct drm_device *dev) continue; r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]); if (r) - goto unprepare; + return r; } return 0; - -unprepare: - adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; - - return r; } /** -- cgit From 086809c82c96116aed78f762e4f1d61a4e0210a7 Mon Sep 17 00:00:00 2001 From: Ellen Pan Date: Tue, 29 Apr 2025 17:18:44 -0400 Subject: drm/amdgpu: Implement unrecoverable error message handling for VFs This notification may arrive in VF mailbox while polling for response from another event. This patches covers the following scenarios: - If VF is already in RMA state, then do not attempt to contact the host. Host will ignore the VF after sending the notification. - If the notification is detected during polling, then set the RMA status, and return error to caller. - If the notification arrives by interrupt, then set the RMA status and queue a reset. This reset will fail and VF will stop runtime services. Reviewed-by: Shravan Kumar Gande Signed-off-by: Victor Skvortsov Signed-off-by: Ellen Pan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 17 ++++++++++++--- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 34 +++++++++++++++++++++++++++--- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 1 + 5 files changed, 52 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ccc6217761bf..a2997ededd0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6150,6 +6150,11 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ /* Actual ASIC resets if needed.*/ /* Host driver will handle XGMI hive reset for SRIOV */ if (amdgpu_sriov_vf(adev)) { + + /* Bail out of reset early */ + if (amdgpu_ras_is_rma(adev)) + return -ENODEV; + if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); amdgpu_ras_set_fed(adev, true); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index f2a74aa76b56..48101a34e049 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -324,6 +324,7 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { enum idh_event event = xgpu_ai_mailbox_peek_msg(adev); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); switch (event) { case IDH_RAS_BAD_PAGES_NOTIFICATION: @@ -331,12 +332,22 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, if (amdgpu_sriov_runtime(adev)) schedule_work(&adev->virt.bad_pages_work); break; + case IDH_UNRECOV_ERR_NOTIFICATION: + xgpu_ai_mailbox_send_ack(adev); + ras->is_rma = true; + dev_err(adev->dev, "VF is in an unrecoverable state. Runtime Services are halted.\n"); + if (amdgpu_sriov_runtime(adev)) + WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); + break; case IDH_FLR_NOTIFICATION: if (amdgpu_sriov_runtime(adev)) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, - &adev->virt.flr_work), - "Failed to queue work! at %s", - __func__); + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); break; case IDH_QUERY_ALIVE: xgpu_ai_mailbox_send_ack(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h index efb452ad1700..874b9f8f9804 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h @@ -57,6 +57,7 @@ enum idh_event { IDH_RAS_ERROR_DETECTED, IDH_RAS_BAD_PAGES_READY = 15, IDH_RAS_BAD_PAGES_NOTIFICATION = 16, + IDH_UNRECOV_ERR_NOTIFICATION = 17, IDH_TEXT_MESSAGE = 255, }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 74a50c0036ef..f6d8597452ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -67,6 +67,8 @@ static int xgpu_nv_mailbox_rcv_msg(struct amdgpu_device *adev, reg = RREG32_NO_KIQ(mmMAILBOX_MSGBUF_RCV_DW0); if (reg == IDH_FAIL) r = -EINVAL; + if (reg == IDH_UNRECOV_ERR_NOTIFICATION) + r = -ENODEV; else if (reg != event) return -ENOENT; @@ -103,6 +105,7 @@ static int xgpu_nv_poll_msg(struct amdgpu_device *adev, enum idh_event event) { int r; uint64_t timeout, now; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); now = (uint64_t)ktime_to_ms(ktime_get()); timeout = now + NV_MAILBOX_POLL_MSG_TIMEDOUT; @@ -110,8 +113,16 @@ static int xgpu_nv_poll_msg(struct amdgpu_device *adev, enum idh_event event) do { r = xgpu_nv_mailbox_rcv_msg(adev, event); if (!r) { - dev_dbg(adev->dev, "rcv_msg 0x%x after %llu ms\n", event, NV_MAILBOX_POLL_MSG_TIMEDOUT - timeout + now); + dev_dbg(adev->dev, "rcv_msg 0x%x after %llu ms\n", + event, NV_MAILBOX_POLL_MSG_TIMEDOUT - timeout + now); return 0; + } else if (r == -ENODEV) { + if (!amdgpu_ras_is_rma(adev)) { + ras->is_rma = true; + dev_err(adev->dev, "VF is in an unrecoverable state. " + "Runtime Services are halted.\n"); + } + return r; } msleep(10); @@ -166,6 +177,10 @@ static int xgpu_nv_send_access_requests_with_param(struct amdgpu_device *adev, enum idh_event event = -1; send_request: + + if (amdgpu_ras_is_rma(adev)) + return -ENODEV; + xgpu_nv_mailbox_trans_msg(adev, req, data1, data2, data3); switch (req) { @@ -323,6 +338,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) { struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); + struct amdgpu_reset_context reset_context = { 0 }; amdgpu_virt_fini_data_exchange(adev); @@ -333,8 +349,6 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT || adev->compute_timeout == MAX_SCHEDULE_TIMEOUT || adev->video_timeout == MAX_SCHEDULE_TIMEOUT)) { - struct amdgpu_reset_context reset_context; - memset(&reset_context, 0, sizeof(reset_context)); reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; @@ -380,6 +394,7 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { enum idh_event event = xgpu_nv_mailbox_peek_msg(adev); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); switch (event) { case IDH_RAS_BAD_PAGES_NOTIFICATION: @@ -387,6 +402,19 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, if (amdgpu_sriov_runtime(adev)) schedule_work(&adev->virt.bad_pages_work); break; + case IDH_UNRECOV_ERR_NOTIFICATION: + xgpu_nv_mailbox_send_ack(adev); + if (!amdgpu_ras_is_rma(adev)) { + ras->is_rma = true; + dev_err(adev->dev, "VF is in an unrecoverable state. Runtime Services are halted.\n"); + } + + if (amdgpu_sriov_runtime(adev)) + WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->virt.flr_work), + "Failed to queue work! at %s", + __func__); + break; case IDH_FLR_NOTIFICATION: if (amdgpu_sriov_runtime(adev)) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h index 6d292a537c1b..5808689562cc 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h @@ -61,6 +61,7 @@ enum idh_event { IDH_RAS_CPER_DUMP_READY = 14, IDH_RAS_BAD_PAGES_READY = 15, IDH_RAS_BAD_PAGES_NOTIFICATION = 16, + IDH_UNRECOV_ERR_NOTIFICATION = 17, IDH_TEXT_MESSAGE = 255, }; -- cgit From 06f2dcc241e7e5c681f81fbc46cacdf4bfd7d6d7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 1 May 2025 13:46:46 -0400 Subject: drm/amdgpu: fix pm notifier handling Set the s3/s0ix and s4 flags in the pm notifier so that we can skip the resource evictions properly in pm prepare based on whether we are suspending or hibernating. Drop the eviction as processes are not frozen at this time, we we can end up getting stuck trying to evict VRAM while applications continue to submit work which causes the buffers to get pulled back into VRAM. v2: Move suspend flags out of pm notifier (Mario) Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4178 Fixes: 2965e6355dcd ("drm/amd: Add Suspend/Hibernate notification callback support") Cc: Mario Limonciello Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 +++++------------- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +--------- 2 files changed, 6 insertions(+), 22 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a2997ededd0e..5d47c36c8280 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4976,28 +4976,20 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) * @data: data * * This function is called when the system is about to suspend or hibernate. - * It is used to evict resources from the device before the system goes to - * sleep while there is still access to swap. + * It is used to set the appropriate flags so that eviction can be optimized + * in the pm prepare callback. */ static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, void *data) { struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); - int r; switch (mode) { case PM_HIBERNATION_PREPARE: adev->in_s4 = true; - fallthrough; - case PM_SUSPEND_PREPARE: - r = amdgpu_device_evict_resources(adev); - /* - * This is considered non-fatal at this time because - * amdgpu_device_prepare() will also fatally evict resources. - * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 - */ - if (r) - drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); + break; + case PM_POST_HIBERNATION: + adev->in_s4 = false; break; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 8e45f18ad470..7f437aea3d06 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2644,13 +2644,8 @@ static int amdgpu_pmops_freeze(struct device *dev) static int amdgpu_pmops_thaw(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(drm_dev); - int r; - - r = amdgpu_device_resume(drm_dev, true); - adev->in_s4 = false; - return r; + return amdgpu_device_resume(drm_dev, true); } static int amdgpu_pmops_poweroff(struct device *dev) @@ -2663,9 +2658,6 @@ static int amdgpu_pmops_poweroff(struct device *dev) static int amdgpu_pmops_restore(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(drm_dev); - - adev->in_s4 = false; return amdgpu_device_resume(drm_dev, true); } -- cgit