aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
authorsaturneric <[email protected]>2025-11-16 14:34:50 +0000
committersaturneric <[email protected]>2025-11-16 14:34:50 +0000
commit690862a8d74fee1e07f33dad44b761753f101779 (patch)
treef81bdcab8cd5a640518dba5056de6c62bd071eaf /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parentMerge tag 'v6.17.7' into linux-6.17.y (diff)
parentLinux 6.17.8 (diff)
downloadkernel-linux-6.17.y.tar.gz
kernel-linux-6.17.y.zip
Merge tag 'v6.17.8' into linux-6.17.ylinux-6.17.y
This is the 6.17.8 stable release
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c68
1 files changed, 35 insertions, 33 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c8459337fcb8..ddd0e7ab82be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -95,6 +95,7 @@ MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");
#define AMDGPU_RESUME_MS 2000
#define AMDGPU_MAX_RETRY_LIMIT 2
@@ -2595,6 +2596,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
return 0;
chip_name = "navi12";
break;
+ case CHIP_CYAN_SKILLFISH:
+ chip_name = "cyan_skillfish";
+ break;
}
err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
@@ -3389,7 +3393,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
for (i = 0; i < mgpu_info.num_dgpu; i++) {
gpu_ins = &(mgpu_info.gpu_ins[i]);
adev = gpu_ins->adev;
- if (!(adev->flags & AMD_IS_APU) &&
+ if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
!gpu_ins->mgpu_fan_enabled) {
ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
if (ret)
@@ -5012,6 +5016,10 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
return 0;
+ /* No need to evict when going to S5 through S4 callbacks */
+ if (system_state == SYSTEM_POWER_OFF)
+ return 0;
+
ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
if (ret) {
dev_warn(adev->dev, "evicting device resources failed\n");
@@ -6132,12 +6140,11 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
return ret;
}
-static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
+static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
struct list_head *device_list,
struct amdgpu_hive_info *hive)
{
struct amdgpu_device *tmp_adev = NULL;
- int r;
/*
* Build list of devices to reset.
@@ -6157,14 +6164,6 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
} else {
list_add_tail(&adev->reset_list, device_list);
}
-
- if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
- r = amdgpu_device_health_check(device_list);
- if (r)
- return r;
- }
-
- return 0;
}
static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
@@ -6338,23 +6337,28 @@ static int amdgpu_device_sched_resume(struct list_head *device_list,
if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
- if (tmp_adev->asic_reset_res)
- r = tmp_adev->asic_reset_res;
-
- tmp_adev->asic_reset_res = 0;
-
- if (r) {
+ if (tmp_adev->asic_reset_res) {
/* bad news, how to tell it to userspace ?
* for ras error, we should report GPU bad status instead of
* reset failure
*/
if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
!amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
- dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
- atomic_read(&tmp_adev->gpu_reset_counter));
- amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+ dev_info(
+ tmp_adev->dev,
+ "GPU reset(%d) failed with error %d \n",
+ atomic_read(
+ &tmp_adev->gpu_reset_counter),
+ tmp_adev->asic_reset_res);
+ amdgpu_vf_error_put(tmp_adev,
+ AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
+ tmp_adev->asic_reset_res);
+ if (!r)
+ r = tmp_adev->asic_reset_res;
+ tmp_adev->asic_reset_res = 0;
} else {
- dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+ dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
+ atomic_read(&tmp_adev->gpu_reset_counter));
if (amdgpu_acpi_smart_shift_update(tmp_adev,
AMDGPU_SS_DEV_D0))
dev_warn(tmp_adev->dev,
@@ -6457,8 +6461,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_context->hive = hive;
INIT_LIST_HEAD(&device_list);
- if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
- goto end_reset;
+ amdgpu_device_recovery_prepare(adev, &device_list, hive);
+
+ if (!amdgpu_sriov_vf(adev)) {
+ r = amdgpu_device_health_check(&device_list);
+ if (r)
+ goto end_reset;
+ }
/* We need to lock reset domain only once both for XGMI and single device */
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
@@ -6880,7 +6889,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
{
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
- struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ struct amdgpu_hive_info *hive __free(xgmi_put_hive) =
+ amdgpu_get_xgmi_hive(adev);
struct amdgpu_reset_context reset_context;
struct list_head device_list;
@@ -6911,10 +6921,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
hive, false);
- if (hive) {
+ if (hive)
mutex_unlock(&hive->hive_lock);
- amdgpu_put_xgmi_hive(hive);
- }
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
/* Permanent error, prepare for device removal */
@@ -6965,12 +6973,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
int r = 0, i;
u32 memsize;
- /* PCI error slot reset should be skipped During RAS recovery */
- if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
- amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
- amdgpu_ras_in_recovery(adev))
- return PCI_ERS_RESULT_RECOVERED;
-
dev_info(adev->dev, "PCI error: slot reset callback!!\n");
memset(&reset_context, 0, sizeof(reset_context));