path: root/drivers/gpu/drm/amd/amdgpu
author     saturneric <[email protected]>  2025-11-16 14:34:50 +0000
committer  saturneric <[email protected]>  2025-11-16 14:34:50 +0000
commit     690862a8d74fee1e07f33dad44b761753f101779 (patch)
tree       f81bdcab8cd5a640518dba5056de6c62bd071eaf /drivers/gpu/drm/amd/amdgpu
parent     Merge tag 'v6.17.7' into linux-6.17.y (diff)
parent     Linux 6.17.8 (diff)
Merge tag 'v6.17.8' into linux-6.17.y (linux-6.17.y)
This is the 6.17.8 stable release
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c          |  53
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  19
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c   |  66
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c         |  14
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h         |   2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c           |  21
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c       |  68
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c    |   5
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          |  14
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c         |   6
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c          | 125
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h          |   6
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c   |   5
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c          |   1
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c        | 123
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h        |   2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c  |   2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h         |   3
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c          |  40
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c          |   1
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h         |   4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/atom.c                |   4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c          |  12
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c         |  20
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mes_userqueue.c       |  23
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c            |  32
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c            |  35
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/soc15.c               |   1
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c           |   5
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c            |  10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c            |  10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c            |  10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c            |  10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c          |  11
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c          |  10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c          |  10
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c          |  20
37 files changed, 561 insertions(+), 242 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index cbc40cad581b..9b3180449150 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -76,6 +76,7 @@ static void aca_banks_release(struct aca_banks *banks)
list_for_each_entry_safe(node, tmp, &banks->list, node) {
list_del(&node->node);
kvfree(node);
+ banks->nr_banks--;
}
}
@@ -130,6 +131,27 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st
RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged by the scrubber\n");
}
+static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type)
+{
+
+ struct aca_hwip *hwip;
+ int hwid, mcatype;
+ u64 ipid;
+
+ if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
+ return false;
+
+ hwip = &aca_hwid_mcatypes[type];
+ if (!hwip->hwid)
+ return false;
+
+ ipid = bank->regs[ACA_REG_IDX_IPID];
+ hwid = ACA_REG__IPID__HARDWAREID(ipid);
+ mcatype = ACA_REG__IPID__MCATYPE(ipid);
+
+ return hwip->hwid == hwid && hwip->mcatype == mcatype;
+}
+
static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,
int start, int count,
struct aca_banks *banks, struct ras_query_context *qctx)
@@ -168,6 +190,15 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
bank.smu_err_type = type;
+ /*
+ * Poison being consumed when injecting a UE while running background workloads,
+ * which are unexpected.
+ */
+ if (type == ACA_SMU_TYPE_UE &&
+ ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) &&
+ !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC))
+ continue;
+
aca_smu_bank_dump(adev, i, count, &bank, qctx);
ret = aca_banks_add_bank(banks, &bank);
@@ -178,27 +209,6 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_
return 0;
}
-static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type)
-{
-
- struct aca_hwip *hwip;
- int hwid, mcatype;
- u64 ipid;
-
- if (!bank || type == ACA_HWIP_TYPE_UNKNOW)
- return false;
-
- hwip = &aca_hwid_mcatypes[type];
- if (!hwip->hwid)
- return false;
-
- ipid = bank->regs[ACA_REG_IDX_IPID];
- hwid = ACA_REG__IPID__HARDWAREID(ipid);
- mcatype = ACA_REG__IPID__MCATYPE(ipid);
-
- return hwip->hwid == hwid && hwip->mcatype == mcatype;
-}
-
static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)
{
const struct aca_bank_ops *bank_ops = handle->bank_ops;
@@ -229,6 +239,7 @@ static struct aca_bank_error *new_bank_error(struct aca_error *aerr, struct aca_
mutex_lock(&aerr->lock);
list_add_tail(&bank_error->node, &aerr->list);
+ aerr->nr_errors++;
mutex_unlock(&aerr->lock);
return bank_error;
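The hunk above moves aca_bank_hwip_is_matched() ahead of its new caller so that UE-type queries can skip poison banks that did not originate from the UMC. A minimal user-space sketch of the IPID match follows; the bit positions are illustrative stand-ins for the ACA_REG__IPID__* accessor macros, not the real register layout.

/* Sketch only: field offsets below are HYPOTHETICAL, the real driver
 * extracts hwid/mcatype with the ACA_REG__IPID__* macros. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct hwip { int hwid; int mcatype; };

static bool bank_matches(uint64_t ipid, const struct hwip *h)
{
	int hwid    = (int)((ipid >> 32) & 0xfff);  /* hypothetical field */
	int mcatype = (int)((ipid >> 44) & 0xffff); /* hypothetical field */

	/* an unset table entry (hwid == 0) never matches */
	return h->hwid && h->hwid == hwid && h->mcatype == mcatype;
}

int main(void)
{
	struct hwip umc = { .hwid = 0x96, .mcatype = 0x0 };
	uint64_t ipid = (uint64_t)0x96 << 32;  /* fabricated sample value */

	printf("match: %d\n", bank_matches(ipid, &umc));
	return 0;
}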
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 902eac2c685f..8994ed564f35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1263,6 +1263,10 @@ static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
(void)amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
+ /* VM entity stopped if process killed, don't clear freed pt bo */
+ if (!amdgpu_vm_ready(vm))
+ return 0;
+
(void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
(void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);
@@ -2993,9 +2997,22 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *
struct amdgpu_device *adev = amdgpu_ttm_adev(
peer_vm->root.bo->tbo.bdev);
+ struct amdgpu_fpriv *fpriv =
+ container_of(peer_vm, struct amdgpu_fpriv, vm);
+
+ ret = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
+ if (ret) {
+ dev_dbg(adev->dev,
+ "Memory eviction: handle PRT moved failed, pid %8d. Try again.\n",
+ pid_nr(process_info->pid));
+ goto validate_map_fail;
+ }
+
ret = amdgpu_vm_handle_moved(adev, peer_vm, &exec.ticket);
if (ret) {
- pr_debug("Memory eviction: handle moved failed. Try again\n");
+ dev_dbg(adev->dev,
+ "Memory eviction: handle moved failed, pid %8d. Try again.\n",
+ pid_nr(process_info->pid));
goto validate_map_fail;
}
}
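The restore path above recovers the owning amdgpu_fpriv from a peer VM pointer via container_of() before updating its PRT bo_va. A self-contained sketch of that pointer arithmetic:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vm { int id; };
struct fpriv { int prt_va; struct vm vm; };  /* vm embedded, as in amdgpu_fpriv */

int main(void)
{
	struct fpriv f = { .prt_va = 42, .vm = { .id = 7 } };
	struct vm *peer = &f.vm;

	/* step back from the embedded member to the enclosing struct */
	struct fpriv *owner = container_of(peer, struct fpriv, vm);

	printf("prt_va=%d\n", owner->prt_va); /* prints 42 */
	return 0;
}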
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
index 5e375e9c4f5d..bf38fc69c1cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
@@ -1195,29 +1195,69 @@ static void amdgpu_connector_dvi_force(struct drm_connector *connector)
amdgpu_connector->use_digital = true;
}
+/**
+ * amdgpu_max_hdmi_pixel_clock - Return max supported HDMI (TMDS) pixel clock
+ * @adev: pointer to amdgpu_device
+ *
+ * Return: maximum supported HDMI (TMDS) pixel clock in KHz.
+ */
+static int amdgpu_max_hdmi_pixel_clock(const struct amdgpu_device *adev)
+{
+ if (adev->asic_type >= CHIP_POLARIS10)
+ return 600000;
+ else if (adev->asic_type >= CHIP_TONGA)
+ return 300000;
+ else
+ return 297000;
+}
+
+/**
+ * amdgpu_connector_dvi_mode_valid - Validate a mode on DVI/HDMI connectors
+ * @connector: DRM connector to validate the mode on
+ * @mode: display mode to validate
+ *
+ * Validate the given display mode on DVI and HDMI connectors, including
+ * analog signals on DVI-I.
+ *
+ * Return: drm_mode_status indicating whether the mode is valid.
+ */
static enum drm_mode_status amdgpu_connector_dvi_mode_valid(struct drm_connector *connector,
const struct drm_display_mode *mode)
{
struct drm_device *dev = connector->dev;
struct amdgpu_device *adev = drm_to_adev(dev);
struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector);
+ const int max_hdmi_pixel_clock = amdgpu_max_hdmi_pixel_clock(adev);
+ const int max_dvi_single_link_pixel_clock = 165000;
+ int max_digital_pixel_clock_khz;
/* XXX check mode bandwidth */
- if (amdgpu_connector->use_digital && (mode->clock > 165000)) {
- if ((amdgpu_connector->connector_object_id == CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_I) ||
- (amdgpu_connector->connector_object_id == CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_D) ||
- (amdgpu_connector->connector_object_id == CONNECTOR_OBJECT_ID_HDMI_TYPE_B)) {
- return MODE_OK;
- } else if (connector->display_info.is_hdmi) {
- /* HDMI 1.3+ supports max clock of 340 Mhz */
- if (mode->clock > 340000)
- return MODE_CLOCK_HIGH;
- else
- return MODE_OK;
- } else {
- return MODE_CLOCK_HIGH;
+ if (amdgpu_connector->use_digital) {
+ switch (amdgpu_connector->connector_object_id) {
+ case CONNECTOR_OBJECT_ID_HDMI_TYPE_A:
+ max_digital_pixel_clock_khz = max_hdmi_pixel_clock;
+ break;
+ case CONNECTOR_OBJECT_ID_SINGLE_LINK_DVI_I:
+ case CONNECTOR_OBJECT_ID_SINGLE_LINK_DVI_D:
+ max_digital_pixel_clock_khz = max_dvi_single_link_pixel_clock;
+ break;
+ case CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_I:
+ case CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_D:
+ case CONNECTOR_OBJECT_ID_HDMI_TYPE_B:
+ max_digital_pixel_clock_khz = max_dvi_single_link_pixel_clock * 2;
+ break;
}
+
+ /* When the display EDID claims that it's an HDMI display,
+ * we use the HDMI encoder mode of the display HW,
+ * so we should verify against the max HDMI clock here.
+ */
+ if (connector->display_info.is_hdmi)
+ max_digital_pixel_clock_khz = max_hdmi_pixel_clock;
+
+ if (mode->clock > max_digital_pixel_clock_khz)
+ return MODE_CLOCK_HIGH;
}
/* check against the max pixel clock */
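The rewrite replaces the nested clock checks with a single per-connector pixel-clock cap, which the HDMI cap overrides whenever the EDID reports an HDMI sink. A simplified user-space model of the cap selection follows; it folds the connector-object switch into a dual_link flag, so it is a sketch of the logic, not the driver code.

#include <stdio.h>

enum asic { CHIP_OLD, CHIP_TONGA, CHIP_POLARIS10 };

static int max_hdmi_khz(enum asic a)
{
	if (a >= CHIP_POLARIS10)
		return 600000;
	if (a >= CHIP_TONGA)
		return 300000;
	return 297000;
}

static int digital_cap_khz(enum asic a, int dual_link, int is_hdmi)
{
	int cap = dual_link ? 2 * 165000 : 165000; /* DVI single/dual link */

	/* HDMI sink per EDID: HW drives the HDMI encoder mode instead */
	if (is_hdmi)
		cap = max_hdmi_khz(a);
	return cap;
}

int main(void)
{
	printf("%d\n", digital_cap_khz(CHIP_POLARIS10, 0, 1)); /* 600000 */
	printf("%d\n", digital_cap_khz(CHIP_OLD, 1, 0));       /* 330000 */
	return 0;
}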
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
index 25252231a68a..54de38dbaf2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: MIT
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
@@ -68,7 +68,6 @@ void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev,
hdr->error_severity = sev;
hdr->valid_bits.platform_id = 1;
- hdr->valid_bits.partition_id = 1;
hdr->valid_bits.timestamp = 1;
amdgpu_cper_get_timestamp(&hdr->timestamp);
@@ -206,6 +205,7 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
{
struct cper_sec_desc *section_desc;
struct cper_sec_nonstd_err *section;
+ uint32_t socket_id;
section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx));
section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr +
@@ -219,11 +219,17 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
section->hdr.valid_bits.err_context_cnt = 1;
section->info.error_type = RUNTIME;
+ section->info.valid_bits.ms_chk = 1;
section->info.ms_chk_bits.err_type_valid = 1;
+ section->info.ms_chk_bits.err_type = 1;
+ section->info.ms_chk_bits.pcc = 1;
section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH;
section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump);
/* Hardcoded Reg dump for bad page threshold CPER */
+ socket_id = (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ?
+ adev->smuio.funcs->get_socket_id(adev) :
+ 0;
section->ctx.reg_dump[CPER_ACA_REG_CTL_LO] = 0x1;
section->ctx.reg_dump[CPER_ACA_REG_CTL_HI] = 0x0;
section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137;
@@ -234,8 +240,8 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev
section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI] = 0x0;
section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2;
section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff;
- section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = 0x0;
- section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x96;
+ section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = (socket_id / 4) & 0x01;
+ section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x096 | (((socket_id % 4) & 0x3) << 12);
section->ctx.reg_dump[CPER_ACA_REG_SYND_LO] = 0x0;
section->ctx.reg_dump[CPER_ACA_REG_SYND_HI] = 0x0;
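The new register dump packs the socket id into the hardcoded IPID pair: bit 0 of the low word carries socket_id / 4, and bits 13:12 of the high word carry socket_id % 4 alongside the 0x096 base. A small sketch of the packing, using the exact expressions from the hunk:

#include <stdint.h>
#include <stdio.h>

static void pack_ipid(uint32_t socket_id, uint32_t *lo, uint32_t *hi)
{
	*lo = (socket_id / 4) & 0x01;
	*hi = 0x096 | (((socket_id % 4) & 0x3) << 12);
}

int main(void)
{
	uint32_t lo, hi;

	pack_ipid(6, &lo, &hi);              /* socket 6 of an 8-socket part */
	printf("lo=0x%x hi=0x%x\n", lo, hi); /* lo=0x1 hi=0x2096 */
	return 0;
}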
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
index bcb97d245673..353421807387 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: MIT */
/*
* Copyright 2025 Advanced Micro Devices, Inc.
*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d3f220be2ef9..1ce1fd0c87a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -286,7 +286,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
}
}
- if (!p->gang_size) {
+ if (!p->gang_size || (amdgpu_sriov_vf(p->adev) && p->gang_size > 1)) {
ret = -EINVAL;
goto free_all_kdata;
}
@@ -1767,30 +1767,21 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
{
struct amdgpu_device *adev = drm_to_adev(dev);
union drm_amdgpu_wait_fences *wait = data;
- uint32_t fence_count = wait->in.fence_count;
- struct drm_amdgpu_fence *fences_user;
struct drm_amdgpu_fence *fences;
int r;
/* Get the fences from userspace */
- fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence),
- GFP_KERNEL);
- if (fences == NULL)
- return -ENOMEM;
-
- fences_user = u64_to_user_ptr(wait->in.fences);
- if (copy_from_user(fences, fences_user,
- sizeof(struct drm_amdgpu_fence) * fence_count)) {
- r = -EFAULT;
- goto err_free_fences;
- }
+ fences = memdup_array_user(u64_to_user_ptr(wait->in.fences),
+ wait->in.fence_count,
+ sizeof(struct drm_amdgpu_fence));
+ if (IS_ERR(fences))
+ return PTR_ERR(fences);
if (wait->in.wait_all)
r = amdgpu_cs_wait_all_fences(adev, filp, wait, fences);
else
r = amdgpu_cs_wait_any_fence(adev, filp, wait, fences);
-err_free_fences:
kfree(fences);
return r;
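memdup_array_user() folds the former kmalloc_array() + copy_from_user() + unwind sequence into one overflow-checked call. A user-space model of the same shape (the kernel's ERR_PTR() returns are approximated with NULL here):

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static void *memdup_array(const void *src, size_t n, size_t size)
{
	void *p;

	if (size && n > SIZE_MAX / size) /* n * size would overflow */
		return NULL;             /* kernel: ERR_PTR(-EOVERFLOW) */
	p = malloc(n * size);
	if (!p)
		return NULL;             /* kernel: ERR_PTR(-ENOMEM) */
	memcpy(p, src, n * size);        /* kernel: copy_from_user() */
	return p;
}

int main(void)
{
	uint64_t fences[2] = { 1, 2 };
	uint64_t *copy = memdup_array(fences, 2, sizeof(*fences));

	free(copy);
	return 0;
}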
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c8459337fcb8..ddd0e7ab82be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -95,6 +95,7 @@ MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");
#define AMDGPU_RESUME_MS 2000
#define AMDGPU_MAX_RETRY_LIMIT 2
@@ -2595,6 +2596,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
return 0;
chip_name = "navi12";
break;
+ case CHIP_CYAN_SKILLFISH:
+ chip_name = "cyan_skillfish";
+ break;
}
err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
@@ -3389,7 +3393,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
for (i = 0; i < mgpu_info.num_dgpu; i++) {
gpu_ins = &(mgpu_info.gpu_ins[i]);
adev = gpu_ins->adev;
- if (!(adev->flags & AMD_IS_APU) &&
+ if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
!gpu_ins->mgpu_fan_enabled) {
ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
if (ret)
@@ -5012,6 +5016,10 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
return 0;
+ /* No need to evict when going to S5 through S4 callbacks */
+ if (system_state == SYSTEM_POWER_OFF)
+ return 0;
+
ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
if (ret) {
dev_warn(adev->dev, "evicting device resources failed\n");
@@ -6132,12 +6140,11 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
return ret;
}
-static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
+static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
struct list_head *device_list,
struct amdgpu_hive_info *hive)
{
struct amdgpu_device *tmp_adev = NULL;
- int r;
/*
* Build list of devices to reset.
@@ -6157,14 +6164,6 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
} else {
list_add_tail(&adev->reset_list, device_list);
}
-
- if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
- r = amdgpu_device_health_check(device_list);
- if (r)
- return r;
- }
-
- return 0;
}
static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
@@ -6338,23 +6337,28 @@ static int amdgpu_device_sched_resume(struct list_head *device_list,
if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
- if (tmp_adev->asic_reset_res)
- r = tmp_adev->asic_reset_res;
-
- tmp_adev->asic_reset_res = 0;
-
- if (r) {
+ if (tmp_adev->asic_reset_res) {
/* bad news, how to tell it to userspace ?
* for ras error, we should report GPU bad status instead of
* reset failure
*/
if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
!amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
- dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
- atomic_read(&tmp_adev->gpu_reset_counter));
- amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+ dev_info(
+ tmp_adev->dev,
+ "GPU reset(%d) failed with error %d \n",
+ atomic_read(
+ &tmp_adev->gpu_reset_counter),
+ tmp_adev->asic_reset_res);
+ amdgpu_vf_error_put(tmp_adev,
+ AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0,
+ tmp_adev->asic_reset_res);
+ if (!r)
+ r = tmp_adev->asic_reset_res;
+ tmp_adev->asic_reset_res = 0;
} else {
- dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+ dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n",
+ atomic_read(&tmp_adev->gpu_reset_counter));
if (amdgpu_acpi_smart_shift_update(tmp_adev,
AMDGPU_SS_DEV_D0))
dev_warn(tmp_adev->dev,
@@ -6457,8 +6461,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_context->hive = hive;
INIT_LIST_HEAD(&device_list);
- if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
- goto end_reset;
+ amdgpu_device_recovery_prepare(adev, &device_list, hive);
+
+ if (!amdgpu_sriov_vf(adev)) {
+ r = amdgpu_device_health_check(&device_list);
+ if (r)
+ goto end_reset;
+ }
/* We need to lock reset domain only once both for XGMI and single device */
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
@@ -6880,7 +6889,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
{
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
- struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ struct amdgpu_hive_info *hive __free(xgmi_put_hive) =
+ amdgpu_get_xgmi_hive(adev);
struct amdgpu_reset_context reset_context;
struct list_head device_list;
@@ -6911,10 +6921,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
hive, false);
- if (hive) {
+ if (hive)
mutex_unlock(&hive->hive_lock);
- amdgpu_put_xgmi_hive(hive);
- }
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
/* Permanent error, prepare for device removal */
@@ -6965,12 +6973,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
int r = 0, i;
u32 memsize;
- /* PCI error slot reset should be skipped During RAS recovery */
- if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
- amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
- amdgpu_ras_in_recovery(adev))
- return PCI_ERS_RESULT_RECOVERED;
-
dev_info(adev->dev, "PCI error: slot reset callback!!\n");
memset(&reset_context, 0, sizeof(reset_context));
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index e814da2b1422..dd7b2b796427 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -2126,7 +2126,6 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev)
case IP_VERSION(11, 0, 5):
case IP_VERSION(11, 0, 9):
case IP_VERSION(11, 0, 7):
- case IP_VERSION(11, 0, 8):
case IP_VERSION(11, 0, 11):
case IP_VERSION(11, 0, 12):
case IP_VERSION(11, 0, 13):
@@ -2134,6 +2133,10 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev)
case IP_VERSION(11, 5, 2):
amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block);
break;
+ case IP_VERSION(11, 0, 8):
+ if (adev->apu_flags & AMD_APU_IS_CYAN_SKILLFISH2)
+ amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block);
+ break;
case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 0, 1):
amdgpu_device_ip_block_add(adev, &smu_v12_0_ip_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 65f4a76490ea..e60043ac9841 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2172,6 +2172,11 @@ static const struct pci_device_id pciidlist[] = {
{0x1002, 0x7410, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN},
/* CYAN_SKILLFISH */
+ {0x1002, 0x13DB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU},
+ {0x1002, 0x13F9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU},
+ {0x1002, 0x13FA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU},
+ {0x1002, 0x13FB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU},
+ {0x1002, 0x13FC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU},
{0x1002, 0x13FE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU},
{0x1002, 0x143F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU},
@@ -2597,6 +2602,7 @@ static int amdgpu_pmops_suspend(struct device *dev)
else if (amdgpu_acpi_is_s3_active(adev))
adev->in_s3 = true;
if (!adev->in_s0ix && !adev->in_s3) {
+#if IS_ENABLED(CONFIG_SUSPEND)
/* don't allow going deep first time followed by s2idle the next time */
if (adev->last_suspend_state != PM_SUSPEND_ON &&
adev->last_suspend_state != pm_suspend_target_state) {
@@ -2604,11 +2610,14 @@ static int amdgpu_pmops_suspend(struct device *dev)
pm_suspend_target_state);
return -EINVAL;
}
+#endif
return 0;
}
+#if IS_ENABLED(CONFIG_SUSPEND)
/* cache the state last used for suspend */
adev->last_suspend_state = pm_suspend_target_state;
+#endif
return amdgpu_device_suspend(drm_dev, true);
}
@@ -2933,11 +2942,14 @@ static int amdgpu_drm_release(struct inode *inode, struct file *filp)
{
struct drm_file *file_priv = filp->private_data;
struct amdgpu_fpriv *fpriv = file_priv->driver_priv;
+ struct drm_device *dev = file_priv->minor->dev;
+ int idx;
- if (fpriv) {
+ if (fpriv && drm_dev_enter(dev, &idx)) {
fpriv->evf_mgr.fd_closing = true;
amdgpu_eviction_fence_destroy(&fpriv->evf_mgr);
amdgpu_userq_mgr_fini(&fpriv->userq_mgr);
+ drm_dev_exit(idx);
}
return drm_release(inode, filp);
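Guarding the user-queue teardown with drm_dev_enter()/drm_dev_exit() ensures it only runs while the device is still plugged. The real guard is SRCU-based and returns an index token; the mutex below is a deliberately simplified user-space model of the same enter/exit contract:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;
static bool dev_unplugged;

static bool dev_enter(void)
{
	pthread_mutex_lock(&dev_lock);
	if (dev_unplugged) {
		pthread_mutex_unlock(&dev_lock);
		return false;    /* device gone, skip the work */
	}
	return true;             /* caller must pair with dev_exit() */
}

static void dev_exit(void)
{
	pthread_mutex_unlock(&dev_lock);
}

int main(void)
{
	if (dev_enter()) {
		puts("fini user queues"); /* safe: device still present */
		dev_exit();
	}
	return 0;
}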
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 82d58ac7afb0..5d5e9ee83a5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -121,10 +121,12 @@ static void amdgpu_jpeg_idle_work_handler(struct work_struct *work)
fences += amdgpu_fence_count_emitted(&adev->jpeg.inst[i].ring_dec[j]);
}
- if (!fences && !atomic_read(&adev->jpeg.total_submission_cnt))
+ if (!fences && !atomic_read(&adev->jpeg.total_submission_cnt)) {
+ mutex_lock(&adev->jpeg.jpeg_pg_lock);
amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_JPEG,
AMD_PG_STATE_GATE);
- else
+ mutex_unlock(&adev->jpeg.jpeg_pg_lock);
+ } else
schedule_delayed_work(&adev->jpeg.idle_work, JPEG_IDLE_TIMEOUT);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 540817e296da..893cae9813fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
-#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
@@ -136,9 +136,9 @@ enum amdgpu_ras_retire_page_reservation {
atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
-static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
uint64_t addr);
-static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr);
#ifdef CONFIG_X86_MCE_AMD
static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
@@ -169,18 +169,16 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
struct eeprom_table_record err_rec;
int ret;
- if ((address >= adev->gmc.mc_vram_size) ||
- (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+ ret = amdgpu_ras_check_bad_page(adev, address);
+ if (ret == -EINVAL) {
dev_warn(adev->dev,
- "RAS WARN: input address 0x%llx is invalid.\n",
- address);
+ "RAS WARN: input address 0x%llx is invalid.\n",
+ address);
return -EINVAL;
- }
-
- if (amdgpu_ras_check_bad_page(adev, address)) {
+ } else if (ret == 1) {
dev_warn(adev->dev,
- "RAS WARN: 0x%llx has already been marked as bad page!\n",
- address);
+ "RAS WARN: 0x%llx has already been marked as bad page!\n",
+ address);
return 0;
}
@@ -513,22 +511,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
break;
case 2:
- if ((data.inject.address >= adev->gmc.mc_vram_size &&
- adev->gmc.mc_vram_size) ||
- (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
- dev_warn(adev->dev, "RAS WARN: input address "
- "0x%llx is invalid.",
+ /* umc ce/ue error injection for a bad page is not allowed */
+ if (data.head.block == AMDGPU_RAS_BLOCK__UMC)
+ ret = amdgpu_ras_check_bad_page(adev, data.inject.address);
+ if (ret == -EINVAL) {
+ dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.",
data.inject.address);
- ret = -EINVAL;
break;
- }
-
- /* umc ce/ue error injection for a bad page is not allowed */
- if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
- amdgpu_ras_check_bad_page(adev, data.inject.address)) {
- dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
- "already been marked as bad!\n",
- data.inject.address);
+ } else if (ret == 1) {
+ dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n",
+ data.inject.address);
break;
}
@@ -3134,18 +3126,24 @@ out:
return ret;
}
-static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
uint64_t addr)
{
struct ras_err_handler_data *data = con->eh_data;
+ struct amdgpu_device *adev = con->adev;
int i;
+ if ((addr >= adev->gmc.mc_vram_size &&
+ adev->gmc.mc_vram_size) ||
+ (addr >= RAS_UMC_INJECT_ADDR_LIMIT))
+ return -EINVAL;
+
addr >>= AMDGPU_GPU_PAGE_SHIFT;
for (i = 0; i < data->count; i++)
if (addr == data->bps[i].retired_page)
- return true;
+ return 1;
- return false;
+ return 0;
}
/*
@@ -3153,11 +3151,11 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
*
* Note: this check is only for umc block
*/
-static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- bool ret = false;
+ int ret = 0;
if (!con || !con->eh_data)
return ret;
@@ -3241,7 +3239,7 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
ecc_log->de_queried_count = 0;
- ecc_log->prev_de_queried_count = 0;
+ ecc_log->consumption_q_count = 0;
}
static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -3261,7 +3259,7 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
mutex_destroy(&ecc_log->lock);
ecc_log->de_queried_count = 0;
- ecc_log->prev_de_queried_count = 0;
+ ecc_log->consumption_q_count = 0;
}
static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
@@ -3287,7 +3285,6 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
page_retirement_dwork.work);
struct amdgpu_device *adev = con->adev;
struct ras_err_data err_data;
- unsigned long err_cnt;
/* If gpu reset is ongoing, delay retiring the bad pages */
if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
@@ -3299,13 +3296,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
amdgpu_ras_error_data_init(&err_data);
amdgpu_umc_handle_bad_pages(adev, &err_data);
- err_cnt = err_data.err_addr_cnt;
amdgpu_ras_error_data_fini(&err_data);
- if (err_cnt && amdgpu_ras_is_rma(adev))
- amdgpu_ras_reset_gpu(adev);
-
amdgpu_ras_schedule_retirement_dwork(con,
AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
}
@@ -3316,49 +3309,39 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
int ret = 0;
struct ras_ecc_log_info *ecc_log;
struct ras_query_if info;
- uint32_t timeout = 0;
+ u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- uint64_t de_queried_count;
- uint32_t new_detect_count, total_detect_count;
- uint32_t need_query_count = poison_creation_count;
+ u64 de_queried_count;
+ u64 consumption_q_count;
enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;
memset(&info, 0, sizeof(info));
info.head.block = AMDGPU_RAS_BLOCK__UMC;
ecc_log = &ras->umc_ecc_log;
- total_detect_count = 0;
+ ecc_log->de_queried_count = 0;
+ ecc_log->consumption_q_count = 0;
+
do {
ret = amdgpu_ras_query_error_status_with_event(adev, &info, type);
if (ret)
return ret;
de_queried_count = ecc_log->de_queried_count;
- if (de_queried_count > ecc_log->prev_de_queried_count) {
- new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
- ecc_log->prev_de_queried_count = de_queried_count;
- timeout = 0;
- } else {
- new_detect_count = 0;
- }
+ consumption_q_count = ecc_log->consumption_q_count;
- if (new_detect_count) {
- total_detect_count += new_detect_count;
- } else {
- if (!timeout && need_query_count)
- timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+ if (de_queried_count && consumption_q_count)
+ break;
- if (timeout) {
- if (!--timeout)
- break;
- msleep(1);
- }
- }
- } while (total_detect_count < need_query_count);
+ msleep(100);
+ } while (--timeout);
- if (total_detect_count)
+ if (de_queried_count)
schedule_delayed_work(&ras->page_retirement_dwork, 0);
+ if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0)
+ amdgpu_ras_reset_gpu(adev);
+
return 0;
}
@@ -3394,6 +3377,12 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
reset_flags |= msg.reset;
}
+ /*
+ * Try to ensure poison creation handler is completed first
+ * to set rma if bad page exceed threshold.
+ */
+ flush_delayed_work(&con->page_retirement_dwork);
+
/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
if (reset_flags && !amdgpu_ras_is_rma(adev)) {
if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
@@ -3403,8 +3392,6 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
else
reset = reset_flags;
- flush_delayed_work(&con->page_retirement_dwork);
-
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
@@ -3446,7 +3433,8 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_sub(poison_creation_count, &con->poison_creation_count);
atomic_sub(poison_creation_count, &con->page_retirement_req_cnt);
}
- } while (atomic_read(&con->poison_creation_count));
+ } while (atomic_read(&con->poison_creation_count) &&
+ !atomic_read(&con->poison_consumption_count));
if (ret != -EIO) {
msg_count = kfifo_len(&con->poison_fifo);
@@ -3463,6 +3451,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
/* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */
/* Clear poison creation request */
atomic_set(&con->poison_creation_count, 0);
+ atomic_set(&con->poison_consumption_count, 0);
/* Clear poison fifo */
amdgpu_ras_clear_poison_fifo(adev);
@@ -3487,6 +3476,8 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_sub(msg_count, &con->page_retirement_req_cnt);
}
+ atomic_set(&con->poison_consumption_count, 0);
+
/* Wake up work to save bad pages to eeprom */
schedule_delayed_work(&con->page_retirement_dwork, 0);
}
@@ -3572,6 +3563,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
+ atomic_set(&con->rma_in_recovery, 0);
con->eeprom_control.bad_channel_bitmap = 0;
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
@@ -3589,6 +3581,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
init_waitqueue_head(&con->page_retirement_wq);
atomic_set(&con->page_retirement_req_cnt, 0);
atomic_set(&con->poison_creation_count, 0);
+ atomic_set(&con->poison_consumption_count, 0);
con->page_retirement_thread =
kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");
if (IS_ERR(con->page_retirement_thread)) {
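The poison-creation handler now polls at a fixed cadence: up to MAX_UMC_POISON_POLLING_TIME_ASYNC (10) attempts, 100 ms apart, breaking early once both the queried and consumption counters are non-zero. A stub-driven sketch of that loop:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static unsigned long de_count, consume_count;

static bool query_once(void)
{
	/* stand-in for amdgpu_ras_query_error_status_with_event() */
	de_count++;
	consume_count++;
	return true;
}

int main(void)
{
	unsigned int timeout = 10; /* MAX_UMC_POISON_POLLING_TIME_ASYNC */

	do {
		if (!query_once())
			return 1;
		if (de_count && consume_count)
			break;             /* both events observed */
		usleep(100 * 1000);        /* kernel: msleep(100) */
	} while (--timeout);

	printf("queried=%lu consumed=%lu\n", de_count, consume_count);
	return 0;
}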
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 927d6bff734a..96cb62a44a35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -492,8 +492,8 @@ struct ras_ecc_err {
struct ras_ecc_log_info {
struct mutex lock;
struct radix_tree_root de_page_tree;
- uint64_t de_queried_count;
- uint64_t prev_de_queried_count;
+ uint64_t de_queried_count;
+ uint64_t consumption_q_count;
};
struct amdgpu_ras {
@@ -515,6 +515,7 @@ struct amdgpu_ras {
/* gpu recovery */
struct work_struct recovery_work;
atomic_t in_recovery;
+ atomic_t rma_in_recovery;
struct amdgpu_device *adev;
/* error handler data */
struct ras_err_handler_data *eh_data;
@@ -557,6 +558,7 @@ struct amdgpu_ras {
struct mutex page_retirement_lock;
atomic_t page_retirement_req_cnt;
atomic_t poison_creation_count;
+ atomic_t poison_consumption_count;
struct mutex page_rsv_lock;
DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
struct ras_ecc_log_info umc_ecc_log;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 9bda9ad13f88..88ded6296be3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -774,9 +774,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
control->tbl_rai.health_percent = 0;
}
ras->is_rma = true;
- /* ignore the -ENOTSUPP return value */
- amdgpu_dpm_send_rma_reason(adev);
}
+
+ /* ignore the -ENOTSUPP return value */
+ amdgpu_dpm_send_rma_reason(adev);
}
if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index c92b8794aa73..2e039fb778ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -252,6 +252,7 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
block, pasid, pasid_fn, data, reset);
if (!ret) {
atomic_inc(&con->page_retirement_req_cnt);
+ atomic_inc(&con->poison_consumption_count);
wake_up(&con->page_retirement_wq);
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 8190c24a649a..a0fb13172e8b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -44,6 +44,39 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
return userq_ip_mask;
}
+int amdgpu_userq_input_va_validate(struct amdgpu_vm *vm, u64 addr,
+ u64 expected_size)
+{
+ struct amdgpu_bo_va_mapping *va_map;
+ u64 user_addr;
+ u64 size;
+ int r = 0;
+
+ user_addr = (addr & AMDGPU_GMC_HOLE_MASK) >> AMDGPU_GPU_PAGE_SHIFT;
+ size = expected_size >> AMDGPU_GPU_PAGE_SHIFT;
+
+ r = amdgpu_bo_reserve(vm->root.bo, false);
+ if (r)
+ return r;
+
+ va_map = amdgpu_vm_bo_lookup_mapping(vm, user_addr);
+ if (!va_map) {
+ r = -EINVAL;
+ goto out_err;
+ }
+ /* Only validate the userq whether resident in the VM mapping range */
+ if (user_addr >= va_map->start &&
+ va_map->last - user_addr + 1 >= size) {
+ amdgpu_bo_unreserve(vm->root.bo);
+ return 0;
+ }
+
+ r = -EINVAL;
+out_err:
+ amdgpu_bo_unreserve(vm->root.bo);
+ return r;
+}
+
static int
amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue)
@@ -404,27 +437,10 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
(args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >>
AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT;
- /* Usermode queues are only supported for GFX IP as of now */
- if (args->in.ip_type != AMDGPU_HW_IP_GFX &&
- args->in.ip_type != AMDGPU_HW_IP_DMA &&
- args->in.ip_type != AMDGPU_HW_IP_COMPUTE) {
- drm_file_err(uq_mgr->file, "Usermode queue doesn't support IP type %u\n",
- args->in.ip_type);
- return -EINVAL;
- }
-
r = amdgpu_userq_priority_permit(filp, priority);
if (r)
return r;
- if ((args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE) &&
- (args->in.ip_type != AMDGPU_HW_IP_GFX) &&
- (args->in.ip_type != AMDGPU_HW_IP_COMPUTE) &&
- !amdgpu_is_tmz(adev)) {
- drm_file_err(uq_mgr->file, "Secure only supported on GFX/Compute queues\n");
- return -EINVAL;
- }
-
r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
if (r < 0) {
drm_file_err(uq_mgr->file, "pm_runtime_get_sync() failed for userqueue create\n");
@@ -456,6 +472,15 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
r = -ENOMEM;
goto unlock;
}
+
+ /* Validate the userq virtual address.*/
+ if (amdgpu_userq_input_va_validate(&fpriv->vm, args->in.queue_va, args->in.queue_size) ||
+ amdgpu_userq_input_va_validate(&fpriv->vm, args->in.rptr_va, AMDGPU_GPU_PAGE_SIZE) ||
+ amdgpu_userq_input_va_validate(&fpriv->vm, args->in.wptr_va, AMDGPU_GPU_PAGE_SIZE)) {
+ r = -EINVAL;
+ kfree(queue);
+ goto unlock;
+ }
queue->doorbell_handle = args->in.doorbell_handle;
queue->queue_type = args->in.ip_type;
queue->vm = &fpriv->vm;
@@ -543,22 +568,45 @@ unlock:
return r;
}
-int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
- struct drm_file *filp)
+static int amdgpu_userq_input_args_validate(struct drm_device *dev,
+ union drm_amdgpu_userq *args,
+ struct drm_file *filp)
{
- union drm_amdgpu_userq *args = data;
- int r;
+ struct amdgpu_device *adev = drm_to_adev(dev);
switch (args->in.op) {
case AMDGPU_USERQ_OP_CREATE:
if (args->in.flags & ~(AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK |
AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE))
return -EINVAL;
- r = amdgpu_userq_create(filp, args);
- if (r)
- drm_file_err(filp, "Failed to create usermode queue\n");
- break;
+ /* Usermode queues are only supported for GFX IP as of now */
+ if (args->in.ip_type != AMDGPU_HW_IP_GFX &&
+ args->in.ip_type != AMDGPU_HW_IP_DMA &&
+ args->in.ip_type != AMDGPU_HW_IP_COMPUTE) {
+ drm_file_err(filp, "Usermode queue doesn't support IP type %u\n",
+ args->in.ip_type);
+ return -EINVAL;
+ }
+
+ if ((args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE) &&
+ (args->in.ip_type != AMDGPU_HW_IP_GFX) &&
+ (args->in.ip_type != AMDGPU_HW_IP_COMPUTE) &&
+ !amdgpu_is_tmz(adev)) {
+ drm_file_err(filp, "Secure only supported on GFX/Compute queues\n");
+ return -EINVAL;
+ }
+ if (args->in.queue_va == AMDGPU_BO_INVALID_OFFSET ||
+ args->in.queue_va == 0 ||
+ args->in.queue_size == 0) {
+ drm_file_err(filp, "invalidate userq queue va or size\n");
+ return -EINVAL;
+ }
+ if (!args->in.wptr_va || !args->in.rptr_va) {
+ drm_file_err(filp, "invalidate userq queue rptr or wptr\n");
+ return -EINVAL;
+ }
+ break;
case AMDGPU_USERQ_OP_FREE:
if (args->in.ip_type ||
args->in.doorbell_handle ||
@@ -572,6 +620,31 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
args->in.mqd ||
args->in.mqd_size)
return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *filp)
+{
+ union drm_amdgpu_userq *args = data;
+ int r;
+
+ if (amdgpu_userq_input_args_validate(dev, args, filp) < 0)
+ return -EINVAL;
+
+ switch (args->in.op) {
+ case AMDGPU_USERQ_OP_CREATE:
+ r = amdgpu_userq_create(filp, args);
+ if (r)
+ drm_file_err(filp, "Failed to create usermode queue\n");
+ break;
+
+ case AMDGPU_USERQ_OP_FREE:
r = amdgpu_userq_destroy(filp, args->in.queue_id);
if (r)
drm_file_err(filp, "Failed to destroy usermode queue\n");
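amdgpu_userq_input_va_validate() accepts a queue address only if the whole range fits inside one VM mapping; start and last are inclusive page numbers, so the containment test reduces to the check sketched below:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct mapping { uint64_t start, last; }; /* inclusive page numbers */

static bool va_in_mapping(const struct mapping *m,
			  uint64_t page, uint64_t npages)
{
	/* page must be inside the mapping and leave room for npages */
	return page >= m->start && m->last - page + 1 >= npages;
}

int main(void)
{
	struct mapping m = { .start = 0x100, .last = 0x1ff };

	printf("%d\n", va_in_mapping(&m, 0x180, 0x80)); /* 1: fits exactly */
	printf("%d\n", va_in_mapping(&m, 0x180, 0x81)); /* 0: spills over */
	return 0;
}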
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index b1ca91b7cda4..8603c31320f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -133,4 +133,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
u32 idx);
+int amdgpu_userq_input_va_validate(struct amdgpu_vm *vm, u64 addr,
+ u64 expected_size);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index c2a983ff23c9..b372baae3979 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -276,7 +276,7 @@ static int amdgpu_userq_fence_create(struct amdgpu_usermode_queue *userq,
/* Check if hardware has already processed the job */
spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
- if (!dma_fence_is_signaled_locked(fence))
+ if (!dma_fence_is_signaled(fence))
list_add_tail(&userq_fence->link, &fence_drv->fences);
else
dma_fence_put(fence);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 3da3ebb1d9a1..58accf2259b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -267,7 +267,8 @@ struct amdgpu_virt {
struct amdgpu_irq_src rcv_irq;
struct work_struct flr_work;
- struct work_struct bad_pages_work;
+ struct work_struct req_bad_pages_work;
+ struct work_struct handle_bad_pages_work;
struct amdgpu_mm_table mm_table;
const struct amdgpu_virt_ops *ops;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
index 121ee17b522b..118fbe38b33a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -322,6 +322,26 @@ static int vpe_early_init(struct amdgpu_ip_block *ip_block)
return 0;
}
+static bool vpe_need_dpm0_at_power_down(struct amdgpu_device *adev)
+{
+ switch (amdgpu_ip_version(adev, VPE_HWIP, 0)) {
+ case IP_VERSION(6, 1, 1):
+ return adev->pm.fw_version < 0x0a640500;
+ default:
+ return false;
+ }
+}
+
+static int vpe_get_dpm_level(struct amdgpu_device *adev)
+{
+ struct amdgpu_vpe *vpe = &adev->vpe;
+
+ if (!adev->pm.dpm_enabled)
+ return 0;
+
+ return RREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_request_lv));
+}
+
static void vpe_idle_work_handler(struct work_struct *work)
{
struct amdgpu_device *adev =
@@ -329,11 +349,17 @@ static void vpe_idle_work_handler(struct work_struct *work)
unsigned int fences = 0;
fences += amdgpu_fence_count_emitted(&adev->vpe.ring);
+ if (fences)
+ goto reschedule;
+
+ if (vpe_need_dpm0_at_power_down(adev) && vpe_get_dpm_level(adev) != 0)
+ goto reschedule;
- if (fences == 0)
- amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VPE, AMD_PG_STATE_GATE);
- else
- schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT);
+ amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VPE, AMD_PG_STATE_GATE);
+ return;
+
+reschedule:
+ schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT);
}
static int vpe_common_init(struct amdgpu_vpe *vpe)
@@ -435,6 +461,8 @@ static int vpe_hw_fini(struct amdgpu_ip_block *ip_block)
struct amdgpu_device *adev = ip_block->adev;
struct amdgpu_vpe *vpe = &adev->vpe;
+ cancel_delayed_work_sync(&adev->vpe.idle_work);
+
vpe_ring_stop(vpe);
/* Power off VPE */
@@ -445,10 +473,6 @@ static int vpe_hw_fini(struct amdgpu_ip_block *ip_block)
static int vpe_suspend(struct amdgpu_ip_block *ip_block)
{
- struct amdgpu_device *adev = ip_block->adev;
-
- cancel_delayed_work_sync(&adev->vpe.idle_work);
-
return vpe_hw_fini(ip_block);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index c417f8689220..699acc1b46b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -406,6 +406,7 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev)
p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev;
p_ddev->driver = adev->xcp_mgr->xcp[i].driver;
p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;
+ amdgpu_xcp_drm_dev_free(p_ddev);
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index bba0b26fee8f..5f36aff17e79 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -126,4 +126,8 @@ uint32_t amdgpu_xgmi_get_max_bandwidth(struct amdgpu_device *adev);
void amgpu_xgmi_set_max_speed_width(struct amdgpu_device *adev,
uint16_t max_speed, uint8_t max_width);
+
+/* Cleanup macro for use with __free(xgmi_put_hive) */
+DEFINE_FREE(xgmi_put_hive, struct amdgpu_hive_info *, if (_T) amdgpu_put_xgmi_hive(_T))
+
#endif
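DEFINE_FREE(xgmi_put_hive, ...) lets amdgpu_pci_error_detected() drop its hive reference automatically when the variable leaves scope. The kernel macro builds on the compiler cleanup attribute, which the following user-space sketch (GCC/Clang only) uses directly:

#include <stdio.h>

struct hive { int refs; };

static void put_hive(struct hive **h)
{
	if (*h) {
		(*h)->refs--; /* kernel: amdgpu_put_xgmi_hive() */
		printf("put, refs=%d\n", (*h)->refs);
	}
}

int main(void)
{
	struct hive real = { .refs = 1 };

	{
		/* reference dropped automatically at end of scope */
		struct hive *h __attribute__((cleanup(put_hive))) = &real;

		printf("using hive, refs=%d\n", h->refs);
	}
	return 0;
}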
diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c
index 427b073de2fc..1a7591ca2f9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/atom.c
+++ b/drivers/gpu/drm/amd/amdgpu/atom.c
@@ -1246,6 +1246,10 @@ static int amdgpu_atom_execute_table_locked(struct atom_context *ctx, int index,
ectx.last_jump_jiffies = 0;
if (ws) {
ectx.ws = kcalloc(4, ws, GFP_KERNEL);
+ if (!ectx.ws) {
+ ret = -ENOMEM;
+ goto free;
+ }
ectx.ws_size = ws;
} else {
ectx.ws = NULL;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 51babf5c78c8..f06bc94cf6e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3562,6 +3562,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
struct amdgpu_device *adev = ring->adev;
struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id];
struct amdgpu_ring *kiq_ring = &kiq->ring;
+ int reset_mode = AMDGPU_RESET_TYPE_PER_QUEUE;
unsigned long flags;
int r;
@@ -3599,6 +3600,7 @@ pipe_reset:
if (!(adev->gfx.compute_supported_reset & AMDGPU_RESET_TYPE_PER_PIPE))
return -EOPNOTSUPP;
r = gfx_v9_4_3_reset_hw_pipe(ring);
+ reset_mode = AMDGPU_RESET_TYPE_PER_PIPE;
dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name,
r ? "failed" : "successfully");
if (r)
@@ -3621,10 +3623,20 @@ pipe_reset:
r = amdgpu_ring_test_ring(kiq_ring);
spin_unlock_irqrestore(&kiq->ring_lock, flags);
if (r) {
+ if (reset_mode == AMDGPU_RESET_TYPE_PER_QUEUE)
+ goto pipe_reset;
+
dev_err(adev->dev, "fail to remap queue\n");
return r;
}
+ if (reset_mode == AMDGPU_RESET_TYPE_PER_QUEUE) {
+ r = amdgpu_ring_test_ring(ring);
+ if (r)
+ goto pipe_reset;
+ }
+
+
return amdgpu_ring_reset_helper_end(ring, timedout_fence);
}
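The kcq reset above now escalates: a failed per-queue remap or ring test falls back to the per-pipe reset via the pipe_reset label instead of returning the error. A compact model of that escalation, in which the fake hardware only recovers at pipe scope:

#include <stdbool.h>
#include <stdio.h>

enum mode { PER_QUEUE, PER_PIPE };

static bool do_reset(enum mode m)  { return m == PER_PIPE; } /* fake HW */
static bool ring_test(enum mode m) { return m == PER_PIPE; }

int main(void)
{
	enum mode m = PER_QUEUE;

retry:
	if (!do_reset(m) || !ring_test(m)) {
		if (m == PER_QUEUE) {
			m = PER_PIPE; /* escalate, as the hunk does */
			goto retry;
		}
		puts("reset failed");
		return 1;
	}
	puts(m == PER_PIPE ? "recovered via pipe reset" : "queue reset ok");
	return 0;
}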
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
index 54523dc1f702..7731ef262d39 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c
@@ -196,6 +196,14 @@ static int jpeg_v5_0_1_sw_init(struct amdgpu_ip_block *ip_block)
}
}
+ if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__JPEG)) {
+ r = amdgpu_jpeg_ras_sw_init(adev);
+ if (r) {
+ dev_err(adev->dev, "Failed to initialize jpeg ras block!\n");
+ return r;
+ }
+ }
+
r = amdgpu_jpeg_reg_dump_init(adev, jpeg_reg_list_5_0_1, ARRAY_SIZE(jpeg_reg_list_5_0_1));
if (r)
return r;
@@ -307,7 +315,7 @@ static int jpeg_v5_0_1_hw_fini(struct amdgpu_ip_block *ip_block)
ret = jpeg_v5_0_1_set_powergating_state(ip_block, AMD_PG_STATE_GATE);
}
- if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__JPEG))
+ if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__JPEG) && !amdgpu_sriov_vf(adev))
amdgpu_irq_put(adev, &adev->jpeg.inst->ras_poison_irq, 0);
return ret;
@@ -1058,6 +1066,11 @@ static int jpeg_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_comm
if (r)
return r;
+ r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__JPEG,
+ &jpeg_v5_0_1_aca_info, NULL);
+ if (r)
+ goto late_fini;
+
if (amdgpu_ras_is_supported(adev, ras_block->block) &&
adev->jpeg.inst->ras_poison_irq.funcs) {
r = amdgpu_irq_get(adev, &adev->jpeg.inst->ras_poison_irq, 0);
@@ -1065,11 +1078,6 @@ static int jpeg_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_comm
goto late_fini;
}
- r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__JPEG,
- &jpeg_v5_0_1_aca_info, NULL);
- if (r)
- goto late_fini;
-
return 0;
late_fini:
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index d6f50b13e2ba..ef54d211214f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -206,6 +206,7 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_mqd *mqd_hw_default = &adev->mqds[queue->queue_type];
struct drm_amdgpu_userq_in *mqd_user = args_in;
struct amdgpu_mqd_prop *userq_props;
+ struct amdgpu_gfx_shadow_info shadow_info;
int r;
/* Structure to initialize MQD for userqueue using generic MQD init function */
@@ -215,13 +216,6 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
return -ENOMEM;
}
- if (!mqd_user->wptr_va || !mqd_user->rptr_va ||
- !mqd_user->queue_va || mqd_user->queue_size == 0) {
- DRM_ERROR("Invalid MQD parameters for userqueue\n");
- r = -EINVAL;
- goto free_props;
- }
-
r = amdgpu_userq_create_object(uq_mgr, &queue->mqd, mqd_hw_default->mqd_size);
if (r) {
DRM_ERROR("Failed to create MQD object for userqueue\n");
@@ -238,6 +232,8 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
userq_props->doorbell_index = queue->doorbell_index;
userq_props->fence_address = queue->fence_drv->gpu_addr;
+ if (adev->gfx.funcs->get_gfx_shadow_info)
+ adev->gfx.funcs->get_gfx_shadow_info(adev, &shadow_info, true);
if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) {
struct drm_amdgpu_userq_mqd_compute_gfx11 *compute_mqd;
@@ -254,6 +250,10 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
goto free_mqd;
}
+ if (amdgpu_userq_input_va_validate(queue->vm, compute_mqd->eop_va,
+ max_t(u32, PAGE_SIZE, AMDGPU_GPU_PAGE_SIZE)))
+ goto free_mqd;
+
userq_props->eop_gpu_addr = compute_mqd->eop_va;
userq_props->hqd_pipe_priority = AMDGPU_GFX_PIPE_PRIO_NORMAL;
userq_props->hqd_queue_priority = AMDGPU_GFX_QUEUE_PRIORITY_MINIMUM;
@@ -281,6 +281,11 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
userq_props->csa_addr = mqd_gfx_v11->csa_va;
userq_props->tmz_queue =
mqd_user->flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE;
+
+ if (amdgpu_userq_input_va_validate(queue->vm, mqd_gfx_v11->shadow_va,
+ shadow_info.shadow_size))
+ goto free_mqd;
+
kfree(mqd_gfx_v11);
} else if (queue->queue_type == AMDGPU_HW_IP_DMA) {
struct drm_amdgpu_userq_mqd_sdma_gfx11 *mqd_sdma_v11;
@@ -298,6 +303,10 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
goto free_mqd;
}
+ if (amdgpu_userq_input_va_validate(queue->vm, mqd_sdma_v11->csa_va,
+ shadow_info.csa_size))
+ goto free_mqd;
+
userq_props->csa_addr = mqd_sdma_v11->csa_va;
kfree(mqd_sdma_v11);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 48101a34e049..9a40107a0869 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -292,14 +292,32 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
}
}
-static void xgpu_ai_mailbox_bad_pages_work(struct work_struct *work)
+static void xgpu_ai_mailbox_req_bad_pages_work(struct work_struct *work)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, bad_pages_work);
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, req_bad_pages_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
if (down_read_trylock(&adev->reset_domain->sem)) {
amdgpu_virt_fini_data_exchange(adev);
amdgpu_virt_request_bad_pages(adev);
+ up_read(&adev->reset_domain->sem);
+ }
+}
+
+/**
+ * xgpu_ai_mailbox_handle_bad_pages_work - Reinitialize the data exchange region to get fresh bad page information
+ * @work: pointer to the work_struct
+ *
+ * This work handler is triggered when bad pages are ready, and it reinitializes
+ * the data exchange region to retrieve updated bad page information from the host.
+ */
+static void xgpu_ai_mailbox_handle_bad_pages_work(struct work_struct *work)
+{
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, handle_bad_pages_work);
+ struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+
+ if (down_read_trylock(&adev->reset_domain->sem)) {
+ amdgpu_virt_fini_data_exchange(adev);
amdgpu_virt_init_data_exchange(adev);
up_read(&adev->reset_domain->sem);
}
@@ -327,10 +345,15 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
switch (event) {
+ case IDH_RAS_BAD_PAGES_READY:
+ xgpu_ai_mailbox_send_ack(adev);
+ if (amdgpu_sriov_runtime(adev))
+ schedule_work(&adev->virt.handle_bad_pages_work);
+ break;
case IDH_RAS_BAD_PAGES_NOTIFICATION:
xgpu_ai_mailbox_send_ack(adev);
if (amdgpu_sriov_runtime(adev))
- schedule_work(&adev->virt.bad_pages_work);
+ schedule_work(&adev->virt.req_bad_pages_work);
break;
case IDH_UNRECOV_ERR_NOTIFICATION:
xgpu_ai_mailbox_send_ack(adev);
@@ -415,7 +438,8 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev)
}
INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work);
- INIT_WORK(&adev->virt.bad_pages_work, xgpu_ai_mailbox_bad_pages_work);
+ INIT_WORK(&adev->virt.req_bad_pages_work, xgpu_ai_mailbox_req_bad_pages_work);
+ INIT_WORK(&adev->virt.handle_bad_pages_work, xgpu_ai_mailbox_handle_bad_pages_work);
return 0;
}
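The single bad_pages_work is split in two: IDH_RAS_BAD_PAGES_NOTIFICATION still triggers a request to the host, while the new IDH_RAS_BAD_PAGES_READY schedules a handler that re-initializes the data-exchange region. A sketch of the dispatch (work items reduced to direct calls):

#include <stdio.h>

enum idh_event { IDH_RAS_BAD_PAGES_READY, IDH_RAS_BAD_PAGES_NOTIFICATION };

static void req_bad_pages_work(void)    { puts("request bad pages"); }
static void handle_bad_pages_work(void) { puts("re-init data exchange"); }

static void rcv_irq(enum idh_event ev)
{
	switch (ev) {
	case IDH_RAS_BAD_PAGES_READY:        /* host data is ready */
		handle_bad_pages_work();
		break;
	case IDH_RAS_BAD_PAGES_NOTIFICATION: /* host has new bad pages */
		req_bad_pages_work();
		break;
	}
}

int main(void)
{
	rcv_irq(IDH_RAS_BAD_PAGES_NOTIFICATION);
	rcv_irq(IDH_RAS_BAD_PAGES_READY);
	return 0;
}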
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index f6d8597452ed..457972aa5632 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -202,9 +202,6 @@ send_request:
case IDH_REQ_RAS_CPER_DUMP:
event = IDH_RAS_CPER_DUMP_READY;
break;
- case IDH_REQ_RAS_BAD_PAGES:
- event = IDH_RAS_BAD_PAGES_READY;
- break;
default:
break;
}
@@ -359,14 +356,32 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
}
}
-static void xgpu_nv_mailbox_bad_pages_work(struct work_struct *work)
+static void xgpu_nv_mailbox_req_bad_pages_work(struct work_struct *work)
{
- struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, bad_pages_work);
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, req_bad_pages_work);
struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
if (down_read_trylock(&adev->reset_domain->sem)) {
amdgpu_virt_fini_data_exchange(adev);
amdgpu_virt_request_bad_pages(adev);
+ up_read(&adev->reset_domain->sem);
+ }
+}
+
+/**
+ * xgpu_nv_mailbox_handle_bad_pages_work - Reinitialize the data exchange region to get fresh bad page information
+ * @work: pointer to the work_struct
+ *
+ * This work handler is triggered when bad pages are ready, and it reinitializes
+ * the data exchange region to retrieve updated bad page information from the host.
+ */
+static void xgpu_nv_mailbox_handle_bad_pages_work(struct work_struct *work)
+{
+ struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, handle_bad_pages_work);
+ struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
+
+ if (down_read_trylock(&adev->reset_domain->sem)) {
+ amdgpu_virt_fini_data_exchange(adev);
amdgpu_virt_init_data_exchange(adev);
up_read(&adev->reset_domain->sem);
}
@@ -397,10 +412,15 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
switch (event) {
+ case IDH_RAS_BAD_PAGES_READY:
+ xgpu_nv_mailbox_send_ack(adev);
+ if (amdgpu_sriov_runtime(adev))
+ schedule_work(&adev->virt.handle_bad_pages_work);
+ break;
case IDH_RAS_BAD_PAGES_NOTIFICATION:
xgpu_nv_mailbox_send_ack(adev);
if (amdgpu_sriov_runtime(adev))
- schedule_work(&adev->virt.bad_pages_work);
+ schedule_work(&adev->virt.req_bad_pages_work);
break;
case IDH_UNRECOV_ERR_NOTIFICATION:
xgpu_nv_mailbox_send_ack(adev);
@@ -485,7 +505,8 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev)
}
INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work);
- INIT_WORK(&adev->virt.bad_pages_work, xgpu_nv_mailbox_bad_pages_work);
+ INIT_WORK(&adev->virt.req_bad_pages_work, xgpu_nv_mailbox_req_bad_pages_work);
+ INIT_WORK(&adev->virt.handle_bad_pages_work, xgpu_nv_mailbox_handle_bad_pages_work);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 9e74c9822e62..9785fada4fa7 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -741,7 +741,6 @@ static void soc15_reg_base_init(struct amdgpu_device *adev)
void soc15_set_virt_ops(struct amdgpu_device *adev)
{
adev->virt.ops = &xgpu_ai_virt_ops;
-
/* init soc15 reg base early enough so we can
* request full access for sriov before
* set_ip_blocks. */
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index e590cbdd8de9..8dc32787d625 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -536,8 +536,11 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
- if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0))
+ /* A bank that does not decode as UMC is a consumption record handled by the SMU */
+ if (hwid != MCA_UMC_HWID_V12_0 || mcatype != MCA_UMC_MCATYPE_V12_0) {
+ con->umc_ecc_log.consumption_q_count++;
return 0;
+ }
if (!status)
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index 68b4371df0f1..d1481e6d57ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -865,6 +865,7 @@ static int vcn_v2_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect)
volatile struct amdgpu_fw_shared *fw_shared = adev->vcn.inst->fw_shared.cpu_addr;
struct amdgpu_ring *ring = &adev->vcn.inst->ring_dec;
uint32_t rb_bufsz, tmp;
+ int ret;
vcn_v2_0_enable_static_power_gating(vinst);
@@ -948,8 +949,13 @@ static int vcn_v2_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect)
UVD, 0, mmUVD_MASTINT_EN),
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
- if (indirect)
- amdgpu_vcn_psp_update_sram(adev, 0, 0);
+ if (indirect) {
+ ret = amdgpu_vcn_psp_update_sram(adev, 0, 0);
+ if (ret) {
+ dev_err(adev->dev, "vcn sram load failed %d\n", ret);
+ return ret;
+ }
+ }
/* force RBC into idle state */
rb_bufsz = order_base_2(ring->ring_size);
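The same check-and-propagate pattern is applied to every VCN start_dpg_mode() variant below: the SRAM load is now treated as fallible, and its error aborts the start sequence instead of being ignored. A self-contained sketch of the pattern, where load_vcn_sram() is a hypothetical stand-in for amdgpu_vcn_psp_update_sram():

#include <stdio.h>

/* Hypothetical stand-in for amdgpu_vcn_psp_update_sram(); returns 0 on
 * success or a negative errno-style code on failure. */
static int load_vcn_sram(void)
{
	return -5; /* simulate -EIO for demonstration */
}

/* Check the SRAM load result and propagate it instead of silently
 * continuing with an unprogrammed engine. */
static int start_dpg_mode(int indirect)
{
	int ret;

	if (indirect) {
		ret = load_vcn_sram();
		if (ret) {
			fprintf(stderr, "vcn sram load failed %d\n", ret);
			return ret;
		}
	}
	/* ring setup would continue here only on success */
	return 0;
}

int main(void)
{
	return start_dpg_mode(1) ? 1 : 0;
}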
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
index f13ed3c1e29c..fdd8e33916f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c
@@ -1012,6 +1012,7 @@ static int vcn_v2_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect)
volatile struct amdgpu_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
struct amdgpu_ring *ring;
uint32_t rb_bufsz, tmp;
+ int ret;
/* disable register anti-hang mechanism */
WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, mmUVD_POWER_STATUS), 1,
@@ -1102,8 +1103,13 @@ static int vcn_v2_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect)
VCN, 0, mmUVD_MASTINT_EN),
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
- if (indirect)
- amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (indirect) {
+ ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (ret) {
+ dev_err(adev->dev, "vcn sram load failed %d\n", ret);
+ return ret;
+ }
+ }
ring = &adev->vcn.inst[inst_idx].ring_dec;
/* force RBC into idle state */
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
index 866222fc10a0..b7c4fcca18bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
@@ -1041,6 +1041,7 @@ static int vcn_v3_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect)
volatile struct amdgpu_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
struct amdgpu_ring *ring;
uint32_t rb_bufsz, tmp;
+ int ret;
/* disable register anti-hang mechanism */
WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, mmUVD_POWER_STATUS), 1,
@@ -1133,8 +1134,13 @@ static int vcn_v3_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect)
WREG32_SOC15_DPG_MODE(inst_idx, SOC15_DPG_MODE_OFFSET(
VCN, inst_idx, mmUVD_VCPU_CNTL), tmp, 0, indirect);
- if (indirect)
- amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (indirect) {
+ ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (ret) {
+ dev_err(adev->dev, "vcn sram load failed %d\n", ret);
+ return ret;
+ }
+ }
ring = &adev->vcn.inst[inst_idx].ring_dec;
/* force RBC into idle state */
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index ac55549e20be..082def4a6bdf 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -1012,6 +1012,7 @@ static int vcn_v4_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect)
volatile struct amdgpu_vcn4_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
struct amdgpu_ring *ring;
uint32_t tmp;
+ int ret;
/* disable register anti-hang mechanism */
WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, regUVD_POWER_STATUS), 1,
@@ -1094,8 +1095,13 @@ static int vcn_v4_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect)
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
- if (indirect)
- amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (indirect) {
+ ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (ret) {
+ dev_err(adev->dev, "vcn sram load failed %d\n", ret);
+ return ret;
+ }
+ }
ring = &adev->vcn.inst[inst_idx].ring_enc[0];
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index ba944a96c070..2e985c4a288a 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -849,7 +849,7 @@ static int vcn_v4_0_3_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
volatile struct amdgpu_vcn4_fw_shared *fw_shared =
adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
struct amdgpu_ring *ring;
- int vcn_inst;
+ int vcn_inst, ret;
uint32_t tmp;
vcn_inst = GET_INST(VCN, inst_idx);
@@ -942,8 +942,13 @@ static int vcn_v4_0_3_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
VCN, 0, regUVD_MASTINT_EN),
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
- if (indirect)
- amdgpu_vcn_psp_update_sram(adev, inst_idx, AMDGPU_UCODE_ID_VCN0_RAM);
+ if (indirect) {
+ ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, AMDGPU_UCODE_ID_VCN0_RAM);
+ if (ret) {
+ dev_err(adev->dev, "vcn sram load failed %d\n", ret);
+ return ret;
+ }
+ }
ring = &adev->vcn.inst[inst_idx].ring_enc[0];
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
index 11fec716e846..3ce49dfd3897 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
@@ -924,6 +924,7 @@ static int vcn_v4_0_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
volatile struct amdgpu_vcn4_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
struct amdgpu_ring *ring;
uint32_t tmp;
+ int ret;
/* disable register anti-hang mechanism */
WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, regUVD_POWER_STATUS), 1,
@@ -1004,8 +1005,13 @@ static int vcn_v4_0_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
VCN, inst_idx, regUVD_MASTINT_EN),
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
- if (indirect)
- amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (indirect) {
+ ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (ret) {
+ dev_err(adev->dev, "vcn sram load failed %d\n", ret);
+ return ret;
+ }
+ }
ring = &adev->vcn.inst[inst_idx].ring_enc[0];
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
index 07a6e9582880..f9dbd2757d83 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c
@@ -713,6 +713,7 @@ static int vcn_v5_0_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
volatile struct amdgpu_vcn5_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
struct amdgpu_ring *ring;
uint32_t tmp;
+ int ret;
/* disable register anti-hang mechanism */
WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, regUVD_POWER_STATUS), 1,
@@ -766,8 +767,13 @@ static int vcn_v5_0_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
VCN, inst_idx, regUVD_MASTINT_EN),
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
- if (indirect)
- amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (indirect) {
+ ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0);
+ if (ret) {
+ dev_err(adev->dev, "%s: vcn sram load failed %d\n", __func__, ret);
+ return ret;
+ }
+ }
ring = &adev->vcn.inst[inst_idx].ring_enc[0];
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
index cdefd7fcb0da..8ef4a8b2fae9 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c
@@ -284,7 +284,7 @@ static int vcn_v5_0_1_hw_fini(struct amdgpu_ip_block *ip_block)
vinst->set_pg_state(vinst, AMD_PG_STATE_GATE);
}
- if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__VCN))
+ if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__VCN) && !amdgpu_sriov_vf(adev))
amdgpu_irq_put(adev, &adev->vcn.inst->ras_poison_irq, 0);
return 0;
@@ -605,7 +605,7 @@ static int vcn_v5_0_1_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
struct amdgpu_ring *ring;
struct dpg_pause_state state = {.fw_based = VCN_DPG_STATE__PAUSE};
- int vcn_inst;
+ int vcn_inst, ret;
uint32_t tmp;
vcn_inst = GET_INST(VCN, inst_idx);
@@ -666,8 +666,13 @@ static int vcn_v5_0_1_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
VCN, 0, regUVD_MASTINT_EN),
UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect);
- if (indirect)
- amdgpu_vcn_psp_update_sram(adev, inst_idx, AMDGPU_UCODE_ID_VCN0_RAM);
+ if (indirect) {
+ ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, AMDGPU_UCODE_ID_VCN0_RAM);
+ if (ret) {
+ dev_err(adev->dev, "vcn sram load failed %d\n", ret);
+ return ret;
+ }
+ }
/* resetting ring, fw should not check RB ring */
fw_shared->sq.queue_mode |= FW_QUEUE_RING_RESET;
@@ -1603,6 +1608,13 @@ static int vcn_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_commo
if (r)
goto late_fini;
+ if (amdgpu_ras_is_supported(adev, ras_block->block) &&
+ adev->vcn.inst->ras_poison_irq.funcs) {
+ r = amdgpu_irq_get(adev, &adev->vcn.inst->ras_poison_irq, 0);
+ if (r)
+ goto late_fini;
+ }
+
return 0;
late_fini: