diff options
| author | saturneric <[email protected]> | 2025-11-16 14:34:50 +0000 |
|---|---|---|
| committer | saturneric <[email protected]> | 2025-11-16 14:34:50 +0000 |
| commit | 690862a8d74fee1e07f33dad44b761753f101779 (patch) | |
| tree | f81bdcab8cd5a640518dba5056de6c62bd071eaf /drivers/gpu/drm/amd/amdgpu | |
| parent | Merge tag 'v6.17.7' into linux-6.17.y (diff) | |
| parent | Linux 6.17.8 (diff) | |
| download | kernel-linux-6.17.y.tar.gz kernel-linux-6.17.y.zip | |
Merge tag 'v6.17.8' into linux-6.17.ylinux-6.17.y
This is the 6.17.8 stable release
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
37 files changed, 561 insertions, 242 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index cbc40cad581b..9b3180449150 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -76,6 +76,7 @@ static void aca_banks_release(struct aca_banks *banks) list_for_each_entry_safe(node, tmp, &banks->list, node) { list_del(&node->node); kvfree(node); + banks->nr_banks--; } } @@ -130,6 +131,27 @@ static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, st RAS_EVENT_LOG(adev, event_id, HW_ERR "hardware error logged by the scrubber\n"); } +static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type) +{ + + struct aca_hwip *hwip; + int hwid, mcatype; + u64 ipid; + + if (!bank || type == ACA_HWIP_TYPE_UNKNOW) + return false; + + hwip = &aca_hwid_mcatypes[type]; + if (!hwip->hwid) + return false; + + ipid = bank->regs[ACA_REG_IDX_IPID]; + hwid = ACA_REG__IPID__HARDWAREID(ipid); + mcatype = ACA_REG__IPID__MCATYPE(ipid); + + return hwip->hwid == hwid && hwip->mcatype == mcatype; +} + static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type, int start, int count, struct aca_banks *banks, struct ras_query_context *qctx) @@ -168,6 +190,15 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_ bank.smu_err_type = type; + /* + * Poison being consumed when injecting a UE while running background workloads, + * which are unexpected. + */ + if (type == ACA_SMU_TYPE_UE && + ACA_REG__STATUS__POISON(bank.regs[ACA_REG_IDX_STATUS]) && + !aca_bank_hwip_is_matched(&bank, ACA_HWIP_TYPE_UMC)) + continue; + aca_smu_bank_dump(adev, i, count, &bank, qctx); ret = aca_banks_add_bank(banks, &bank); @@ -178,27 +209,6 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_ return 0; } -static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type type) -{ - - struct aca_hwip *hwip; - int hwid, mcatype; - u64 ipid; - - if (!bank || type == ACA_HWIP_TYPE_UNKNOW) - return false; - - hwip = &aca_hwid_mcatypes[type]; - if (!hwip->hwid) - return false; - - ipid = bank->regs[ACA_REG_IDX_IPID]; - hwid = ACA_REG__IPID__HARDWAREID(ipid); - mcatype = ACA_REG__IPID__MCATYPE(ipid); - - return hwip->hwid == hwid && hwip->mcatype == mcatype; -} - static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type) { const struct aca_bank_ops *bank_ops = handle->bank_ops; @@ -229,6 +239,7 @@ static struct aca_bank_error *new_bank_error(struct aca_error *aerr, struct aca_ mutex_lock(&aerr->lock); list_add_tail(&bank_error->node, &aerr->list); + aerr->nr_errors++; mutex_unlock(&aerr->lock); return bank_error; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 902eac2c685f..8994ed564f35 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1263,6 +1263,10 @@ static int unmap_bo_from_gpuvm(struct kgd_mem *mem, (void)amdgpu_vm_bo_unmap(adev, bo_va, entry->va); + /* VM entity stopped if process killed, don't clear freed pt bo */ + if (!amdgpu_vm_ready(vm)) + return 0; + (void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update); (void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL); @@ -2993,9 +2997,22 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * struct amdgpu_device *adev = amdgpu_ttm_adev( peer_vm->root.bo->tbo.bdev); + struct amdgpu_fpriv *fpriv = + container_of(peer_vm, struct amdgpu_fpriv, vm); + + ret = amdgpu_vm_bo_update(adev, fpriv->prt_va, false); + if (ret) { + dev_dbg(adev->dev, + "Memory eviction: handle PRT moved failed, pid %8d. Try again.\n", + pid_nr(process_info->pid)); + goto validate_map_fail; + } + ret = amdgpu_vm_handle_moved(adev, peer_vm, &exec.ticket); if (ret) { - pr_debug("Memory eviction: handle moved failed. Try again\n"); + dev_dbg(adev->dev, + "Memory eviction: handle moved failed, pid %8d. Try again.\n", + pid_nr(process_info->pid)); goto validate_map_fail; } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c index 5e375e9c4f5d..bf38fc69c1cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c @@ -1195,29 +1195,69 @@ static void amdgpu_connector_dvi_force(struct drm_connector *connector) amdgpu_connector->use_digital = true; } +/** + * amdgpu_max_hdmi_pixel_clock - Return max supported HDMI (TMDS) pixel clock + * @adev: pointer to amdgpu_device + * + * Return: maximum supported HDMI (TMDS) pixel clock in KHz. + */ +static int amdgpu_max_hdmi_pixel_clock(const struct amdgpu_device *adev) +{ + if (adev->asic_type >= CHIP_POLARIS10) + return 600000; + else if (adev->asic_type >= CHIP_TONGA) + return 300000; + else + return 297000; +} + +/** + * amdgpu_connector_dvi_mode_valid - Validate a mode on DVI/HDMI connectors + * @connector: DRM connector to validate the mode on + * @mode: display mode to validate + * + * Validate the given display mode on DVI and HDMI connectors, including + * analog signals on DVI-I. + * + * Return: drm_mode_status indicating whether the mode is valid. + */ static enum drm_mode_status amdgpu_connector_dvi_mode_valid(struct drm_connector *connector, const struct drm_display_mode *mode) { struct drm_device *dev = connector->dev; struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); + const int max_hdmi_pixel_clock = amdgpu_max_hdmi_pixel_clock(adev); + const int max_dvi_single_link_pixel_clock = 165000; + int max_digital_pixel_clock_khz; /* XXX check mode bandwidth */ - if (amdgpu_connector->use_digital && (mode->clock > 165000)) { - if ((amdgpu_connector->connector_object_id == CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_I) || - (amdgpu_connector->connector_object_id == CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_D) || - (amdgpu_connector->connector_object_id == CONNECTOR_OBJECT_ID_HDMI_TYPE_B)) { - return MODE_OK; - } else if (connector->display_info.is_hdmi) { - /* HDMI 1.3+ supports max clock of 340 Mhz */ - if (mode->clock > 340000) - return MODE_CLOCK_HIGH; - else - return MODE_OK; - } else { - return MODE_CLOCK_HIGH; + if (amdgpu_connector->use_digital) { + switch (amdgpu_connector->connector_object_id) { + case CONNECTOR_OBJECT_ID_HDMI_TYPE_A: + max_digital_pixel_clock_khz = max_hdmi_pixel_clock; + break; + case CONNECTOR_OBJECT_ID_SINGLE_LINK_DVI_I: + case CONNECTOR_OBJECT_ID_SINGLE_LINK_DVI_D: + max_digital_pixel_clock_khz = max_dvi_single_link_pixel_clock; + break; + case CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_I: + case CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_D: + case CONNECTOR_OBJECT_ID_HDMI_TYPE_B: + max_digital_pixel_clock_khz = max_dvi_single_link_pixel_clock * 2; + break; } + + /* When the display EDID claims that it's an HDMI display, + * we use the HDMI encoder mode of the display HW, + * so we should verify against the max HDMI clock here. + */ + if (connector->display_info.is_hdmi) + max_digital_pixel_clock_khz = max_hdmi_pixel_clock; + + if (mode->clock > max_digital_pixel_clock_khz) + return MODE_CLOCK_HIGH; } /* check against the max pixel clock */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c index 25252231a68a..54de38dbaf2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: MIT /* * Copyright 2025 Advanced Micro Devices, Inc. * @@ -68,7 +68,6 @@ void amdgpu_cper_entry_fill_hdr(struct amdgpu_device *adev, hdr->error_severity = sev; hdr->valid_bits.platform_id = 1; - hdr->valid_bits.partition_id = 1; hdr->valid_bits.timestamp = 1; amdgpu_cper_get_timestamp(&hdr->timestamp); @@ -206,6 +205,7 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev { struct cper_sec_desc *section_desc; struct cper_sec_nonstd_err *section; + uint32_t socket_id; section_desc = (struct cper_sec_desc *)((uint8_t *)hdr + SEC_DESC_OFFSET(idx)); section = (struct cper_sec_nonstd_err *)((uint8_t *)hdr + @@ -219,11 +219,17 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev section->hdr.valid_bits.err_context_cnt = 1; section->info.error_type = RUNTIME; + section->info.valid_bits.ms_chk = 1; section->info.ms_chk_bits.err_type_valid = 1; + section->info.ms_chk_bits.err_type = 1; + section->info.ms_chk_bits.pcc = 1; section->ctx.reg_ctx_type = CPER_CTX_TYPE_CRASH; section->ctx.reg_arr_size = sizeof(section->ctx.reg_dump); /* Hardcoded Reg dump for bad page threshold CPER */ + socket_id = (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) ? + adev->smuio.funcs->get_socket_id(adev) : + 0; section->ctx.reg_dump[CPER_ACA_REG_CTL_LO] = 0x1; section->ctx.reg_dump[CPER_ACA_REG_CTL_HI] = 0x0; section->ctx.reg_dump[CPER_ACA_REG_STATUS_LO] = 0x137; @@ -234,8 +240,8 @@ int amdgpu_cper_entry_fill_bad_page_threshold_section(struct amdgpu_device *adev section->ctx.reg_dump[CPER_ACA_REG_MISC0_HI] = 0x0; section->ctx.reg_dump[CPER_ACA_REG_CONFIG_LO] = 0x2; section->ctx.reg_dump[CPER_ACA_REG_CONFIG_HI] = 0x1ff; - section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = 0x0; - section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x96; + section->ctx.reg_dump[CPER_ACA_REG_IPID_LO] = (socket_id / 4) & 0x01; + section->ctx.reg_dump[CPER_ACA_REG_IPID_HI] = 0x096 | (((socket_id % 4) & 0x3) << 12); section->ctx.reg_dump[CPER_ACA_REG_SYND_LO] = 0x0; section->ctx.reg_dump[CPER_ACA_REG_SYND_HI] = 0x0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h index bcb97d245673..353421807387 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cper.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: MIT */ /* * Copyright 2025 Advanced Micro Devices, Inc. * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index d3f220be2ef9..1ce1fd0c87a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -286,7 +286,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, } } - if (!p->gang_size) { + if (!p->gang_size || (amdgpu_sriov_vf(p->adev) && p->gang_size > 1)) { ret = -EINVAL; goto free_all_kdata; } @@ -1767,30 +1767,21 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, { struct amdgpu_device *adev = drm_to_adev(dev); union drm_amdgpu_wait_fences *wait = data; - uint32_t fence_count = wait->in.fence_count; - struct drm_amdgpu_fence *fences_user; struct drm_amdgpu_fence *fences; int r; /* Get the fences from userspace */ - fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence), - GFP_KERNEL); - if (fences == NULL) - return -ENOMEM; - - fences_user = u64_to_user_ptr(wait->in.fences); - if (copy_from_user(fences, fences_user, - sizeof(struct drm_amdgpu_fence) * fence_count)) { - r = -EFAULT; - goto err_free_fences; - } + fences = memdup_array_user(u64_to_user_ptr(wait->in.fences), + wait->in.fence_count, + sizeof(struct drm_amdgpu_fence)); + if (IS_ERR(fences)) + return PTR_ERR(fences); if (wait->in.wait_all) r = amdgpu_cs_wait_all_fences(adev, filp, wait, fences); else r = amdgpu_cs_wait_any_fence(adev, filp, wait, fences); -err_free_fences: kfree(fences); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c8459337fcb8..ddd0e7ab82be 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -95,6 +95,7 @@ MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 #define AMDGPU_MAX_RETRY_LIMIT 2 @@ -2595,6 +2596,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) return 0; chip_name = "navi12"; break; + case CHIP_CYAN_SKILLFISH: + chip_name = "cyan_skillfish"; + break; } err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, @@ -3389,7 +3393,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void) for (i = 0; i < mgpu_info.num_dgpu; i++) { gpu_ins = &(mgpu_info.gpu_ins[i]); adev = gpu_ins->adev; - if (!(adev->flags & AMD_IS_APU) && + if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) && !gpu_ins->mgpu_fan_enabled) { ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); if (ret) @@ -5012,6 +5016,10 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) return 0; + /* No need to evict when going to S5 through S4 callbacks */ + if (system_state == SYSTEM_POWER_OFF) + return 0; + ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); if (ret) { dev_warn(adev->dev, "evicting device resources failed\n"); @@ -6132,12 +6140,11 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle) return ret; } -static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, +static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev, struct list_head *device_list, struct amdgpu_hive_info *hive) { struct amdgpu_device *tmp_adev = NULL; - int r; /* * Build list of devices to reset. @@ -6157,14 +6164,6 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, } else { list_add_tail(&adev->reset_list, device_list); } - - if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { - r = amdgpu_device_health_check(device_list); - if (r) - return r; - } - - return 0; } static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, @@ -6338,23 +6337,28 @@ static int amdgpu_device_sched_resume(struct list_head *device_list, if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); - if (tmp_adev->asic_reset_res) - r = tmp_adev->asic_reset_res; - - tmp_adev->asic_reset_res = 0; - - if (r) { + if (tmp_adev->asic_reset_res) { /* bad news, how to tell it to userspace ? * for ras error, we should report GPU bad status instead of * reset failure */ if (reset_context->src != AMDGPU_RESET_SRC_RAS || !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", - atomic_read(&tmp_adev->gpu_reset_counter)); - amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); + dev_info( + tmp_adev->dev, + "GPU reset(%d) failed with error %d \n", + atomic_read( + &tmp_adev->gpu_reset_counter), + tmp_adev->asic_reset_res); + amdgpu_vf_error_put(tmp_adev, + AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, + tmp_adev->asic_reset_res); + if (!r) + r = tmp_adev->asic_reset_res; + tmp_adev->asic_reset_res = 0; } else { - dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); + dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", + atomic_read(&tmp_adev->gpu_reset_counter)); if (amdgpu_acpi_smart_shift_update(tmp_adev, AMDGPU_SS_DEV_D0)) dev_warn(tmp_adev->dev, @@ -6457,8 +6461,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, reset_context->hive = hive; INIT_LIST_HEAD(&device_list); - if (amdgpu_device_recovery_prepare(adev, &device_list, hive)) - goto end_reset; + amdgpu_device_recovery_prepare(adev, &device_list, hive); + + if (!amdgpu_sriov_vf(adev)) { + r = amdgpu_device_health_check(&device_list); + if (r) + goto end_reset; + } /* We need to lock reset domain only once both for XGMI and single device */ amdgpu_device_recovery_get_reset_lock(adev, &device_list); @@ -6880,7 +6889,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta { struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev); - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + struct amdgpu_hive_info *hive __free(xgmi_put_hive) = + amdgpu_get_xgmi_hive(adev); struct amdgpu_reset_context reset_context; struct list_head device_list; @@ -6911,10 +6921,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta amdgpu_device_recovery_get_reset_lock(adev, &device_list); amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, hive, false); - if (hive) { + if (hive) mutex_unlock(&hive->hive_lock); - amdgpu_put_xgmi_hive(hive); - } return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: /* Permanent error, prepare for device removal */ @@ -6965,12 +6973,6 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) int r = 0, i; u32 memsize; - /* PCI error slot reset should be skipped During RAS recovery */ - if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - amdgpu_ras_in_recovery(adev)) - return PCI_ERS_RESULT_RECOVERED; - dev_info(adev->dev, "PCI error: slot reset callback!!\n"); memset(&reset_context, 0, sizeof(reset_context)); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index e814da2b1422..dd7b2b796427 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -2126,7 +2126,6 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev) case IP_VERSION(11, 0, 5): case IP_VERSION(11, 0, 9): case IP_VERSION(11, 0, 7): - case IP_VERSION(11, 0, 8): case IP_VERSION(11, 0, 11): case IP_VERSION(11, 0, 12): case IP_VERSION(11, 0, 13): @@ -2134,6 +2133,10 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev) case IP_VERSION(11, 5, 2): amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block); break; + case IP_VERSION(11, 0, 8): + if (adev->apu_flags & AMD_APU_IS_CYAN_SKILLFISH2) + amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block); + break; case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): amdgpu_device_ip_block_add(adev, &smu_v12_0_ip_block); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 65f4a76490ea..e60043ac9841 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2172,6 +2172,11 @@ static const struct pci_device_id pciidlist[] = { {0x1002, 0x7410, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ALDEBARAN}, /* CYAN_SKILLFISH */ + {0x1002, 0x13DB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU}, + {0x1002, 0x13F9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU}, + {0x1002, 0x13FA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU}, + {0x1002, 0x13FB, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU}, + {0x1002, 0x13FC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU}, {0x1002, 0x13FE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU}, {0x1002, 0x143F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CYAN_SKILLFISH|AMD_IS_APU}, @@ -2597,6 +2602,7 @@ static int amdgpu_pmops_suspend(struct device *dev) else if (amdgpu_acpi_is_s3_active(adev)) adev->in_s3 = true; if (!adev->in_s0ix && !adev->in_s3) { +#if IS_ENABLED(CONFIG_SUSPEND) /* don't allow going deep first time followed by s2idle the next time */ if (adev->last_suspend_state != PM_SUSPEND_ON && adev->last_suspend_state != pm_suspend_target_state) { @@ -2604,11 +2610,14 @@ static int amdgpu_pmops_suspend(struct device *dev) pm_suspend_target_state); return -EINVAL; } +#endif return 0; } +#if IS_ENABLED(CONFIG_SUSPEND) /* cache the state last used for suspend */ adev->last_suspend_state = pm_suspend_target_state; +#endif return amdgpu_device_suspend(drm_dev, true); } @@ -2933,11 +2942,14 @@ static int amdgpu_drm_release(struct inode *inode, struct file *filp) { struct drm_file *file_priv = filp->private_data; struct amdgpu_fpriv *fpriv = file_priv->driver_priv; + struct drm_device *dev = file_priv->minor->dev; + int idx; - if (fpriv) { + if (fpriv && drm_dev_enter(dev, &idx)) { fpriv->evf_mgr.fd_closing = true; amdgpu_eviction_fence_destroy(&fpriv->evf_mgr); amdgpu_userq_mgr_fini(&fpriv->userq_mgr); + drm_dev_exit(idx); } return drm_release(inode, filp); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c index 82d58ac7afb0..5d5e9ee83a5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c @@ -121,10 +121,12 @@ static void amdgpu_jpeg_idle_work_handler(struct work_struct *work) fences += amdgpu_fence_count_emitted(&adev->jpeg.inst[i].ring_dec[j]); } - if (!fences && !atomic_read(&adev->jpeg.total_submission_cnt)) + if (!fences && !atomic_read(&adev->jpeg.total_submission_cnt)) { + mutex_lock(&adev->jpeg.jpeg_pg_lock); amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_JPEG, AMD_PG_STATE_GATE); - else + mutex_unlock(&adev->jpeg.jpeg_pg_lock); + } else schedule_delayed_work(&adev->jpeg.idle_work, JPEG_IDLE_TIMEOUT); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 540817e296da..893cae9813fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -122,7 +122,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) -#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms @@ -136,9 +136,9 @@ enum amdgpu_ras_retire_page_reservation { atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, uint64_t addr); -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); #ifdef CONFIG_X86_MCE_AMD static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); @@ -169,18 +169,16 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre struct eeprom_table_record err_rec; int ret; - if ((address >= adev->gmc.mc_vram_size) || - (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { + ret = amdgpu_ras_check_bad_page(adev, address); + if (ret == -EINVAL) { dev_warn(adev->dev, - "RAS WARN: input address 0x%llx is invalid.\n", - address); + "RAS WARN: input address 0x%llx is invalid.\n", + address); return -EINVAL; - } - - if (amdgpu_ras_check_bad_page(adev, address)) { + } else if (ret == 1) { dev_warn(adev->dev, - "RAS WARN: 0x%llx has already been marked as bad page!\n", - address); + "RAS WARN: 0x%llx has already been marked as bad page!\n", + address); return 0; } @@ -513,22 +511,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, ret = amdgpu_ras_feature_enable(adev, &data.head, 1); break; case 2: - if ((data.inject.address >= adev->gmc.mc_vram_size && - adev->gmc.mc_vram_size) || - (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) { - dev_warn(adev->dev, "RAS WARN: input address " - "0x%llx is invalid.", + /* umc ce/ue error injection for a bad page is not allowed */ + if (data.head.block == AMDGPU_RAS_BLOCK__UMC) + ret = amdgpu_ras_check_bad_page(adev, data.inject.address); + if (ret == -EINVAL) { + dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.", data.inject.address); - ret = -EINVAL; break; - } - - /* umc ce/ue error injection for a bad page is not allowed */ - if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && - amdgpu_ras_check_bad_page(adev, data.inject.address)) { - dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " - "already been marked as bad!\n", - data.inject.address); + } else if (ret == 1) { + dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n", + data.inject.address); break; } @@ -3134,18 +3126,24 @@ out: return ret; } -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, uint64_t addr) { struct ras_err_handler_data *data = con->eh_data; + struct amdgpu_device *adev = con->adev; int i; + if ((addr >= adev->gmc.mc_vram_size && + adev->gmc.mc_vram_size) || + (addr >= RAS_UMC_INJECT_ADDR_LIMIT)) + return -EINVAL; + addr >>= AMDGPU_GPU_PAGE_SHIFT; for (i = 0; i < data->count; i++) if (addr == data->bps[i].retired_page) - return true; + return 1; - return false; + return 0; } /* @@ -3153,11 +3151,11 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, * * Note: this check is only for umc block */ -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - bool ret = false; + int ret = 0; if (!con || !con->eh_data) return ret; @@ -3241,7 +3239,7 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_q_count = 0; } static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3261,7 +3259,7 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) mutex_destroy(&ecc_log->lock); ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_q_count = 0; } static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, @@ -3287,7 +3285,6 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) page_retirement_dwork.work); struct amdgpu_device *adev = con->adev; struct ras_err_data err_data; - unsigned long err_cnt; /* If gpu reset is ongoing, delay retiring the bad pages */ if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { @@ -3299,13 +3296,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) amdgpu_ras_error_data_init(&err_data); amdgpu_umc_handle_bad_pages(adev, &err_data); - err_cnt = err_data.err_addr_cnt; amdgpu_ras_error_data_fini(&err_data); - if (err_cnt && amdgpu_ras_is_rma(adev)) - amdgpu_ras_reset_gpu(adev); - amdgpu_ras_schedule_retirement_dwork(con, AMDGPU_RAS_RETIRE_PAGE_INTERVAL); } @@ -3316,49 +3309,39 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, int ret = 0; struct ras_ecc_log_info *ecc_log; struct ras_query_if info; - uint32_t timeout = 0; + u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - uint64_t de_queried_count; - uint32_t new_detect_count, total_detect_count; - uint32_t need_query_count = poison_creation_count; + u64 de_queried_count; + u64 consumption_q_count; enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; memset(&info, 0, sizeof(info)); info.head.block = AMDGPU_RAS_BLOCK__UMC; ecc_log = &ras->umc_ecc_log; - total_detect_count = 0; + ecc_log->de_queried_count = 0; + ecc_log->consumption_q_count = 0; + do { ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); if (ret) return ret; de_queried_count = ecc_log->de_queried_count; - if (de_queried_count > ecc_log->prev_de_queried_count) { - new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; - ecc_log->prev_de_queried_count = de_queried_count; - timeout = 0; - } else { - new_detect_count = 0; - } + consumption_q_count = ecc_log->consumption_q_count; - if (new_detect_count) { - total_detect_count += new_detect_count; - } else { - if (!timeout && need_query_count) - timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; + if (de_queried_count && consumption_q_count) + break; - if (timeout) { - if (!--timeout) - break; - msleep(1); - } - } - } while (total_detect_count < need_query_count); + msleep(100); + } while (--timeout); - if (total_detect_count) + if (de_queried_count) schedule_delayed_work(&ras->page_retirement_dwork, 0); + if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) + amdgpu_ras_reset_gpu(adev); + return 0; } @@ -3394,6 +3377,12 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, reset_flags |= msg.reset; } + /* + * Try to ensure poison creation handler is completed first + * to set rma if bad page exceed threshold. + */ + flush_delayed_work(&con->page_retirement_dwork); + /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ if (reset_flags && !amdgpu_ras_is_rma(adev)) { if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) @@ -3403,8 +3392,6 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, else reset = reset_flags; - flush_delayed_work(&con->page_retirement_dwork); - con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); @@ -3446,7 +3433,8 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_sub(poison_creation_count, &con->poison_creation_count); atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); } - } while (atomic_read(&con->poison_creation_count)); + } while (atomic_read(&con->poison_creation_count) && + !atomic_read(&con->poison_consumption_count)); if (ret != -EIO) { msg_count = kfifo_len(&con->poison_fifo); @@ -3463,6 +3451,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ /* Clear poison creation request */ atomic_set(&con->poison_creation_count, 0); + atomic_set(&con->poison_consumption_count, 0); /* Clear poison fifo */ amdgpu_ras_clear_poison_fifo(adev); @@ -3487,6 +3476,8 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_sub(msg_count, &con->page_retirement_req_cnt); } + atomic_set(&con->poison_consumption_count, 0); + /* Wake up work to save bad pages to eeprom */ schedule_delayed_work(&con->page_retirement_dwork, 0); } @@ -3572,6 +3563,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); + atomic_set(&con->rma_in_recovery, 0); con->eeprom_control.bad_channel_bitmap = 0; max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); @@ -3589,6 +3581,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) init_waitqueue_head(&con->page_retirement_wq); atomic_set(&con->page_retirement_req_cnt, 0); atomic_set(&con->poison_creation_count, 0); + atomic_set(&con->poison_consumption_count, 0); con->page_retirement_thread = kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); if (IS_ERR(con->page_retirement_thread)) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 927d6bff734a..96cb62a44a35 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -492,8 +492,8 @@ struct ras_ecc_err { struct ras_ecc_log_info { struct mutex lock; struct radix_tree_root de_page_tree; - uint64_t de_queried_count; - uint64_t prev_de_queried_count; + uint64_t de_queried_count; + uint64_t consumption_q_count; }; struct amdgpu_ras { @@ -515,6 +515,7 @@ struct amdgpu_ras { /* gpu recovery */ struct work_struct recovery_work; atomic_t in_recovery; + atomic_t rma_in_recovery; struct amdgpu_device *adev; /* error handler data */ struct ras_err_handler_data *eh_data; @@ -557,6 +558,7 @@ struct amdgpu_ras { struct mutex page_retirement_lock; atomic_t page_retirement_req_cnt; atomic_t poison_creation_count; + atomic_t poison_consumption_count; struct mutex page_rsv_lock; DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); struct ras_ecc_log_info umc_ecc_log; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 9bda9ad13f88..88ded6296be3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -774,9 +774,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) control->tbl_rai.health_percent = 0; } ras->is_rma = true; - /* ignore the -ENOTSUPP return value */ - amdgpu_dpm_send_rma_reason(adev); } + + /* ignore the -ENOTSUPP return value */ + amdgpu_dpm_send_rma_reason(adev); } if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index c92b8794aa73..2e039fb778ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -252,6 +252,7 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, block, pasid, pasid_fn, data, reset); if (!ret) { atomic_inc(&con->page_retirement_req_cnt); + atomic_inc(&con->poison_consumption_count); wake_up(&con->page_retirement_wq); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 8190c24a649a..a0fb13172e8b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -44,6 +44,39 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) return userq_ip_mask; } +int amdgpu_userq_input_va_validate(struct amdgpu_vm *vm, u64 addr, + u64 expected_size) +{ + struct amdgpu_bo_va_mapping *va_map; + u64 user_addr; + u64 size; + int r = 0; + + user_addr = (addr & AMDGPU_GMC_HOLE_MASK) >> AMDGPU_GPU_PAGE_SHIFT; + size = expected_size >> AMDGPU_GPU_PAGE_SHIFT; + + r = amdgpu_bo_reserve(vm->root.bo, false); + if (r) + return r; + + va_map = amdgpu_vm_bo_lookup_mapping(vm, user_addr); + if (!va_map) { + r = -EINVAL; + goto out_err; + } + /* Only validate the userq whether resident in the VM mapping range */ + if (user_addr >= va_map->start && + va_map->last - user_addr + 1 >= size) { + amdgpu_bo_unreserve(vm->root.bo); + return 0; + } + + r = -EINVAL; +out_err: + amdgpu_bo_unreserve(vm->root.bo); + return r; +} + static int amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_queue *queue) @@ -404,27 +437,10 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) (args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK) >> AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT; - /* Usermode queues are only supported for GFX IP as of now */ - if (args->in.ip_type != AMDGPU_HW_IP_GFX && - args->in.ip_type != AMDGPU_HW_IP_DMA && - args->in.ip_type != AMDGPU_HW_IP_COMPUTE) { - drm_file_err(uq_mgr->file, "Usermode queue doesn't support IP type %u\n", - args->in.ip_type); - return -EINVAL; - } - r = amdgpu_userq_priority_permit(filp, priority); if (r) return r; - if ((args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE) && - (args->in.ip_type != AMDGPU_HW_IP_GFX) && - (args->in.ip_type != AMDGPU_HW_IP_COMPUTE) && - !amdgpu_is_tmz(adev)) { - drm_file_err(uq_mgr->file, "Secure only supported on GFX/Compute queues\n"); - return -EINVAL; - } - r = pm_runtime_get_sync(adev_to_drm(adev)->dev); if (r < 0) { drm_file_err(uq_mgr->file, "pm_runtime_get_sync() failed for userqueue create\n"); @@ -456,6 +472,15 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args) r = -ENOMEM; goto unlock; } + + /* Validate the userq virtual address.*/ + if (amdgpu_userq_input_va_validate(&fpriv->vm, args->in.queue_va, args->in.queue_size) || + amdgpu_userq_input_va_validate(&fpriv->vm, args->in.rptr_va, AMDGPU_GPU_PAGE_SIZE) || + amdgpu_userq_input_va_validate(&fpriv->vm, args->in.wptr_va, AMDGPU_GPU_PAGE_SIZE)) { + r = -EINVAL; + kfree(queue); + goto unlock; + } queue->doorbell_handle = args->in.doorbell_handle; queue->queue_type = args->in.ip_type; queue->vm = &fpriv->vm; @@ -543,22 +568,45 @@ unlock: return r; } -int amdgpu_userq_ioctl(struct drm_device *dev, void *data, - struct drm_file *filp) +static int amdgpu_userq_input_args_validate(struct drm_device *dev, + union drm_amdgpu_userq *args, + struct drm_file *filp) { - union drm_amdgpu_userq *args = data; - int r; + struct amdgpu_device *adev = drm_to_adev(dev); switch (args->in.op) { case AMDGPU_USERQ_OP_CREATE: if (args->in.flags & ~(AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK | AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE)) return -EINVAL; - r = amdgpu_userq_create(filp, args); - if (r) - drm_file_err(filp, "Failed to create usermode queue\n"); - break; + /* Usermode queues are only supported for GFX IP as of now */ + if (args->in.ip_type != AMDGPU_HW_IP_GFX && + args->in.ip_type != AMDGPU_HW_IP_DMA && + args->in.ip_type != AMDGPU_HW_IP_COMPUTE) { + drm_file_err(filp, "Usermode queue doesn't support IP type %u\n", + args->in.ip_type); + return -EINVAL; + } + + if ((args->in.flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE) && + (args->in.ip_type != AMDGPU_HW_IP_GFX) && + (args->in.ip_type != AMDGPU_HW_IP_COMPUTE) && + !amdgpu_is_tmz(adev)) { + drm_file_err(filp, "Secure only supported on GFX/Compute queues\n"); + return -EINVAL; + } + if (args->in.queue_va == AMDGPU_BO_INVALID_OFFSET || + args->in.queue_va == 0 || + args->in.queue_size == 0) { + drm_file_err(filp, "invalidate userq queue va or size\n"); + return -EINVAL; + } + if (!args->in.wptr_va || !args->in.rptr_va) { + drm_file_err(filp, "invalidate userq queue rptr or wptr\n"); + return -EINVAL; + } + break; case AMDGPU_USERQ_OP_FREE: if (args->in.ip_type || args->in.doorbell_handle || @@ -572,6 +620,31 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, args->in.mqd || args->in.mqd_size) return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +int amdgpu_userq_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + union drm_amdgpu_userq *args = data; + int r; + + if (amdgpu_userq_input_args_validate(dev, args, filp) < 0) + return -EINVAL; + + switch (args->in.op) { + case AMDGPU_USERQ_OP_CREATE: + r = amdgpu_userq_create(filp, args); + if (r) + drm_file_err(filp, "Failed to create usermode queue\n"); + break; + + case AMDGPU_USERQ_OP_FREE: r = amdgpu_userq_destroy(filp, args->in.queue_id); if (r) drm_file_err(filp, "Failed to destroy usermode queue\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index b1ca91b7cda4..8603c31320f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -133,4 +133,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev, int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev, u32 idx); +int amdgpu_userq_input_va_validate(struct amdgpu_vm *vm, u64 addr, + u64 expected_size); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c index c2a983ff23c9..b372baae3979 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c @@ -276,7 +276,7 @@ static int amdgpu_userq_fence_create(struct amdgpu_usermode_queue *userq, /* Check if hardware has already processed the job */ spin_lock_irqsave(&fence_drv->fence_list_lock, flags); - if (!dma_fence_is_signaled_locked(fence)) + if (!dma_fence_is_signaled(fence)) list_add_tail(&userq_fence->link, &fence_drv->fences); else dma_fence_put(fence); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 3da3ebb1d9a1..58accf2259b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -267,7 +267,8 @@ struct amdgpu_virt { struct amdgpu_irq_src rcv_irq; struct work_struct flr_work; - struct work_struct bad_pages_work; + struct work_struct req_bad_pages_work; + struct work_struct handle_bad_pages_work; struct amdgpu_mm_table mm_table; const struct amdgpu_virt_ops *ops; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c index 121ee17b522b..118fbe38b33a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c @@ -322,6 +322,26 @@ static int vpe_early_init(struct amdgpu_ip_block *ip_block) return 0; } +static bool vpe_need_dpm0_at_power_down(struct amdgpu_device *adev) +{ + switch (amdgpu_ip_version(adev, VPE_HWIP, 0)) { + case IP_VERSION(6, 1, 1): + return adev->pm.fw_version < 0x0a640500; + default: + return false; + } +} + +static int vpe_get_dpm_level(struct amdgpu_device *adev) +{ + struct amdgpu_vpe *vpe = &adev->vpe; + + if (!adev->pm.dpm_enabled) + return 0; + + return RREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_request_lv)); +} + static void vpe_idle_work_handler(struct work_struct *work) { struct amdgpu_device *adev = @@ -329,11 +349,17 @@ static void vpe_idle_work_handler(struct work_struct *work) unsigned int fences = 0; fences += amdgpu_fence_count_emitted(&adev->vpe.ring); + if (fences) + goto reschedule; + + if (vpe_need_dpm0_at_power_down(adev) && vpe_get_dpm_level(adev) != 0) + goto reschedule; - if (fences == 0) - amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VPE, AMD_PG_STATE_GATE); - else - schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT); + amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VPE, AMD_PG_STATE_GATE); + return; + +reschedule: + schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT); } static int vpe_common_init(struct amdgpu_vpe *vpe) @@ -435,6 +461,8 @@ static int vpe_hw_fini(struct amdgpu_ip_block *ip_block) struct amdgpu_device *adev = ip_block->adev; struct amdgpu_vpe *vpe = &adev->vpe; + cancel_delayed_work_sync(&adev->vpe.idle_work); + vpe_ring_stop(vpe); /* Power off VPE */ @@ -445,10 +473,6 @@ static int vpe_hw_fini(struct amdgpu_ip_block *ip_block) static int vpe_suspend(struct amdgpu_ip_block *ip_block) { - struct amdgpu_device *adev = ip_block->adev; - - cancel_delayed_work_sync(&adev->vpe.idle_work); - return vpe_hw_fini(ip_block); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c index c417f8689220..699acc1b46b5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c @@ -406,6 +406,7 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev) p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev; p_ddev->driver = adev->xcp_mgr->xcp[i].driver; p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager; + amdgpu_xcp_drm_dev_free(p_ddev); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index bba0b26fee8f..5f36aff17e79 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -126,4 +126,8 @@ uint32_t amdgpu_xgmi_get_max_bandwidth(struct amdgpu_device *adev); void amgpu_xgmi_set_max_speed_width(struct amdgpu_device *adev, uint16_t max_speed, uint8_t max_width); + +/* Cleanup macro for use with __free(xgmi_put_hive) */ +DEFINE_FREE(xgmi_put_hive, struct amdgpu_hive_info *, if (_T) amdgpu_put_xgmi_hive(_T)) + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c index 427b073de2fc..1a7591ca2f9a 100644 --- a/drivers/gpu/drm/amd/amdgpu/atom.c +++ b/drivers/gpu/drm/amd/amdgpu/atom.c @@ -1246,6 +1246,10 @@ static int amdgpu_atom_execute_table_locked(struct atom_context *ctx, int index, ectx.last_jump_jiffies = 0; if (ws) { ectx.ws = kcalloc(4, ws, GFP_KERNEL); + if (!ectx.ws) { + ret = -ENOMEM; + goto free; + } ectx.ws_size = ws; } else { ectx.ws = NULL; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 51babf5c78c8..f06bc94cf6e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -3562,6 +3562,7 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring, struct amdgpu_device *adev = ring->adev; struct amdgpu_kiq *kiq = &adev->gfx.kiq[ring->xcc_id]; struct amdgpu_ring *kiq_ring = &kiq->ring; + int reset_mode = AMDGPU_RESET_TYPE_PER_QUEUE; unsigned long flags; int r; @@ -3599,6 +3600,7 @@ pipe_reset: if (!(adev->gfx.compute_supported_reset & AMDGPU_RESET_TYPE_PER_PIPE)) return -EOPNOTSUPP; r = gfx_v9_4_3_reset_hw_pipe(ring); + reset_mode = AMDGPU_RESET_TYPE_PER_PIPE; dev_info(adev->dev, "ring: %s pipe reset :%s\n", ring->name, r ? "failed" : "successfully"); if (r) @@ -3621,10 +3623,20 @@ pipe_reset: r = amdgpu_ring_test_ring(kiq_ring); spin_unlock_irqrestore(&kiq->ring_lock, flags); if (r) { + if (reset_mode == AMDGPU_RESET_TYPE_PER_QUEUE) + goto pipe_reset; + dev_err(adev->dev, "fail to remap queue\n"); return r; } + if (reset_mode == AMDGPU_RESET_TYPE_PER_QUEUE) { + r = amdgpu_ring_test_ring(ring); + if (r) + goto pipe_reset; + } + + return amdgpu_ring_reset_helper_end(ring, timedout_fence); } diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c index 54523dc1f702..7731ef262d39 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_1.c @@ -196,6 +196,14 @@ static int jpeg_v5_0_1_sw_init(struct amdgpu_ip_block *ip_block) } } + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__JPEG)) { + r = amdgpu_jpeg_ras_sw_init(adev); + if (r) { + dev_err(adev->dev, "Failed to initialize jpeg ras block!\n"); + return r; + } + } + r = amdgpu_jpeg_reg_dump_init(adev, jpeg_reg_list_5_0_1, ARRAY_SIZE(jpeg_reg_list_5_0_1)); if (r) return r; @@ -307,7 +315,7 @@ static int jpeg_v5_0_1_hw_fini(struct amdgpu_ip_block *ip_block) ret = jpeg_v5_0_1_set_powergating_state(ip_block, AMD_PG_STATE_GATE); } - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__JPEG)) + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__JPEG) && !amdgpu_sriov_vf(adev)) amdgpu_irq_put(adev, &adev->jpeg.inst->ras_poison_irq, 0); return ret; @@ -1058,6 +1066,11 @@ static int jpeg_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_comm if (r) return r; + r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__JPEG, + &jpeg_v5_0_1_aca_info, NULL); + if (r) + goto late_fini; + if (amdgpu_ras_is_supported(adev, ras_block->block) && adev->jpeg.inst->ras_poison_irq.funcs) { r = amdgpu_irq_get(adev, &adev->jpeg.inst->ras_poison_irq, 0); @@ -1065,11 +1078,6 @@ static int jpeg_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_comm goto late_fini; } - r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__JPEG, - &jpeg_v5_0_1_aca_info, NULL); - if (r) - goto late_fini; - return 0; late_fini: diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c index d6f50b13e2ba..ef54d211214f 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c @@ -206,6 +206,7 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_mqd *mqd_hw_default = &adev->mqds[queue->queue_type]; struct drm_amdgpu_userq_in *mqd_user = args_in; struct amdgpu_mqd_prop *userq_props; + struct amdgpu_gfx_shadow_info shadow_info; int r; /* Structure to initialize MQD for userqueue using generic MQD init function */ @@ -215,13 +216,6 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, return -ENOMEM; } - if (!mqd_user->wptr_va || !mqd_user->rptr_va || - !mqd_user->queue_va || mqd_user->queue_size == 0) { - DRM_ERROR("Invalid MQD parameters for userqueue\n"); - r = -EINVAL; - goto free_props; - } - r = amdgpu_userq_create_object(uq_mgr, &queue->mqd, mqd_hw_default->mqd_size); if (r) { DRM_ERROR("Failed to create MQD object for userqueue\n"); @@ -238,6 +232,8 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, userq_props->doorbell_index = queue->doorbell_index; userq_props->fence_address = queue->fence_drv->gpu_addr; + if (adev->gfx.funcs->get_gfx_shadow_info) + adev->gfx.funcs->get_gfx_shadow_info(adev, &shadow_info, true); if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) { struct drm_amdgpu_userq_mqd_compute_gfx11 *compute_mqd; @@ -254,6 +250,10 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, goto free_mqd; } + if (amdgpu_userq_input_va_validate(queue->vm, compute_mqd->eop_va, + max_t(u32, PAGE_SIZE, AMDGPU_GPU_PAGE_SIZE))) + goto free_mqd; + userq_props->eop_gpu_addr = compute_mqd->eop_va; userq_props->hqd_pipe_priority = AMDGPU_GFX_PIPE_PRIO_NORMAL; userq_props->hqd_queue_priority = AMDGPU_GFX_QUEUE_PRIORITY_MINIMUM; @@ -281,6 +281,11 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, userq_props->csa_addr = mqd_gfx_v11->csa_va; userq_props->tmz_queue = mqd_user->flags & AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE; + + if (amdgpu_userq_input_va_validate(queue->vm, mqd_gfx_v11->shadow_va, + shadow_info.shadow_size)) + goto free_mqd; + kfree(mqd_gfx_v11); } else if (queue->queue_type == AMDGPU_HW_IP_DMA) { struct drm_amdgpu_userq_mqd_sdma_gfx11 *mqd_sdma_v11; @@ -298,6 +303,10 @@ static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr, goto free_mqd; } + if (amdgpu_userq_input_va_validate(queue->vm, mqd_sdma_v11->csa_va, + shadow_info.csa_size)) + goto free_mqd; + userq_props->csa_addr = mqd_sdma_v11->csa_va; kfree(mqd_sdma_v11); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 48101a34e049..9a40107a0869 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -292,14 +292,32 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) } } -static void xgpu_ai_mailbox_bad_pages_work(struct work_struct *work) +static void xgpu_ai_mailbox_req_bad_pages_work(struct work_struct *work) { - struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, bad_pages_work); + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, req_bad_pages_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); if (down_read_trylock(&adev->reset_domain->sem)) { amdgpu_virt_fini_data_exchange(adev); amdgpu_virt_request_bad_pages(adev); + up_read(&adev->reset_domain->sem); + } +} + +/** + * xgpu_ai_mailbox_handle_bad_pages_work - Reinitialize the data exchange region to get fresh bad page information + * @work: pointer to the work_struct + * + * This work handler is triggered when bad pages are ready, and it reinitializes + * the data exchange region to retrieve updated bad page information from the host. + */ +static void xgpu_ai_mailbox_handle_bad_pages_work(struct work_struct *work) +{ + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, handle_bad_pages_work); + struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); + + if (down_read_trylock(&adev->reset_domain->sem)) { + amdgpu_virt_fini_data_exchange(adev); amdgpu_virt_init_data_exchange(adev); up_read(&adev->reset_domain->sem); } @@ -327,10 +345,15 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); switch (event) { + case IDH_RAS_BAD_PAGES_READY: + xgpu_ai_mailbox_send_ack(adev); + if (amdgpu_sriov_runtime(adev)) + schedule_work(&adev->virt.handle_bad_pages_work); + break; case IDH_RAS_BAD_PAGES_NOTIFICATION: xgpu_ai_mailbox_send_ack(adev); if (amdgpu_sriov_runtime(adev)) - schedule_work(&adev->virt.bad_pages_work); + schedule_work(&adev->virt.req_bad_pages_work); break; case IDH_UNRECOV_ERR_NOTIFICATION: xgpu_ai_mailbox_send_ack(adev); @@ -415,7 +438,8 @@ int xgpu_ai_mailbox_get_irq(struct amdgpu_device *adev) } INIT_WORK(&adev->virt.flr_work, xgpu_ai_mailbox_flr_work); - INIT_WORK(&adev->virt.bad_pages_work, xgpu_ai_mailbox_bad_pages_work); + INIT_WORK(&adev->virt.req_bad_pages_work, xgpu_ai_mailbox_req_bad_pages_work); + INIT_WORK(&adev->virt.handle_bad_pages_work, xgpu_ai_mailbox_handle_bad_pages_work); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index f6d8597452ed..457972aa5632 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -202,9 +202,6 @@ send_request: case IDH_REQ_RAS_CPER_DUMP: event = IDH_RAS_CPER_DUMP_READY; break; - case IDH_REQ_RAS_BAD_PAGES: - event = IDH_RAS_BAD_PAGES_READY; - break; default: break; } @@ -359,14 +356,32 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) } } -static void xgpu_nv_mailbox_bad_pages_work(struct work_struct *work) +static void xgpu_nv_mailbox_req_bad_pages_work(struct work_struct *work) { - struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, bad_pages_work); + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, req_bad_pages_work); struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); if (down_read_trylock(&adev->reset_domain->sem)) { amdgpu_virt_fini_data_exchange(adev); amdgpu_virt_request_bad_pages(adev); + up_read(&adev->reset_domain->sem); + } +} + +/** + * xgpu_nv_mailbox_handle_bad_pages_work - Reinitialize the data exchange region to get fresh bad page information + * @work: pointer to the work_struct + * + * This work handler is triggered when bad pages are ready, and it reinitializes + * the data exchange region to retrieve updated bad page information from the host. + */ +static void xgpu_nv_mailbox_handle_bad_pages_work(struct work_struct *work) +{ + struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, handle_bad_pages_work); + struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt); + + if (down_read_trylock(&adev->reset_domain->sem)) { + amdgpu_virt_fini_data_exchange(adev); amdgpu_virt_init_data_exchange(adev); up_read(&adev->reset_domain->sem); } @@ -397,10 +412,15 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); switch (event) { + case IDH_RAS_BAD_PAGES_READY: + xgpu_nv_mailbox_send_ack(adev); + if (amdgpu_sriov_runtime(adev)) + schedule_work(&adev->virt.handle_bad_pages_work); + break; case IDH_RAS_BAD_PAGES_NOTIFICATION: xgpu_nv_mailbox_send_ack(adev); if (amdgpu_sriov_runtime(adev)) - schedule_work(&adev->virt.bad_pages_work); + schedule_work(&adev->virt.req_bad_pages_work); break; case IDH_UNRECOV_ERR_NOTIFICATION: xgpu_nv_mailbox_send_ack(adev); @@ -485,7 +505,8 @@ int xgpu_nv_mailbox_get_irq(struct amdgpu_device *adev) } INIT_WORK(&adev->virt.flr_work, xgpu_nv_mailbox_flr_work); - INIT_WORK(&adev->virt.bad_pages_work, xgpu_nv_mailbox_bad_pages_work); + INIT_WORK(&adev->virt.req_bad_pages_work, xgpu_nv_mailbox_req_bad_pages_work); + INIT_WORK(&adev->virt.handle_bad_pages_work, xgpu_nv_mailbox_handle_bad_pages_work); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 9e74c9822e62..9785fada4fa7 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -741,7 +741,6 @@ static void soc15_reg_base_init(struct amdgpu_device *adev) void soc15_set_virt_ops(struct amdgpu_device *adev) { adev->virt.ops = &xgpu_ai_virt_ops; - /* init soc15 reg base early enough so we can * request request full access for sriov before * set_ip_blocks. */ diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index e590cbdd8de9..8dc32787d625 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -536,8 +536,11 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); - if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) + /* The IP block decode of consumption is SMU */ + if (hwid != MCA_UMC_HWID_V12_0 || mcatype != MCA_UMC_MCATYPE_V12_0) { + con->umc_ecc_log.consumption_q_count++; return 0; + } if (!status) return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c index 68b4371df0f1..d1481e6d57ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c @@ -865,6 +865,7 @@ static int vcn_v2_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect) volatile struct amdgpu_fw_shared *fw_shared = adev->vcn.inst->fw_shared.cpu_addr; struct amdgpu_ring *ring = &adev->vcn.inst->ring_dec; uint32_t rb_bufsz, tmp; + int ret; vcn_v2_0_enable_static_power_gating(vinst); @@ -948,8 +949,13 @@ static int vcn_v2_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect) UVD, 0, mmUVD_MASTINT_EN), UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect); - if (indirect) - amdgpu_vcn_psp_update_sram(adev, 0, 0); + if (indirect) { + ret = amdgpu_vcn_psp_update_sram(adev, 0, 0); + if (ret) { + dev_err(adev->dev, "vcn sram load failed %d\n", ret); + return ret; + } + } /* force RBC into idle state */ rb_bufsz = order_base_2(ring->ring_size); diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index f13ed3c1e29c..fdd8e33916f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -1012,6 +1012,7 @@ static int vcn_v2_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect) volatile struct amdgpu_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr; struct amdgpu_ring *ring; uint32_t rb_bufsz, tmp; + int ret; /* disable register anti-hang mechanism */ WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, mmUVD_POWER_STATUS), 1, @@ -1102,8 +1103,13 @@ static int vcn_v2_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect) VCN, 0, mmUVD_MASTINT_EN), UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect); - if (indirect) - amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (indirect) { + ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (ret) { + dev_err(adev->dev, "vcn sram load failed %d\n", ret); + return ret; + } + } ring = &adev->vcn.inst[inst_idx].ring_dec; /* force RBC into idle state */ diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c index 866222fc10a0..b7c4fcca18bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c @@ -1041,6 +1041,7 @@ static int vcn_v3_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect) volatile struct amdgpu_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr; struct amdgpu_ring *ring; uint32_t rb_bufsz, tmp; + int ret; /* disable register anti-hang mechanism */ WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, mmUVD_POWER_STATUS), 1, @@ -1133,8 +1134,13 @@ static int vcn_v3_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect) WREG32_SOC15_DPG_MODE(inst_idx, SOC15_DPG_MODE_OFFSET( VCN, inst_idx, mmUVD_VCPU_CNTL), tmp, 0, indirect); - if (indirect) - amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (indirect) { + ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (ret) { + dev_err(adev->dev, "vcn sram load failed %d\n", ret); + return ret; + } + } ring = &adev->vcn.inst[inst_idx].ring_dec; /* force RBC into idle state */ diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index ac55549e20be..082def4a6bdf 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -1012,6 +1012,7 @@ static int vcn_v4_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect) volatile struct amdgpu_vcn4_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr; struct amdgpu_ring *ring; uint32_t tmp; + int ret; /* disable register anti-hang mechanism */ WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, regUVD_POWER_STATUS), 1, @@ -1094,8 +1095,13 @@ static int vcn_v4_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, bool indirect) UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect); - if (indirect) - amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (indirect) { + ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (ret) { + dev_err(adev->dev, "vcn sram load failed %d\n", ret); + return ret; + } + } ring = &adev->vcn.inst[inst_idx].ring_enc[0]; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index ba944a96c070..2e985c4a288a 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -849,7 +849,7 @@ static int vcn_v4_0_3_start_dpg_mode(struct amdgpu_vcn_inst *vinst, volatile struct amdgpu_vcn4_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr; struct amdgpu_ring *ring; - int vcn_inst; + int vcn_inst, ret; uint32_t tmp; vcn_inst = GET_INST(VCN, inst_idx); @@ -942,8 +942,13 @@ static int vcn_v4_0_3_start_dpg_mode(struct amdgpu_vcn_inst *vinst, VCN, 0, regUVD_MASTINT_EN), UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect); - if (indirect) - amdgpu_vcn_psp_update_sram(adev, inst_idx, AMDGPU_UCODE_ID_VCN0_RAM); + if (indirect) { + ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, AMDGPU_UCODE_ID_VCN0_RAM); + if (ret) { + dev_err(adev->dev, "vcn sram load failed %d\n", ret); + return ret; + } + } ring = &adev->vcn.inst[inst_idx].ring_enc[0]; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c index 11fec716e846..3ce49dfd3897 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c @@ -924,6 +924,7 @@ static int vcn_v4_0_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst, volatile struct amdgpu_vcn4_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr; struct amdgpu_ring *ring; uint32_t tmp; + int ret; /* disable register anti-hang mechanism */ WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, regUVD_POWER_STATUS), 1, @@ -1004,8 +1005,13 @@ static int vcn_v4_0_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst, VCN, inst_idx, regUVD_MASTINT_EN), UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect); - if (indirect) - amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (indirect) { + ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (ret) { + dev_err(adev->dev, "vcn sram load failed %d\n", ret); + return ret; + } + } ring = &adev->vcn.inst[inst_idx].ring_enc[0]; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c index 07a6e9582880..f9dbd2757d83 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c @@ -713,6 +713,7 @@ static int vcn_v5_0_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, volatile struct amdgpu_vcn5_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr; struct amdgpu_ring *ring; uint32_t tmp; + int ret; /* disable register anti-hang mechanism */ WREG32_P(SOC15_REG_OFFSET(VCN, inst_idx, regUVD_POWER_STATUS), 1, @@ -766,8 +767,13 @@ static int vcn_v5_0_0_start_dpg_mode(struct amdgpu_vcn_inst *vinst, VCN, inst_idx, regUVD_MASTINT_EN), UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect); - if (indirect) - amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (indirect) { + ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); + if (ret) { + dev_err(adev->dev, "%s: vcn sram load failed %d\n", __func__, ret); + return ret; + } + } ring = &adev->vcn.inst[inst_idx].ring_enc[0]; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c index cdefd7fcb0da..8ef4a8b2fae9 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c @@ -284,7 +284,7 @@ static int vcn_v5_0_1_hw_fini(struct amdgpu_ip_block *ip_block) vinst->set_pg_state(vinst, AMD_PG_STATE_GATE); } - if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__VCN)) + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__VCN) && !amdgpu_sriov_vf(adev)) amdgpu_irq_put(adev, &adev->vcn.inst->ras_poison_irq, 0); return 0; @@ -605,7 +605,7 @@ static int vcn_v5_0_1_start_dpg_mode(struct amdgpu_vcn_inst *vinst, adev->vcn.inst[inst_idx].fw_shared.cpu_addr; struct amdgpu_ring *ring; struct dpg_pause_state state = {.fw_based = VCN_DPG_STATE__PAUSE}; - int vcn_inst; + int vcn_inst, ret; uint32_t tmp; vcn_inst = GET_INST(VCN, inst_idx); @@ -666,8 +666,13 @@ static int vcn_v5_0_1_start_dpg_mode(struct amdgpu_vcn_inst *vinst, VCN, 0, regUVD_MASTINT_EN), UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect); - if (indirect) - amdgpu_vcn_psp_update_sram(adev, inst_idx, AMDGPU_UCODE_ID_VCN0_RAM); + if (indirect) { + ret = amdgpu_vcn_psp_update_sram(adev, inst_idx, AMDGPU_UCODE_ID_VCN0_RAM); + if (ret) { + dev_err(adev->dev, "vcn sram load failed %d\n", ret); + return ret; + } + } /* resetting ring, fw should not check RB ring */ fw_shared->sq.queue_mode |= FW_QUEUE_RING_RESET; @@ -1603,6 +1608,13 @@ static int vcn_v5_0_1_ras_late_init(struct amdgpu_device *adev, struct ras_commo if (r) goto late_fini; + if (amdgpu_ras_is_supported(adev, ras_block->block) && + adev->vcn.inst->ras_poison_irq.funcs) { + r = amdgpu_irq_get(adev, &adev->vcn.inst->ras_poison_irq, 0); + if (r) + goto late_fini; + } + return 0; late_fini: |
