Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c   870
1 file changed, 461 insertions, 409 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3d98fc2ad36b..3f001a50b34a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -35,6 +35,7 @@ #include <linux/devcoredump.h> #include <generated/utsrelease.h> #include <linux/pci-p2pdma.h> +#include <linux/apple-gmux.h> #include <drm/drm_aperture.h> #include <drm/drm_atomic_helper.h> @@ -158,76 +159,11 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, return sysfs_emit(buf, "%llu\n", cnt); } -static DEVICE_ATTR(pcie_replay_count, S_IRUGO, +static DEVICE_ATTR(pcie_replay_count, 0444, amdgpu_device_get_pcie_replay_count, NULL); static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); -/** - * DOC: product_name - * - * The amdgpu driver provides a sysfs API for reporting the product name - * for the device - * The file product_name is used for this and returns the product name - * as returned from the FRU. - * NOTE: This is only available for certain server cards - */ - -static ssize_t amdgpu_device_get_product_name(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(ddev); - - return sysfs_emit(buf, "%s\n", adev->product_name); -} - -static DEVICE_ATTR(product_name, S_IRUGO, - amdgpu_device_get_product_name, NULL); - -/** - * DOC: product_number - * - * The amdgpu driver provides a sysfs API for reporting the part number - * for the device - * The file product_number is used for this and returns the part number - * as returned from the FRU. - * NOTE: This is only available for certain server cards - */ - -static ssize_t amdgpu_device_get_product_number(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(ddev); - - return sysfs_emit(buf, "%s\n", adev->product_number); -} - -static DEVICE_ATTR(product_number, S_IRUGO, - amdgpu_device_get_product_number, NULL); - -/** - * DOC: serial_number - * - * The amdgpu driver provides a sysfs API for reporting the serial number - * for the device - * The file serial_number is used for this and returns the serial number - * as returned from the FRU. 
- * NOTE: This is only available for certain server cards - */ - -static ssize_t amdgpu_device_get_serial_number(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(ddev); - - return sysfs_emit(buf, "%s\n", adev->serial); -} - -static DEVICE_ATTR(serial_number, S_IRUGO, - amdgpu_device_get_serial_number, NULL); /** * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control @@ -369,10 +305,16 @@ size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, if (write) { memcpy_toio(addr, buf, count); + /* Make sure HDP write cache flush happens without any reordering + * after the system memory contents are sent over PCIe device + */ mb(); amdgpu_device_flush_hdp(adev, NULL); } else { amdgpu_device_invalidate_hdp(adev, NULL); + /* Make sure HDP read cache is invalidated before issuing a read + * to the PCIe device + */ mb(); memcpy_fromio(buf, addr, count); } @@ -480,8 +422,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, /* * MMIO register read with bytes helper functions * @offset:bytes offset from MMIO start - * -*/ + */ /** * amdgpu_mm_rreg8 - read a memory mapped IO register @@ -505,8 +446,8 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) * MMIO register write with bytes helper functions * @offset:bytes offset from MMIO start * @value: the value want to be written to the register - * -*/ + */ + /** * amdgpu_mm_wreg8 - read a memory mapped IO register * @@ -570,7 +511,8 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, * this function is invoked only for the debugfs register access */ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, - uint32_t reg, uint32_t v) + uint32_t reg, uint32_t v, + uint32_t xcc_id) { if (amdgpu_device_skip_hw_access(adev)) return; @@ -579,7 +521,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) - return amdgpu_sriov_wreg(adev, reg, v, 0, 0); + return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); } else if ((reg * 4) >= adev->rmmio_size) { adev->pcie_wreg(adev, reg * 4, v); } else { @@ -588,119 +530,73 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, } /** - * amdgpu_mm_rdoorbell - read a doorbell dword - * - * @adev: amdgpu_device pointer - * @index: doorbell index - * - * Returns the value in the doorbell aperture at the - * requested doorbell index (CIK). - */ -u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) -{ - if (amdgpu_device_skip_hw_access(adev)) - return 0; - - if (index < adev->doorbell.num_doorbells) { - return readl(adev->doorbell.ptr + index); - } else { - DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); - return 0; - } -} - -/** - * amdgpu_mm_wdoorbell - write a doorbell dword + * amdgpu_device_indirect_rreg - read an indirect register * * @adev: amdgpu_device pointer - * @index: doorbell index - * @v: value to write + * @reg_addr: indirect register address to read from * - * Writes @v to the doorbell aperture at the - * requested doorbell index (CIK). 
+ * Returns the value of indirect register @reg_addr */ -void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) +u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, + u32 reg_addr) { - if (amdgpu_device_skip_hw_access(adev)) - return; - - if (index < adev->doorbell.num_doorbells) { - writel(v, adev->doorbell.ptr + index); - } else { - DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); - } -} + unsigned long flags, pcie_index, pcie_data; + void __iomem *pcie_index_offset; + void __iomem *pcie_data_offset; + u32 r; -/** - * amdgpu_mm_rdoorbell64 - read a doorbell Qword - * - * @adev: amdgpu_device pointer - * @index: doorbell index - * - * Returns the value in the doorbell aperture at the - * requested doorbell index (VEGA10+). - */ -u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) -{ - if (amdgpu_device_skip_hw_access(adev)) - return 0; + pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); + pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); - if (index < adev->doorbell.num_doorbells) { - return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); - } else { - DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); - return 0; - } -} + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; -/** - * amdgpu_mm_wdoorbell64 - write a doorbell Qword - * - * @adev: amdgpu_device pointer - * @index: doorbell index - * @v: value to write - * - * Writes @v to the doorbell aperture at the - * requested doorbell index (VEGA10+). - */ -void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) -{ - if (amdgpu_device_skip_hw_access(adev)) - return; + writel(reg_addr, pcie_index_offset); + readl(pcie_index_offset); + r = readl(pcie_data_offset); + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); - if (index < adev->doorbell.num_doorbells) { - atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); - } else { - DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); - } + return r; } -/** - * amdgpu_device_indirect_rreg - read an indirect register - * - * @adev: amdgpu_device pointer - * @pcie_index: mmio register offset - * @pcie_data: mmio register offset - * @reg_addr: indirect register address to read from - * - * Returns the value of indirect register @reg_addr - */ -u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, - u32 pcie_index, u32 pcie_data, - u32 reg_addr) +u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, + u64 reg_addr) { - unsigned long flags; + unsigned long flags, pcie_index, pcie_index_hi, pcie_data; u32 r; void __iomem *pcie_index_offset; + void __iomem *pcie_index_hi_offset; void __iomem *pcie_data_offset; + pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); + pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); + if (adev->nbio.funcs->get_pcie_index_hi_offset) + pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); + else + pcie_index_hi = 0; + spin_lock_irqsave(&adev->pcie_idx_lock, flags); pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + if (pcie_index_hi != 0) + pcie_index_hi_offset = (void __iomem *)adev->rmmio + + pcie_index_hi * 4; writel(reg_addr, pcie_index_offset); readl(pcie_index_offset); + if (pcie_index_hi != 0) { + writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + 
} r = readl(pcie_data_offset); + + /* clear the high bits */ + if (pcie_index_hi != 0) { + writel(0, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + } + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); return r; @@ -710,20 +606,20 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, * amdgpu_device_indirect_rreg64 - read a 64bits indirect register * * @adev: amdgpu_device pointer - * @pcie_index: mmio register offset - * @pcie_data: mmio register offset * @reg_addr: indirect register address to read from * * Returns the value of indirect register @reg_addr */ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, - u32 pcie_index, u32 pcie_data, u32 reg_addr) { - unsigned long flags; - u64 r; + unsigned long flags, pcie_index, pcie_data; void __iomem *pcie_index_offset; void __iomem *pcie_data_offset; + u64 r; + + pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); + pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); spin_lock_irqsave(&adev->pcie_idx_lock, flags); pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; @@ -746,28 +642,68 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, * amdgpu_device_indirect_wreg - write an indirect register address * * @adev: amdgpu_device pointer - * @pcie_index: mmio register offset - * @pcie_data: mmio register offset * @reg_addr: indirect register offset * @reg_data: indirect register data * */ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, - u32 pcie_index, u32 pcie_data, u32 reg_addr, u32 reg_data) { - unsigned long flags; + unsigned long flags, pcie_index, pcie_data; + void __iomem *pcie_index_offset; + void __iomem *pcie_data_offset; + + pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); + pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); + + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + + writel(reg_addr, pcie_index_offset); + readl(pcie_index_offset); + writel(reg_data, pcie_data_offset); + readl(pcie_data_offset); + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); +} + +void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, + u64 reg_addr, u32 reg_data) +{ + unsigned long flags, pcie_index, pcie_index_hi, pcie_data; void __iomem *pcie_index_offset; + void __iomem *pcie_index_hi_offset; void __iomem *pcie_data_offset; + pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); + pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); + if (adev->nbio.funcs->get_pcie_index_hi_offset) + pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); + else + pcie_index_hi = 0; + spin_lock_irqsave(&adev->pcie_idx_lock, flags); pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + if (pcie_index_hi != 0) + pcie_index_hi_offset = (void __iomem *)adev->rmmio + + pcie_index_hi * 4; writel(reg_addr, pcie_index_offset); readl(pcie_index_offset); + if (pcie_index_hi != 0) { + writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + } writel(reg_data, pcie_data_offset); readl(pcie_data_offset); + + /* clear the high bits */ + if (pcie_index_hi != 0) { + writel(0, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + } + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); } @@ -775,20 +711,20 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, * amdgpu_device_indirect_wreg64 - 
write a 64bits indirect register address * * @adev: amdgpu_device pointer - * @pcie_index: mmio register offset - * @pcie_data: mmio register offset * @reg_addr: indirect register offset * @reg_data: indirect register data * */ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, - u32 pcie_index, u32 pcie_data, u32 reg_addr, u64 reg_data) { - unsigned long flags; + unsigned long flags, pcie_index, pcie_data; void __iomem *pcie_index_offset; void __iomem *pcie_data_offset; + pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); + pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); + spin_lock_irqsave(&adev->pcie_idx_lock, flags); pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; @@ -807,6 +743,18 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, } /** + * amdgpu_device_get_rev_id - query device rev_id + * + * @adev: amdgpu_device pointer + * + * Return device rev_id + */ +u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev) +{ + return adev->nbio.funcs->get_rev_id(adev); +} + +/** * amdgpu_invalid_rreg - dummy reg read function * * @adev: amdgpu_device pointer @@ -823,6 +771,13 @@ static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) return 0; } +static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg) +{ + DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); + BUG(); + return 0; +} + /** * amdgpu_invalid_wreg - dummy reg write function * @@ -840,6 +795,13 @@ static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32 BUG(); } +static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v) +{ + DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n", + reg, v); + BUG(); +} + /** * amdgpu_invalid_rreg64 - dummy 64 bit reg read function * @@ -923,12 +885,20 @@ static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, */ static int amdgpu_device_asic_init(struct amdgpu_device *adev) { + int ret; + amdgpu_asic_pre_asic_init(adev); - if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) - return amdgpu_atomfirmware_asic_init(adev, true); - else + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || + adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) { + amdgpu_psp_wait_for_bootloader(adev); + ret = amdgpu_atomfirmware_asic_init(adev, true); + return ret; + } else { return amdgpu_atom_asic_init(adev->mode_info.atom_context); + } + + return 0; } /** @@ -968,7 +938,7 @@ static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) * @registers: pointer to the register array * @array_size: size of the register array * - * Programs an array or registers with and and or masks. + * Programs an array or registers with and or masks. * This is a helper for setting golden registers. */ void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, @@ -981,7 +951,7 @@ void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, if (array_size % 3) return; - for (i = 0; i < array_size; i +=3) { + for (i = 0; i < array_size; i += 3) { reg = registers[i + 0]; and_mask = registers[i + 1]; or_mask = registers[i + 2]; @@ -1026,82 +996,6 @@ int amdgpu_device_pci_reset(struct amdgpu_device *adev) } /* - * GPU doorbell aperture helpers function. - */ -/** - * amdgpu_device_doorbell_init - Init doorbell driver information. 
- * - * @adev: amdgpu_device pointer - * - * Init doorbell driver information (CIK) - * Returns 0 on success, error on failure. - */ -static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) -{ - - /* No doorbell on SI hardware generation */ - if (adev->asic_type < CHIP_BONAIRE) { - adev->doorbell.base = 0; - adev->doorbell.size = 0; - adev->doorbell.num_doorbells = 0; - adev->doorbell.ptr = NULL; - return 0; - } - - if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) - return -EINVAL; - - amdgpu_asic_init_doorbell_index(adev); - - /* doorbell bar mapping */ - adev->doorbell.base = pci_resource_start(adev->pdev, 2); - adev->doorbell.size = pci_resource_len(adev->pdev, 2); - - if (adev->enable_mes) { - adev->doorbell.num_doorbells = - adev->doorbell.size / sizeof(u32); - } else { - adev->doorbell.num_doorbells = - min_t(u32, adev->doorbell.size / sizeof(u32), - adev->doorbell_index.max_assignment+1); - if (adev->doorbell.num_doorbells == 0) - return -EINVAL; - - /* For Vega, reserve and map two pages on doorbell BAR since SDMA - * paging queue doorbell use the second page. The - * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the - * doorbells are in the first page. So with paging queue enabled, - * the max num_doorbells should + 1 page (0x400 in dword) - */ - if (adev->asic_type >= CHIP_VEGA10) - adev->doorbell.num_doorbells += 0x400; - } - - adev->doorbell.ptr = ioremap(adev->doorbell.base, - adev->doorbell.num_doorbells * - sizeof(u32)); - if (adev->doorbell.ptr == NULL) - return -ENOMEM; - - return 0; -} - -/** - * amdgpu_device_doorbell_fini - Tear down doorbell driver information. - * - * @adev: amdgpu_device pointer - * - * Tear down doorbell driver information (CIK) - */ -static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) -{ - iounmap(adev->doorbell.ptr); - adev->doorbell.ptr = NULL; -} - - - -/* * amdgpu_device_wb_*() * Writeback is the method by which the GPU updates special pages in memory * with the status of certain GPU events (fences, ring pointers,etc.). @@ -1210,10 +1104,13 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); struct pci_bus *root; struct resource *res; - unsigned i; + unsigned int i; u16 cmd; int r; + if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) + return 0; + /* Bypass for VF */ if (amdgpu_sriov_vf(adev)) return 0; @@ -1248,7 +1145,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) cmd & ~PCI_COMMAND_MEMORY); /* Free the VRAM and doorbell BAR, we most likely need to move both. */ - amdgpu_device_doorbell_fini(adev); + amdgpu_doorbell_fini(adev); if (adev->asic_type >= CHIP_BONAIRE) pci_release_resource(adev->pdev, 2); @@ -1265,7 +1162,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) /* When the doorbell or fb BAR isn't available we have no chance of * using the device. */ - r = amdgpu_device_doorbell_init(adev); + r = amdgpu_doorbell_init(adev); if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) return -ENODEV; @@ -1274,6 +1171,14 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) return 0; } +static bool amdgpu_device_read_bios(struct amdgpu_device *adev) +{ + if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) + return false; + + return true; +} + /* * GPU helpers function. 
*/ @@ -1293,6 +1198,9 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return false; + if (!amdgpu_device_read_bios(adev)) + return false; + if (amdgpu_passthrough(adev)) { /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot * some old smc fw still need driver do vPost otherwise gpu hang, while @@ -1302,6 +1210,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) if (adev->asic_type == CHIP_FIJI) { int err; uint32_t fw_ver; + err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); /* force vPost if error occured */ if (err) @@ -1335,6 +1244,51 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) return true; } +/* + * On APUs with >= 64GB white flickering has been observed w/ SG enabled. + * Disable S/G on such systems until we have a proper fix. + * https://gitlab.freedesktop.org/drm/amd/-/issues/2354 + * https://gitlab.freedesktop.org/drm/amd/-/issues/2735 + */ +bool amdgpu_sg_display_supported(struct amdgpu_device *adev) +{ + switch (amdgpu_sg_display) { + case -1: + break; + case 0: + return false; + case 1: + return true; + default: + return false; + } + if ((totalram_pages() << (PAGE_SHIFT - 10)) + + (adev->gmc.real_vram_size / 1024) >= 64000000) { + DRM_WARN("Disabling S/G due to >=64GB RAM\n"); + return false; + } + return true; +} + +/* + * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic + * speed switching. Until we have confirmation from Intel that a specific host + * supports it, it's safer that we keep it disabled for all. + * + * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ + * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 + */ +bool amdgpu_device_pcie_dynamic_switching_supported(void) +{ +#if IS_ENABLED(CONFIG_X86) + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor == X86_VENDOR_INTEL) + return false; +#endif + return true; +} + /** * amdgpu_device_should_use_aspm - check if the device should program ASPM * @@ -1385,6 +1339,7 @@ static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, bool state) { struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); + amdgpu_asic_set_vga_state(adev, state); if (state) return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | @@ -1407,7 +1362,8 @@ static void amdgpu_device_check_block_size(struct amdgpu_device *adev) { /* defines number of bits in page table versus page directory, * a page is 4KB so we have 12 bits offset, minimum 9 bits in the - * page table and the remaining bits are in the page directory */ + * page table and the remaining bits are in the page directory + */ if (amdgpu_vm_block_size == -1) return; @@ -1530,7 +1486,7 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", amdgpu_sched_jobs); amdgpu_sched_jobs = 4; - } else if (!is_power_of_2(amdgpu_sched_jobs)){ + } else if (!is_power_of_2(amdgpu_sched_jobs)) { dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n", amdgpu_sched_jobs); amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs); @@ -1639,7 +1595,7 @@ static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); - /* + /* * FIXME: open_count is protected by drm_global_mutex but that would lead to * locking inversion with the driver load path. And the access here is * completely racy anyway. 
So don't bother with locking for now. @@ -2167,7 +2123,6 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) adev->has_pr3 = parent ? pci_pr3_present(parent) : false; } - amdgpu_amdkfd_device_probe(adev); adev->pm.pp_feature = amdgpu_pp_feature_mask; if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) @@ -2178,7 +2133,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) total = true; for (i = 0; i < adev->num_ip_blocks; i++) { if ((amdgpu_ip_block_mask & (1 << i)) == 0) { - DRM_ERROR("disabled ip block: %d <%s>\n", + DRM_WARN("disabled ip block: %d <%s>\n", i, adev->ip_blocks[i].version->funcs->name); adev->ip_blocks[i].status.valid = false; } else { @@ -2204,14 +2159,16 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) return r; /* Read BIOS */ - if (!amdgpu_get_bios(adev)) - return -EINVAL; + if (amdgpu_device_read_bios(adev)) { + if (!amdgpu_get_bios(adev)) + return -EINVAL; - r = amdgpu_atombios_init(adev); - if (r) { - dev_err(adev->dev, "amdgpu_atombios_init failed\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); - return r; + r = amdgpu_atombios_init(adev); + if (r) { + dev_err(adev->dev, "amdgpu_atombios_init failed\n"); + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0); + return r; + } } /*get pf2vf msg info at it's earliest time*/ @@ -2223,6 +2180,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) if (!total) return -ENODEV; + amdgpu_amdkfd_device_probe(adev); adev->cg_flags &= amdgpu_cg_mask; adev->pg_flags &= amdgpu_pg_mask; @@ -2348,7 +2306,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) } r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, - ring->num_hw_submission, amdgpu_job_hang_limit, + ring->num_hw_submission, 0, timeout, adev->reset_domain->wq, ring->sched_score, ring->name, adev->dev); @@ -2359,6 +2317,8 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) } } + amdgpu_xcp_update_partition_sched_list(adev); + return 0; } @@ -2425,7 +2385,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) adev->ip_blocks[i].status.hw = true; /* right after GMC hw init, we create CSA */ - if (amdgpu_mcbp) { + if (adev->gfx.mcbp) { r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj, AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT, @@ -2516,14 +2476,14 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) goto init_failed; /* Don't init kfd if whole hive need to be reset during init */ - if (!adev->gmc.xgmi.pending_reset) + if (!adev->gmc.xgmi.pending_reset) { + kgd2kfd_init_zone_device(adev); amdgpu_amdkfd_device_init(adev); + } amdgpu_fru_get_product_info(adev); init_failed: - if (amdgpu_sriov_vf(adev)) - amdgpu_virt_release_full_gpu(adev, true); return r; } @@ -2744,8 +2704,9 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ - if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| - adev->asic_type == CHIP_ALDEBARAN )) + if (amdgpu_passthrough(adev) && + ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) || + adev->asic_type == CHIP_ALDEBARAN)) amdgpu_dpm_handle_passthrough_sbr(adev, true); if (adev->gmc.xgmi.num_physical_nodes > 1) { @@ -3074,7 +3035,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) } 
adev->ip_blocks[i].status.hw = false; /* handle putting the SMC in the appropriate state */ - if(!amdgpu_sriov_vf(adev)){ + if (!amdgpu_sriov_vf(adev)) { if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); if (r) { @@ -3164,9 +3125,11 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_DCE, AMD_IP_BLOCK_TYPE_GFX, AMD_IP_BLOCK_TYPE_SDMA, + AMD_IP_BLOCK_TYPE_MES, AMD_IP_BLOCK_TYPE_UVD, AMD_IP_BLOCK_TYPE_VCE, - AMD_IP_BLOCK_TYPE_VCN + AMD_IP_BLOCK_TYPE_VCN, + AMD_IP_BLOCK_TYPE_JPEG }; for (i = 0; i < ARRAY_SIZE(ip_order); i++) { @@ -3277,7 +3240,7 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) * * Main resume function for hardware IPs. The hardware IPs * are split into two resume functions because they are - * are also used in in recovering from a GPU reset and some additional + * also used in recovering from a GPU reset and some additional * steps need to be take between them. In this case (S3/S4) they are * run sequentially. * Returns 0 on success, negative error code on failure. @@ -3286,10 +3249,6 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev) { int r; - r = amdgpu_amdkfd_resume_iommu(adev); - if (r) - return r; - r = amdgpu_device_ip_resume_phase1(adev); if (r) return r; @@ -3377,8 +3336,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) #else default: if (amdgpu_dc > 0) - DRM_INFO_ONCE("Display Core has been requested via kernel parameter " - "but isn't supported by ASIC, ignoring\n"); + DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); return false; #endif } @@ -3534,13 +3492,28 @@ static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) } static const struct attribute *amdgpu_dev_attributes[] = { - &dev_attr_product_name.attr, - &dev_attr_product_number.attr, - &dev_attr_serial_number.attr, &dev_attr_pcie_replay_count.attr, NULL }; +static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) +{ + if (amdgpu_mcbp == 1) + adev->gfx.mcbp = true; + else if (amdgpu_mcbp == 0) + adev->gfx.mcbp = false; + else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && + (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && + adev->gfx.num_gfx_rings) + adev->gfx.mcbp = true; + + if (amdgpu_sriov_vf(adev)) + adev->gfx.mcbp = true; + + if (adev->gfx.mcbp) + DRM_INFO("MCBP is enabled\n"); +} + /** * amdgpu_device_init - initialize the driver * @@ -3559,6 +3532,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, int r, i; bool px = false; u32 max_MBps; + int tmp; adev->shutdown = false; adev->flags = flags; @@ -3588,6 +3562,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->smc_wreg = &amdgpu_invalid_wreg; adev->pcie_rreg = &amdgpu_invalid_rreg; adev->pcie_wreg = &amdgpu_invalid_wreg; + adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext; + adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext; adev->pciep_rreg = &amdgpu_invalid_rreg; adev->pciep_wreg = &amdgpu_invalid_wreg; adev->pcie_rreg64 = &amdgpu_invalid_rreg64; @@ -3606,13 +3582,15 @@ int amdgpu_device_init(struct amdgpu_device *adev, pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); /* mutex initialization are all done here so we - * can recall function without having locking issues */ + * can recall function without having locking issues + */ mutex_init(&adev->firmware.mutex); mutex_init(&adev->pm.mutex); mutex_init(&adev->gfx.gpu_clock_mutex); 
mutex_init(&adev->srbm_mutex); mutex_init(&adev->gfx.pipe_reserve_mutex); mutex_init(&adev->gfx.gfx_off_mutex); + mutex_init(&adev->gfx.partition_mutex); mutex_init(&adev->grbm_idx_mutex); mutex_init(&adev->mn_lock); mutex_init(&adev->virt.vf_errors.lock); @@ -3682,16 +3660,11 @@ int amdgpu_device_init(struct amdgpu_device *adev, atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); - if (adev->rmmio == NULL) { + if (!adev->rmmio) return -ENOMEM; - } - DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); - DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); - - amdgpu_device_get_pcie_info(adev); - if (amdgpu_mcbp) - DRM_INFO("MCBP is enabled\n"); + DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); + DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); /* * Reset domain needs to be present early, before XGMI hive discovered @@ -3705,6 +3678,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, /* detect hw virtualization here */ amdgpu_detect_virtualization(adev); + amdgpu_device_get_pcie_info(adev); + r = amdgpu_device_get_job_timeout_settings(adev); if (r) { dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); @@ -3716,6 +3691,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) return r; + amdgpu_device_set_mcbp(adev); + /* Get rid of things like offb */ r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); if (r) @@ -3733,20 +3710,29 @@ int amdgpu_device_init(struct amdgpu_device *adev, } /* enable PCIE atomic ops */ - if (amdgpu_sriov_vf(adev)) - adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) - adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == - (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); - else + if (amdgpu_sriov_vf(adev)) { + if (adev->virt.fw_reserve.p_pf2vf) + adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) + adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags == + (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); + /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a + * internal path natively support atomics, set have_atomics_support to true. + */ + } else if ((adev->flags & AMD_IS_APU) && + (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { + adev->have_atomics_support = true; + } else { adev->have_atomics_support = !pci_enable_atomic_ops_to_root(adev->pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); + } + if (!adev->have_atomics_support) dev_info(adev->dev, "PCIE atomic ops is not supported\n"); /* doorbell bar mapping and doorbell index init*/ - amdgpu_device_doorbell_init(adev); + amdgpu_doorbell_init(adev); if (amdgpu_emu_mode == 1) { /* post the asic on emulation mode */ @@ -3757,7 +3743,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, amdgpu_reset_init(adev); /* detect if we are with an SRIOV vbios */ - amdgpu_device_detect_sriov_bios(adev); + if (adev->bios) + amdgpu_device_detect_sriov_bios(adev); /* check if we need to reset the asic * E.g., driver was not cleanly unloaded previously, etc. @@ -3780,7 +3767,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } } else { + tmp = amdgpu_reset_method; + /* It should do a default reset when loading or reloading the driver, + * regardless of the module parameter reset_method. 
+ */ + amdgpu_reset_method = AMD_RESET_METHOD_NONE; r = amdgpu_asic_reset(adev); + amdgpu_reset_method = tmp; if (r) { dev_err(adev->dev, "asic reset on init failed\n"); goto failed; @@ -3788,8 +3781,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } - pci_enable_pcie_error_reporting(adev->pdev); - /* Post card if necessary */ if (amdgpu_device_need_post(adev)) { if (!adev->bios) { @@ -3805,25 +3796,27 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } - if (adev->is_atom_fw) { - /* Initialize clocks */ - r = amdgpu_atomfirmware_get_clock_info(adev); - if (r) { - dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); - goto failed; - } - } else { - /* Initialize clocks */ - r = amdgpu_atombios_get_clock_info(adev); - if (r) { - dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); - goto failed; + if (adev->bios) { + if (adev->is_atom_fw) { + /* Initialize clocks */ + r = amdgpu_atomfirmware_get_clock_info(adev); + if (r) { + dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n"); + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); + goto failed; + } + } else { + /* Initialize clocks */ + r = amdgpu_atombios_get_clock_info(adev); + if (r) { + dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n"); + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0); + goto failed; + } + /* init i2c buses */ + if (!amdgpu_device_has_dc_support(adev)) + amdgpu_atombios_i2c_init(adev); } - /* init i2c buses */ - if (!amdgpu_device_has_dc_support(adev)) - amdgpu_atombios_i2c_init(adev); } fence_driver_init: @@ -3840,18 +3833,6 @@ fence_driver_init: r = amdgpu_device_ip_init(adev); if (r) { - /* failed in exclusive mode due to timeout */ - if (amdgpu_sriov_vf(adev) && - !amdgpu_sriov_runtime(adev) && - amdgpu_virt_mmio_blocked(adev) && - !amdgpu_virt_wait_reset(adev)) { - dev_err(adev->dev, "VF exclusive mode timeout\n"); - /* Don't send request since VF is inactive. */ - adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; - adev->virt.ops = NULL; - r = -EAGAIN; - goto release_ras_con; - } dev_err(adev->dev, "amdgpu_device_ip_init failed\n"); amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); goto release_ras_con; @@ -3878,12 +3859,14 @@ fence_driver_init: /* Get a log2 for easy divisions. */ adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); + r = amdgpu_atombios_sysfs_init(adev); + if (r) + drm_err(&adev->ddev, + "registering atombios sysfs failed (%d).\n", r); + r = amdgpu_pm_sysfs_init(adev); - if (r) { - adev->pm_sysfs_en = false; - DRM_ERROR("registering pm debugfs failed (%d).\n", r); - } else - adev->pm_sysfs_en = true; + if (r) + DRM_ERROR("registering pm sysfs failed (%d).\n", r); r = amdgpu_ucode_sysfs_init(adev); if (r) { @@ -3892,14 +3875,6 @@ fence_driver_init: } else adev->ucode_sysfs_en = true; - r = amdgpu_psp_sysfs_init(adev); - if (r) { - adev->psp_sysfs_en = false; - if (!amdgpu_sriov_vf(adev)) - DRM_ERROR("Creating psp sysfs failed\n"); - } else - adev->psp_sysfs_en = true; - /* * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. 
* Otherwise the mgpu fan boost feature will be skipped due to the @@ -3923,13 +3898,17 @@ fence_driver_init: msecs_to_jiffies(AMDGPU_RESUME_MS)); } - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev)) { + amdgpu_virt_release_full_gpu(adev, true); flush_delayed_work(&adev->delayed_init_work); + } r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); if (r) dev_err(adev->dev, "Could not create amdgpu device attr\n"); + amdgpu_fru_sysfs_init(adev); + if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); if (r) @@ -3941,16 +3920,20 @@ fence_driver_init: /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ /* this will fail for cards that aren't VGA class devices, just - * ignore it */ + * ignore it + */ if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); - if (amdgpu_device_supports_px(ddev)) { - px = true; + px = amdgpu_device_supports_px(ddev); + + if (px || (!pci_is_thunderbolt_attached(adev->pdev) && + apple_gmux_detect(NULL, NULL))) vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px); + + if (px) vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); - } if (adev->gmc.xgmi.pending_reset) queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, @@ -3961,6 +3944,20 @@ fence_driver_init: return 0; release_ras_con: + if (amdgpu_sriov_vf(adev)) + amdgpu_virt_release_full_gpu(adev, true); + + /* failed in exclusive mode due to timeout */ + if (amdgpu_sriov_vf(adev) && + !amdgpu_sriov_runtime(adev) && + amdgpu_virt_mmio_blocked(adev) && + !amdgpu_virt_wait_reset(adev)) { + dev_err(adev->dev, "VF exclusive mode timeout\n"); + /* Don't send request since VF is inactive. */ + adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME; + adev->virt.ops = NULL; + r = -EAGAIN; + } amdgpu_release_ras_context(adev); failed: @@ -3976,7 +3973,7 @@ static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); /* Unmap all mapped bars - Doorbell, registers and VRAM */ - amdgpu_device_doorbell_fini(adev); + amdgpu_doorbell_fini(adev); iounmap(adev->rmmio); adev->rmmio = NULL; @@ -3985,7 +3982,7 @@ static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) adev->mman.aper_base_kaddr = NULL; /* Memory manager related */ - if (!adev->gmc.xgmi.connected_to_cpu) { + if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { arch_phys_wc_del(adev->gmc.vram_mtrr); arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size); } @@ -4007,7 +4004,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) /* make sure IB test finished before entering exclusive mode * to avoid preemption on IB test - * */ + */ if (amdgpu_sriov_vf(adev)) { amdgpu_virt_request_full_gpu(adev, false); amdgpu_virt_fini_data_exchange(adev); @@ -4015,7 +4012,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) /* disable all interrupts */ amdgpu_irq_disable_all(adev); - if (adev->mode_info.mode_config_initialized){ + if (adev->mode_info.mode_config_initialized) { if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) drm_helper_force_disable_all(adev_to_drm(adev)); else @@ -4026,13 +4023,12 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) if (adev->mman.initialized) drain_workqueue(adev->mman.bdev.wq); - if (adev->pm_sysfs_en) + if (adev->pm.sysfs_initialized) amdgpu_pm_sysfs_fini(adev); if (adev->ucode_sysfs_en) amdgpu_ucode_sysfs_fini(adev); - if (adev->psp_sysfs_en) - amdgpu_psp_sysfs_fini(adev); 
sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); + amdgpu_fru_sysfs_fini(adev); /* disable ras feature must before hw fini */ amdgpu_ras_pre_fini(adev); @@ -4054,6 +4050,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) void amdgpu_device_fini_sw(struct amdgpu_device *adev) { int idx; + bool px; amdgpu_fence_driver_sw_fini(adev); amdgpu_device_ip_fini(adev); @@ -4072,10 +4069,16 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) kfree(adev->bios); adev->bios = NULL; - if (amdgpu_device_supports_px(adev_to_drm(adev))) { + + px = amdgpu_device_supports_px(adev_to_drm(adev)); + + if (px || (!pci_is_thunderbolt_attached(adev->pdev) && + apple_gmux_detect(NULL, NULL))) vga_switcheroo_unregister_client(adev->pdev); + + if (px) vga_switcheroo_fini_domain_pm_ops(adev->dev); - } + if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) vga_client_unregister(adev->pdev); @@ -4083,7 +4086,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) iounmap(adev->rmmio); adev->rmmio = NULL; - amdgpu_device_doorbell_fini(adev); + amdgpu_doorbell_fini(adev); drm_dev_exit(idx); } @@ -4164,6 +4167,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); cancel_delayed_work_sync(&adev->delayed_init_work); + flush_delayed_work(&adev->gfx.gfx_off_delay_work); amdgpu_ras_suspend(adev); @@ -4471,7 +4475,11 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) dev_info(adev->dev, "recover vram bo from shadow start\n"); mutex_lock(&adev->shadow_list_lock); list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) { - shadow = &vmbo->bo; + /* If vm is compute context or adev is APU, shadow will be NULL */ + if (!vmbo->shadow) + continue; + shadow = vmbo->shadow; + /* No need to recover an evicted BO */ if (shadow->tbo.resource->mem_type != TTM_PL_TT || shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET || @@ -4538,6 +4546,10 @@ retry: r = amdgpu_virt_reset_gpu(adev); if (r) return r; + amdgpu_irq_gpu_reset_resume_helper(adev); + + /* some sw clean up VF needs to do before recover */ + amdgpu_virt_post_reset(adev); /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); @@ -4564,7 +4576,6 @@ retry: amdgpu_put_xgmi_hive(hive); if (!r) { - amdgpu_irq_gpu_reset_resume_helper(adev); r = amdgpu_ib_ring_tests(adev); amdgpu_amdkfd_post_reset(adev); @@ -4669,42 +4680,55 @@ disabled: int amdgpu_device_mode1_reset(struct amdgpu_device *adev) { - u32 i; - int ret = 0; + u32 i; + int ret = 0; + + amdgpu_atombios_scratch_regs_engine_hung(adev, true); - amdgpu_atombios_scratch_regs_engine_hung(adev, true); + dev_info(adev->dev, "GPU mode1 reset\n"); - dev_info(adev->dev, "GPU mode1 reset\n"); + /* disable BM */ + pci_clear_master(adev->pdev); - /* disable BM */ - pci_clear_master(adev->pdev); + amdgpu_device_cache_pci_state(adev->pdev); + + if (amdgpu_dpm_is_mode1_reset_supported(adev)) { + dev_info(adev->dev, "GPU smu mode1 reset\n"); + ret = amdgpu_dpm_mode1_reset(adev); + } else { + dev_info(adev->dev, "GPU psp mode1 reset\n"); + ret = psp_gpu_reset(adev); + } + + if (ret) + goto mode1_reset_failed; - amdgpu_device_cache_pci_state(adev->pdev); + amdgpu_device_load_pci_state(adev->pdev); + ret = amdgpu_psp_wait_for_bootloader(adev); + if (ret) + goto mode1_reset_failed; - if (amdgpu_dpm_is_mode1_reset_supported(adev)) { - dev_info(adev->dev, "GPU smu mode1 reset\n"); - ret = amdgpu_dpm_mode1_reset(adev); - } else { - dev_info(adev->dev, "GPU psp mode1 reset\n"); - ret = 
psp_gpu_reset(adev); - } + /* wait for asic to come out of reset */ + for (i = 0; i < adev->usec_timeout; i++) { + u32 memsize = adev->nbio.funcs->get_memsize(adev); - if (ret) - dev_err(adev->dev, "GPU mode1 reset failed\n"); + if (memsize != 0xffffffff) + break; + udelay(1); + } - amdgpu_device_load_pci_state(adev->pdev); + if (i >= adev->usec_timeout) { + ret = -ETIMEDOUT; + goto mode1_reset_failed; + } - /* wait for asic to come out of reset */ - for (i = 0; i < adev->usec_timeout; i++) { - u32 memsize = adev->nbio.funcs->get_memsize(adev); + amdgpu_atombios_scratch_regs_engine_hung(adev, false); - if (memsize != 0xffffffff) - break; - udelay(1); - } + return 0; - amdgpu_atombios_scratch_regs_engine_hung(adev, false); - return ret; +mode1_reset_failed: + dev_err(adev->dev, "GPU mode1 reset failed\n"); + return ret; } int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, @@ -4732,8 +4756,9 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (!ring || !ring->sched.thread) continue; - /*clear job fence from fence drv to avoid force_completion - *leave NULL and vm flush fence in fence drv */ + /* Clear job fence from fence drv to avoid force_completion + * leave NULL and vm flush fence in fence drv + */ amdgpu_fence_driver_clear_job_fences(ring); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ @@ -4747,7 +4772,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, r = amdgpu_reset_prepare_hwcontext(adev, reset_context); /* If reset handler not implemented, continue; otherwise return */ - if (r == -ENOSYS) + if (r == -EOPNOTSUPP) r = 0; else return r; @@ -4844,7 +4869,7 @@ static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) struct drm_device *dev = adev_to_drm(adev); ktime_get_ts64(&adev->reset_time); - dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, + dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT, amdgpu_devcoredump_read, amdgpu_devcoredump_free); } #endif @@ -4865,7 +4890,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, reset_context->reset_device_list = device_list_handle; r = amdgpu_reset_perform_reset(tmp_adev, reset_context); /* If reset handler not implemented, continue; otherwise return */ - if (r == -ENOSYS) + if (r == -EOPNOTSUPP) r = 0; else return r; @@ -4943,9 +4968,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, dev_warn(tmp_adev->dev, "asic atom init failed!"); } else { dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); - r = amdgpu_amdkfd_resume_iommu(tmp_adev); - if (r) - goto out; r = amdgpu_device_ip_resume_phase1(tmp_adev); if (r) @@ -5154,6 +5176,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) * * @adev: amdgpu_device pointer * @job: which job trigger hang + * @reset_context: amdgpu reset context pointer * * Attempt to reset the GPU if it has hung (all asics). * Attempt to do soft-reset or full-reset and reinitialize Asic @@ -5323,8 +5346,9 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ if (r) adev->asic_reset_res = r; - /* Aldebaran supports ras in SRIOV, so need resume ras during reset */ - if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) + /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || + adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) amdgpu_ras_resume(adev); } else { r = amdgpu_do_asic_reset(device_list_handle, reset_context); @@ -5352,9 +5376,8 @@ skip_hw_reset: if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) amdgpu_mes_self_test(tmp_adev); - if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { + if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); - } if (tmp_adev->asic_reset_res) r = tmp_adev->asic_reset_res; @@ -5431,7 +5454,7 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap; /* covers APUs as well */ - if (pci_is_root_bus(adev->pdev->bus)) { + if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) { if (adev->pm.pcie_gen_mask == 0) adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK; if (adev->pm.pcie_mlw_mask == 0) @@ -5593,7 +5616,7 @@ int amdgpu_device_baco_enter(struct drm_device *dev) struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - if (!amdgpu_device_supports_baco(adev_to_drm(adev))) + if (!amdgpu_device_supports_baco(dev)) return -ENOTSUPP; if (ras && adev->ras_enabled && @@ -5609,7 +5632,7 @@ int amdgpu_device_baco_exit(struct drm_device *dev) struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int ret = 0; - if (!amdgpu_device_supports_baco(adev_to_drm(adev))) + if (!amdgpu_device_supports_baco(dev)) return -ENOTSUPP; ret = amdgpu_dpm_baco_exit(adev); @@ -5912,6 +5935,7 @@ void amdgpu_device_halt(struct amdgpu_device *adev) struct pci_dev *pdev = adev->pdev; struct drm_device *ddev = adev_to_drm(adev); + amdgpu_xcp_dev_unplug(adev); drm_dev_unplug(ddev); amdgpu_irq_disable_all(adev); @@ -6032,3 +6056,31 @@ bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) return true; } } + +uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, + uint32_t inst, uint32_t reg_addr, char reg_name[], + uint32_t expected_value, uint32_t mask) +{ + uint32_t ret = 0; + uint32_t old_ = 0; + uint32_t tmp_ = RREG32(reg_addr); + uint32_t loop = adev->usec_timeout; + + while ((tmp_ & (mask)) != (expected_value)) { + if (old_ != tmp_) { + loop = adev->usec_timeout; + old_ = tmp_; + } else + udelay(1); + tmp_ = RREG32(reg_addr); + loop--; + if (!loop) { + DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn", + inst, reg_name, (uint32_t)expected_value, + (uint32_t)(tmp_ & (mask))); + ret = -ETIMEDOUT; + break; + } + } + return ret; +} |
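
The new amdgpu_device_indirect_rreg_ext()/wreg_ext() paths above extend the familiar index/data register window: the low 32 bits of the target address go into the PCIE index register, an optional high-index register carries bits 32..39, each write is followed by a read-back to flush it, and the high bits are cleared again before the lock is dropped. Below is a minimal user-space sketch of that access pattern, assuming a simulated register file and made-up window offsets; the real driver works on ioremap()ed MMIO, takes the offsets from adev->nbio.funcs callbacks, holds pcie_idx_lock, and skips the high-index writes when the NBIO block has no such register.

/*
 * Illustrative simulation only: the index/data "window" access pattern used
 * by amdgpu_device_indirect_rreg_ext(). The offsets, register-file size and
 * mmio_* helpers are invented for the example; they are not the driver's
 * real MMIO layout.
 */
#include <stdint.h>
#include <stdio.h>

#define PCIE_INDEX_OFFSET    0x38   /* hypothetical low index register  */
#define PCIE_INDEX_HI_OFFSET 0x44   /* hypothetical high index register */
#define PCIE_DATA_OFFSET     0x3c   /* hypothetical data window         */

static uint32_t index_lo, index_hi;        /* current window position   */
static uint32_t regfile[1 << 18];          /* simulated register space  */

static void mmio_write(uint32_t offset, uint32_t v)
{
	if (offset == PCIE_INDEX_OFFSET)
		index_lo = v;
	else if (offset == PCIE_INDEX_HI_OFFSET)
		index_hi = v;
}

static uint32_t mmio_read(uint32_t offset)
{
	if (offset == PCIE_DATA_OFFSET) {
		uint64_t addr = ((uint64_t)index_hi << 32) | index_lo;

		return regfile[(addr / 4) % (1 << 18)];
	}
	return 0;  /* read-backs of the index registers just flush the write */
}

/*
 * Same ordering as the patch: program the low index, read it back to flush
 * the posted write, program the high byte, read the data window, then clear
 * the high bits so later 32-bit accesses are not redirected.
 */
static uint32_t indirect_rreg_ext(uint64_t reg_addr)
{
	uint32_t r;

	mmio_write(PCIE_INDEX_OFFSET, (uint32_t)reg_addr);
	(void)mmio_read(PCIE_INDEX_OFFSET);
	mmio_write(PCIE_INDEX_HI_OFFSET, (reg_addr >> 32) & 0xff);
	(void)mmio_read(PCIE_INDEX_HI_OFFSET);

	r = mmio_read(PCIE_DATA_OFFSET);

	mmio_write(PCIE_INDEX_HI_OFFSET, 0);
	(void)mmio_read(PCIE_INDEX_HI_OFFSET);

	return r;
}

int main(void)
{
	regfile[0x1234] = 0xdeadbeef;
	printf("0x%08x\n", indirect_rreg_ext(0x1234ULL * 4));
	return 0;
}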

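The amdgpu_device_wait_on_rreg() helper added at the end of the patch polls a register until the masked value matches the expected one, refilling its microsecond budget whenever the sampled value changes, so only a register that has stopped moving is reported as timed out. The following is a hedged stand-alone sketch of that loop against an invented fake_status() register and an arbitrary budget of 100 iterations; the kernel version reads through RREG32(), udelay(1)s between identical samples, and uses adev->usec_timeout as the budget.

/*
 * Illustrative sketch only: the polling loop behind
 * amdgpu_device_wait_on_rreg(), rewritten against a plain function pointer
 * so it runs in user space. fake_status() and the budget of 100 are invented.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Wait until (read() & mask) == expected. The budget is refilled whenever
 * the sampled value changes, so slowly progressing hardware is not declared
 * hung; only a value that stops changing exhausts the timeout.
 */
static int wait_on_reg(uint32_t (*read)(void), uint32_t expected,
		       uint32_t mask, uint32_t usec_timeout)
{
	uint32_t loop = usec_timeout;
	uint32_t old = 0;
	uint32_t tmp = read();

	while ((tmp & mask) != expected) {
		if (old != tmp) {
			loop = usec_timeout;   /* value moved: restart budget */
			old = tmp;
		}
		/* the kernel helper udelay(1)s here between identical samples */
		tmp = read();
		if (!--loop)
			return -ETIMEDOUT;
	}
	return 0;
}

/* Fake status register that reports "ready" (bit 0) after a few reads. */
static uint32_t fake_status(void)
{
	static uint32_t calls;

	return ++calls > 5 ? 0x1 : 0x0;
}

int main(void)
{
	printf("wait_on_reg returned %d\n",
	       wait_on_reg(fake_status, 0x1, 0x1, 100));
	return 0;
}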