aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
diff options
context:
space:
mode:
authorLinus Torvalds <[email protected]>2025-07-31 02:26:49 +0000
committerLinus Torvalds <[email protected]>2025-07-31 02:26:49 +0000
commit260f6f4fda93c8485c8037865c941b42b9cba5d2 (patch)
tree587a0ea46d3351f63250d19860b01da8217ac774 /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
parentMerge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm (diff)
parentMerge tag 'drm-misc-next-fixes-2025-07-24' of https://gitlab.freedesktop.org/... (diff)
downloadkernel-260f6f4fda93c8485c8037865c941b42b9cba5d2.tar.gz
kernel-260f6f4fda93c8485c8037865c941b42b9cba5d2.zip
Merge tag 'drm-next-2025-07-30' of https://gitlab.freedesktop.org/drm/kernel
Pull drm updates from Dave Airlie: "Highlights: - Intel xe enable Panthor Lake, started adding WildCat Lake - amdgpu has a bunch of reset improvments along with the usual IP updates - msm got VM_BIND support which is important for vulkan sparse memory - more drm_panic users - gpusvm common code to handle a bunch of core SVM work outside drivers. Detail summary: Changes outside drm subdirectory: - 'shrink_shmem_memory()' for better shmem/hibernate interaction - Rust support infrastructure: - make ETIMEDOUT available - add size constants up to SZ_2G - add DMA coherent allocation bindings - mtd driver for Intel GPU non-volatile storage - i2c designware quirk for Intel xe core: - atomic helpers: tune enable/disable sequences - add task info to wedge API - refactor EDID quirks - connector: move HDR sink to drm_display_info - fourcc: half-float and 32-bit float formats - mode_config: pass format info to simplify dma-buf: - heaps: Give CMA heap a stable name ci: - add device tree validation and kunit displayport: - change AUX DPCD access probe address - add quirk for DPCD probe - add panel replay definitions - backlight control helpers fbdev: - make CONFIG_FIRMWARE_EDID available on all arches fence: - fix UAF issues format-helper: - improve tests gpusvm: - introduce devmem only flag for allocation - add timeslicing support to GPU SVM ttm: - improve eviction sched: - tracing improvements - kunit improvements - memory leak fixes - reset handling improvements color mgmt: - add hardware gamma LUT handling helpers bridge: - add destroy hook - switch to reference counted drm_bridge allocations - tc358767: convert to devm_drm_bridge_alloc - improve CEC handling panel: - switch to reference counter drm_panel allocations - fwnode panel lookup - Huiling hl055fhv028c support - Raspberry Pi 7" 720x1280 support - edp: KDC KD116N3730A05, N160JCE-ELL CMN, N116BCJ-EAK - simple: AUO P238HAN01 - st7701: Winstar wf40eswaa6mnn0 - visionox: rm69299-shift - Renesas R61307, Renesas R69328 support - DJN HX83112B hdmi: - add CEC handling - YUV420 output support xe: - WildCat Lake support - Enable PanthorLake by default - mark BMG as SRIOV capable - update firmware recommendations - Expose media OA units - aux-bux support for non-volatile memory - MTD intel-dg driver for non-volatile memory - Expose fan control and voltage regulator in sysfs - restructure migration for multi-device - Restore GuC submit UAF fix - make GEM shrinker drm managed - SRIOV VF Post-migration recovery of GGTT nodes - W/A additions/reworks - Prefetch support for svm ranges - Don't allocate managed BO for each policy change - HWMON fixes for BMG - Create LRC BO without VM - PCI ID updates - make SLPC debugfs files optional - rework eviction rejection of bound external BOs - consolidate PAT programming logic for pre/post Xe2 - init changes for flicker-free boot - Enable GuC Dynamic Inhibit Context switch i915: - drm_panic support for i915/xe - initial flip queue off by default for LNL/PNL - Wildcat Lake Display support - Support for DSC fractional link bpp - Support for simultaneous Panel Replay and Adaptive sync - Support for PTL+ double buffer LUT - initial PIPEDMC event handling - drm_panel_follower support - DPLL interface renames - allocate struct intel_display dynamically - flip queue preperation - abstract DRAM detection better - avoid GuC scheduling stalls - remove DG1 force probe requirement - fix MEI interrupt handler on RT kernels - use backlight control helpers for eDP - more shared display code refactoring amdgpu: - add userq slot to INFO ioctl - SR-IOV hibernation support - Suspend improvements - Backlight improvements - Use scaling for non-native eDP modes - cleaner shader updates for GC 9.x - Remove fence slab - SDMA fw checks for userq support - RAS updates - DMCUB updates - DP tunneling fixes - Display idle D3 support - Per queue reset improvements - initial smartmux support amdkfd: - enable KFD on loongarch - mtype fix for ext coherent system memory radeon: - CS validation additional GL extensions - drop console lock during suspend/resume - bump driver version msm: - VM BIND support - CI: infrastructure updates - UBWC single source of truth - decouple GPU and KMS support - DP: rework I/O accessors - DPU: SM8750 support - DSI: SM8750 support - GPU: X1-45 support and speedbin support for X1-85 - MDSS: SM8750 support nova: - register! macro improvements - DMA object abstraction - VBIOS parser + fwsec lookup - sysmem flush page support - falcon: generic falcon boot code and HAL - FWSEC-FRTS: fb setup and load/execute ivpu: - Add Wildcat Lake support - Add turbo flag ast: - improve hardware generations implementation imx: - IMX8qxq Display Controller support lima: - Rockchip RK3528 GPU support nouveau: - fence handling cleanup panfrost: - MT8370 support - bo labeling - 64-bit register access qaic: - add RAS support rockchip: - convert inno_hdmi to a bridge rz-du: - add RZ/V2H(P) support - MIPI-DSI DCS support sitronix: - ST7567 support sun4i: - add H616 support tidss: - add TI AM62L support - AM65x OLDI bridge support bochs: - drm panic support vkms: - YUV and R* format support - use faux device vmwgfx: - fence improvements hyperv: - move out of simple - add drm_panic support" * tag 'drm-next-2025-07-30' of https://gitlab.freedesktop.org/drm/kernel: (1479 commits) drm/tidss: oldi: convert to devm_drm_bridge_alloc() API drm/tidss: encoder: convert to devm_drm_bridge_alloc() drm/amdgpu: move reset support type checks into the caller drm/amdgpu/sdma7: re-emit unprocessed state on ring reset drm/amdgpu/sdma6: re-emit unprocessed state on ring reset drm/amdgpu/sdma5.2: re-emit unprocessed state on ring reset drm/amdgpu/sdma5: re-emit unprocessed state on ring reset drm/amdgpu/gfx12: re-emit unprocessed state on ring reset drm/amdgpu/gfx11: re-emit unprocessed state on ring reset drm/amdgpu/gfx10: re-emit unprocessed state on ring reset drm/amdgpu/gfx9.4.3: re-emit unprocessed state on kcq reset drm/amdgpu/gfx9: re-emit unprocessed state on kcq reset drm/amdgpu: Add WARN_ON to the resource clear function drm/amd/pm: Use cached metrics data on SMUv13.0.6 drm/amd/pm: Use cached data for min/max clocks gpu: nova-core: fix bounds check in PmuLookupTableEntry::new drm/amdgpu: Replace HQD terminology with slots naming drm/amdgpu: Add user queue instance count in HW IP info drm/amd/amdgpu: Add helper functions for isp buffers drm/amd/amdgpu: Initialize swnode for ISP MFD device ...
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_job.c71
1 files changed, 26 insertions, 45 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index ddb9d3269357..e6061d45f142 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -89,10 +89,10 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
- struct amdgpu_task_info *ti;
+ struct drm_wedge_task_info *info = NULL;
+ struct amdgpu_task_info *ti = NULL;
struct amdgpu_device *adev = ring->adev;
- int idx;
- int r;
+ int idx, r;
if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
dev_info(adev->dev, "%s - device unplugged skipping recovery on scheduler:%s",
@@ -112,6 +112,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
amdgpu_job_core_dump(adev, job);
if (amdgpu_gpu_recovery &&
+ amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_SOFT_RESET) &&
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
s_job->sched->name);
@@ -124,53 +125,30 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
if (ti) {
- dev_err(adev->dev,
- "Process information: process %s pid %d thread %s pid %d\n",
- ti->process_name, ti->tgid, ti->task_name, ti->pid);
- amdgpu_vm_put_task_info(ti);
+ amdgpu_vm_print_task_info(adev, ti);
+ info = &ti->task;
}
/* attempt a per ring reset */
if (unlikely(adev->debug_disable_gpu_ring_reset)) {
dev_err(adev->dev, "Ring reset disabled by debug mask\n");
- } else if (amdgpu_gpu_recovery && ring->funcs->reset) {
- bool is_guilty;
-
- dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name);
- /* stop the scheduler, but don't mess with the
- * bad job yet because if ring reset fails
- * we'll fall back to full GPU reset.
- */
- drm_sched_wqueue_stop(&ring->sched);
-
- /* for engine resets, we need to reset the engine,
- * but individual queues may be unaffected.
- * check here to make sure the accounting is correct.
- */
- if (ring->funcs->is_guilty)
- is_guilty = ring->funcs->is_guilty(ring);
- else
- is_guilty = true;
-
- if (is_guilty)
- dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
-
- r = amdgpu_ring_reset(ring, job->vmid);
+ } else if (amdgpu_gpu_recovery &&
+ amdgpu_ring_is_reset_type_supported(ring, AMDGPU_RESET_TYPE_PER_QUEUE) &&
+ ring->funcs->reset) {
+ dev_err(adev->dev, "Starting %s ring reset\n",
+ s_job->sched->name);
+ r = amdgpu_ring_reset(ring, job->vmid, &job->hw_fence);
if (!r) {
- if (amdgpu_ring_sched_ready(ring))
- drm_sched_stop(&ring->sched, s_job);
- if (is_guilty) {
- atomic_inc(&ring->adev->gpu_reset_counter);
- amdgpu_fence_driver_force_completion(ring);
- }
- if (amdgpu_ring_sched_ready(ring))
- drm_sched_start(&ring->sched, 0);
- dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
- drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+ atomic_inc(&ring->adev->gpu_reset_counter);
+ dev_err(adev->dev, "Ring %s reset succeeded\n",
+ ring->sched.name);
+ drm_dev_wedged_event(adev_to_drm(adev),
+ DRM_WEDGE_RECOVERY_NONE, info);
goto exit;
}
- dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
+ dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
}
+
dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
if (amdgpu_device_should_recover_gpu(ring->adev)) {
@@ -198,13 +176,15 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
}
exit:
+ amdgpu_vm_put_task_info(ti);
drm_dev_exit(idx);
- return DRM_GPU_SCHED_STAT_NOMINAL;
+ return DRM_GPU_SCHED_STAT_RESET;
}
int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
struct drm_sched_entity *entity, void *owner,
- unsigned int num_ibs, struct amdgpu_job **job)
+ unsigned int num_ibs, struct amdgpu_job **job,
+ u64 drm_client_id)
{
if (num_ibs == 0)
return -EINVAL;
@@ -222,7 +202,8 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
if (!entity)
return 0;
- return drm_sched_job_init(&(*job)->base, entity, 1, owner);
+ return drm_sched_job_init(&(*job)->base, entity, 1, owner,
+ drm_client_id);
}
int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
@@ -232,7 +213,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
{
int r;
- r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job);
+ r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job, 0);
if (r)
return r;