diff options
Diffstat (limited to 'drivers/gpu/drm/scheduler')
| -rw-r--r-- | drivers/gpu/drm/scheduler/sched_main.c | 85 | ||||
| -rw-r--r-- | drivers/gpu/drm/scheduler/tests/mock_scheduler.c | 75 | ||||
| -rw-r--r-- | drivers/gpu/drm/scheduler/tests/sched_tests.h | 2 | ||||
| -rw-r--r-- | drivers/gpu/drm/scheduler/tests/tests_basic.c | 93 |
4 files changed, 191 insertions, 64 deletions
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 81ad40d9582b..e2cda28a1af4 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -366,11 +366,16 @@ static void drm_sched_run_free_queue(struct drm_gpu_scheduler *sched) { struct drm_sched_job *job; - spin_lock(&sched->job_list_lock); job = list_first_entry_or_null(&sched->pending_list, struct drm_sched_job, list); if (job && dma_fence_is_signaled(&job->s_fence->finished)) __drm_sched_run_free_queue(sched); +} + +static void drm_sched_run_free_queue_unlocked(struct drm_gpu_scheduler *sched) +{ + spin_lock(&sched->job_list_lock); + drm_sched_run_free_queue(sched); spin_unlock(&sched->job_list_lock); } @@ -523,11 +528,37 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job) spin_unlock(&sched->job_list_lock); } +/** + * drm_sched_job_reinsert_on_false_timeout - reinsert the job on a false timeout + * @sched: scheduler instance + * @job: job to be reinserted on the pending list + * + * In the case of a "false timeout" - when a timeout occurs but the GPU isn't + * hung and is making progress, the scheduler must reinsert the job back into + * @sched->pending_list. Otherwise, the job and its resources won't be freed + * through the &struct drm_sched_backend_ops.free_job callback. + * + * This function must be used in "false timeout" cases only. + */ +static void drm_sched_job_reinsert_on_false_timeout(struct drm_gpu_scheduler *sched, + struct drm_sched_job *job) +{ + spin_lock(&sched->job_list_lock); + list_add(&job->list, &sched->pending_list); + + /* After reinserting the job, the scheduler enqueues the free-job work + * again if ready. Otherwise, a signaled job could be added to the + * pending list, but never freed. + */ + drm_sched_run_free_queue(sched); + spin_unlock(&sched->job_list_lock); +} + static void drm_sched_job_timedout(struct work_struct *work) { struct drm_gpu_scheduler *sched; struct drm_sched_job *job; - enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL; + enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_RESET; sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work); @@ -556,6 +587,9 @@ static void drm_sched_job_timedout(struct work_struct *work) job->sched->ops->free_job(job); sched->free_guilty = false; } + + if (status == DRM_GPU_SCHED_STAT_NO_HANG) + drm_sched_job_reinsert_on_false_timeout(sched, job); } else { spin_unlock(&sched->job_list_lock); } @@ -578,6 +612,10 @@ static void drm_sched_job_timedout(struct work_struct *work) * This function is typically used for reset recovery (see the docu of * drm_sched_backend_ops.timedout_job() for details). Do not call it for * scheduler teardown, i.e., before calling drm_sched_fini(). + * + * As it's only used for reset recovery, drivers must not call this function + * in their &struct drm_sched_backend_ops.timedout_job callback when they + * skip a reset using &enum drm_gpu_sched_stat.DRM_GPU_SCHED_STAT_NO_HANG. */ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) { @@ -663,6 +701,10 @@ EXPORT_SYMBOL(drm_sched_stop); * drm_sched_backend_ops.timedout_job() for details). Do not call it for * scheduler startup. The scheduler itself is fully operational after * drm_sched_init() succeeded. + * + * As it's only used for reset recovery, drivers must not call this function + * in their &struct drm_sched_backend_ops.timedout_job callback when they + * skip a reset using &enum drm_gpu_sched_stat.DRM_GPU_SCHED_STAT_NO_HANG. */ void drm_sched_start(struct drm_gpu_scheduler *sched, int errno) { @@ -1184,7 +1226,7 @@ static void drm_sched_free_job_work(struct work_struct *w) if (job) sched->ops->free_job(job); - drm_sched_run_free_queue(sched); + drm_sched_run_free_queue_unlocked(sched); drm_sched_run_job_queue(sched); } @@ -1352,6 +1394,18 @@ Out_check_own: } EXPORT_SYMBOL(drm_sched_init); +static void drm_sched_cancel_remaining_jobs(struct drm_gpu_scheduler *sched) +{ + struct drm_sched_job *job, *tmp; + + /* All other accessors are stopped. No locking necessary. */ + list_for_each_entry_safe_reverse(job, tmp, &sched->pending_list, list) { + sched->ops->cancel_job(job); + list_del(&job->list); + sched->ops->free_job(job); + } +} + /** * drm_sched_fini - Destroy a gpu scheduler * @@ -1359,19 +1413,11 @@ EXPORT_SYMBOL(drm_sched_init); * * Tears down and cleans up the scheduler. * - * This stops submission of new jobs to the hardware through - * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job() - * will not be called for all jobs still in drm_gpu_scheduler.pending_list. - * There is no solution for this currently. Thus, it is up to the driver to make - * sure that: - * - * a) drm_sched_fini() is only called after for all submitted jobs - * drm_sched_backend_ops.free_job() has been called or that - * b) the jobs for which drm_sched_backend_ops.free_job() has not been called - * after drm_sched_fini() ran are freed manually. - * - * FIXME: Take care of the above problem and prevent this function from leaking - * the jobs in drm_gpu_scheduler.pending_list under any circumstances. + * This stops submission of new jobs to the hardware through &struct + * drm_sched_backend_ops.run_job. If &struct drm_sched_backend_ops.cancel_job + * is implemented, all jobs will be canceled through it and afterwards cleaned + * up through &struct drm_sched_backend_ops.free_job. If cancel_job is not + * implemented, memory could leak. */ void drm_sched_fini(struct drm_gpu_scheduler *sched) { @@ -1401,11 +1447,18 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched) /* Confirm no work left behind accessing device structures */ cancel_delayed_work_sync(&sched->work_tdr); + /* Avoid memory leaks if supported by the driver. */ + if (sched->ops->cancel_job) + drm_sched_cancel_remaining_jobs(sched); + if (sched->own_submit_wq) destroy_workqueue(sched->submit_wq); sched->ready = false; kfree(sched->sched_rq); sched->sched_rq = NULL; + + if (!list_empty(&sched->pending_list)) + dev_warn(sched->dev, "Tearing down scheduler while jobs are pending!\n"); } EXPORT_SYMBOL(drm_sched_fini); diff --git a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c index 49d067fecd67..65acffc3fea8 100644 --- a/drivers/gpu/drm/scheduler/tests/mock_scheduler.c +++ b/drivers/gpu/drm/scheduler/tests/mock_scheduler.c @@ -63,7 +63,7 @@ static void drm_mock_sched_job_complete(struct drm_mock_sched_job *job) lockdep_assert_held(&sched->lock); job->flags |= DRM_MOCK_SCHED_JOB_DONE; - list_move_tail(&job->link, &sched->done_list); + list_del(&job->link); dma_fence_signal_locked(&job->hw_fence); complete(&job->done); } @@ -218,6 +218,11 @@ mock_sched_timedout_job(struct drm_sched_job *sched_job) struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job); unsigned long flags; + if (job->flags & DRM_MOCK_SCHED_JOB_DONT_RESET) { + job->flags &= ~DRM_MOCK_SCHED_JOB_DONT_RESET; + return DRM_GPU_SCHED_STAT_NO_HANG; + } + spin_lock_irqsave(&sched->lock, flags); if (!dma_fence_is_signaled_locked(&job->hw_fence)) { list_del(&job->link); @@ -231,31 +236,46 @@ mock_sched_timedout_job(struct drm_sched_job *sched_job) drm_sched_job_cleanup(sched_job); /* Mock job itself is freed by the kunit framework. */ - return DRM_GPU_SCHED_STAT_NOMINAL; + return DRM_GPU_SCHED_STAT_RESET; } static void mock_sched_free_job(struct drm_sched_job *sched_job) { - struct drm_mock_scheduler *sched = - drm_sched_to_mock_sched(sched_job->sched); struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job); - unsigned long flags; - /* Remove from the scheduler done list. */ - spin_lock_irqsave(&sched->lock, flags); - list_del(&job->link); - spin_unlock_irqrestore(&sched->lock, flags); dma_fence_put(&job->hw_fence); - drm_sched_job_cleanup(sched_job); /* Mock job itself is freed by the kunit framework. */ } +static void mock_sched_cancel_job(struct drm_sched_job *sched_job) +{ + struct drm_mock_scheduler *sched = drm_sched_to_mock_sched(sched_job->sched); + struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job); + unsigned long flags; + + hrtimer_cancel(&job->timer); + + spin_lock_irqsave(&sched->lock, flags); + if (!dma_fence_is_signaled_locked(&job->hw_fence)) { + list_del(&job->link); + dma_fence_set_error(&job->hw_fence, -ECANCELED); + dma_fence_signal_locked(&job->hw_fence); + } + spin_unlock_irqrestore(&sched->lock, flags); + + /* + * The GPU Scheduler will call drm_sched_backend_ops.free_job(), still. + * Mock job itself is freed by the kunit framework. + */ +} + static const struct drm_sched_backend_ops drm_mock_scheduler_ops = { .run_job = mock_sched_run_job, .timedout_job = mock_sched_timedout_job, - .free_job = mock_sched_free_job + .free_job = mock_sched_free_job, + .cancel_job = mock_sched_cancel_job, }; /** @@ -289,7 +309,6 @@ struct drm_mock_scheduler *drm_mock_sched_new(struct kunit *test, long timeout) sched->hw_timeline.context = dma_fence_context_alloc(1); atomic_set(&sched->hw_timeline.next_seqno, 0); INIT_LIST_HEAD(&sched->job_list); - INIT_LIST_HEAD(&sched->done_list); spin_lock_init(&sched->lock); return sched; @@ -304,38 +323,6 @@ struct drm_mock_scheduler *drm_mock_sched_new(struct kunit *test, long timeout) */ void drm_mock_sched_fini(struct drm_mock_scheduler *sched) { - struct drm_mock_sched_job *job, *next; - unsigned long flags; - LIST_HEAD(list); - - drm_sched_wqueue_stop(&sched->base); - - /* Force complete all unfinished jobs. */ - spin_lock_irqsave(&sched->lock, flags); - list_for_each_entry_safe(job, next, &sched->job_list, link) - list_move_tail(&job->link, &list); - spin_unlock_irqrestore(&sched->lock, flags); - - list_for_each_entry(job, &list, link) - hrtimer_cancel(&job->timer); - - spin_lock_irqsave(&sched->lock, flags); - list_for_each_entry_safe(job, next, &list, link) - drm_mock_sched_job_complete(job); - spin_unlock_irqrestore(&sched->lock, flags); - - /* - * Free completed jobs and jobs not yet processed by the DRM scheduler - * free worker. - */ - spin_lock_irqsave(&sched->lock, flags); - list_for_each_entry_safe(job, next, &sched->done_list, link) - list_move_tail(&job->link, &list); - spin_unlock_irqrestore(&sched->lock, flags); - - list_for_each_entry_safe(job, next, &list, link) - mock_sched_free_job(&job->base); - drm_sched_fini(&sched->base); } diff --git a/drivers/gpu/drm/scheduler/tests/sched_tests.h b/drivers/gpu/drm/scheduler/tests/sched_tests.h index fbba38137f0c..63d4f2ac7074 100644 --- a/drivers/gpu/drm/scheduler/tests/sched_tests.h +++ b/drivers/gpu/drm/scheduler/tests/sched_tests.h @@ -49,7 +49,6 @@ struct drm_mock_scheduler { spinlock_t lock; struct list_head job_list; - struct list_head done_list; struct { u64 context; @@ -98,6 +97,7 @@ struct drm_mock_sched_job { #define DRM_MOCK_SCHED_JOB_DONE 0x1 #define DRM_MOCK_SCHED_JOB_TIMEDOUT 0x2 +#define DRM_MOCK_SCHED_JOB_DONT_RESET 0x4 unsigned long flags; struct list_head link; diff --git a/drivers/gpu/drm/scheduler/tests/tests_basic.c b/drivers/gpu/drm/scheduler/tests/tests_basic.c index 7230057e0594..55eb142bd7c5 100644 --- a/drivers/gpu/drm/scheduler/tests/tests_basic.c +++ b/drivers/gpu/drm/scheduler/tests/tests_basic.c @@ -5,6 +5,8 @@ #include "sched_tests.h" +#define MOCK_TIMEOUT (HZ / 5) + /* * DRM scheduler basic tests should check the basic functional correctness of * the scheduler, including some very light smoke testing. More targeted tests, @@ -28,7 +30,7 @@ static void drm_sched_basic_exit(struct kunit *test) static int drm_sched_timeout_init(struct kunit *test) { - test->priv = drm_mock_sched_new(test, HZ); + test->priv = drm_mock_sched_new(test, MOCK_TIMEOUT); return 0; } @@ -204,6 +206,47 @@ static struct kunit_suite drm_sched_basic = { .test_cases = drm_sched_basic_tests, }; +static void drm_sched_basic_cancel(struct kunit *test) +{ + struct drm_mock_sched_entity *entity; + struct drm_mock_scheduler *sched; + struct drm_mock_sched_job *job; + bool done; + + /* + * Check that drm_sched_fini() uses the cancel_job() callback to cancel + * jobs that are still pending. + */ + + sched = drm_mock_sched_new(test, MAX_SCHEDULE_TIMEOUT); + entity = drm_mock_sched_entity_new(test, DRM_SCHED_PRIORITY_NORMAL, + sched); + + job = drm_mock_sched_job_new(test, entity); + + drm_mock_sched_job_submit(job); + + done = drm_mock_sched_job_wait_scheduled(job, HZ); + KUNIT_ASSERT_TRUE(test, done); + + drm_mock_sched_entity_free(entity); + drm_mock_sched_fini(sched); + + KUNIT_ASSERT_EQ(test, job->hw_fence.error, -ECANCELED); +} + +static struct kunit_case drm_sched_cancel_tests[] = { + KUNIT_CASE(drm_sched_basic_cancel), + {} +}; + +static struct kunit_suite drm_sched_cancel = { + .name = "drm_sched_basic_cancel_tests", + .init = drm_sched_basic_init, + .exit = drm_sched_basic_exit, + .test_cases = drm_sched_cancel_tests, +}; + static void drm_sched_basic_timeout(struct kunit *test) { struct drm_mock_scheduler *sched = test->priv; @@ -227,14 +270,14 @@ static void drm_sched_basic_timeout(struct kunit *test) done = drm_mock_sched_job_wait_scheduled(job, HZ); KUNIT_ASSERT_TRUE(test, done); - done = drm_mock_sched_job_wait_finished(job, HZ / 2); + done = drm_mock_sched_job_wait_finished(job, MOCK_TIMEOUT / 2); KUNIT_ASSERT_FALSE(test, done); KUNIT_ASSERT_EQ(test, job->flags & DRM_MOCK_SCHED_JOB_TIMEDOUT, 0); - done = drm_mock_sched_job_wait_finished(job, HZ); + done = drm_mock_sched_job_wait_finished(job, MOCK_TIMEOUT); KUNIT_ASSERT_FALSE(test, done); KUNIT_ASSERT_EQ(test, @@ -244,8 +287,51 @@ static void drm_sched_basic_timeout(struct kunit *test) drm_mock_sched_entity_free(entity); } +static void drm_sched_skip_reset(struct kunit *test) +{ + struct drm_mock_scheduler *sched = test->priv; + struct drm_mock_sched_entity *entity; + struct drm_mock_sched_job *job; + unsigned int i; + bool done; + + /* + * Submit a single job against a scheduler with the timeout configured + * and verify that if the job is still running, the timeout handler + * will skip the reset and allow the job to complete. + */ + + entity = drm_mock_sched_entity_new(test, + DRM_SCHED_PRIORITY_NORMAL, + sched); + job = drm_mock_sched_job_new(test, entity); + + job->flags = DRM_MOCK_SCHED_JOB_DONT_RESET; + + drm_mock_sched_job_submit(job); + + done = drm_mock_sched_job_wait_scheduled(job, HZ); + KUNIT_ASSERT_TRUE(test, done); + + done = drm_mock_sched_job_wait_finished(job, 2 * MOCK_TIMEOUT); + KUNIT_ASSERT_FALSE(test, done); + + KUNIT_ASSERT_EQ(test, + job->flags & DRM_MOCK_SCHED_JOB_DONT_RESET, + 0); + + i = drm_mock_sched_advance(sched, 1); + KUNIT_ASSERT_EQ(test, i, 1); + + done = drm_mock_sched_job_wait_finished(job, HZ); + KUNIT_ASSERT_TRUE(test, done); + + drm_mock_sched_entity_free(entity); +} + static struct kunit_case drm_sched_timeout_tests[] = { KUNIT_CASE(drm_sched_basic_timeout), + KUNIT_CASE(drm_sched_skip_reset), {} }; @@ -471,6 +557,7 @@ static struct kunit_suite drm_sched_credits = { kunit_test_suites(&drm_sched_basic, &drm_sched_timeout, + &drm_sched_cancel, &drm_sched_priority, &drm_sched_modify_sched, &drm_sched_credits); |
