/*
 * Copyright 2015 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/sched.h>

#include <drm/drm_drv.h>

#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"
#include "amdgpu_dev_coredump.h"
#include "amdgpu_xgmi.h"

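/*
 * Dump the IP state of every IP block that implements dump_ip_state and then
 * generate a devcoredump for @adev on behalf of the timed-out @job.
 */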
static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
				    struct amdgpu_job *job)
{
	int i;

	dev_info(adev->dev, "Dumping IP State\n");
	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->funcs->dump_ip_state)
			adev->ip_blocks[i].version->funcs
				->dump_ip_state((void *)adev);
	dev_info(adev->dev, "Dumping IP State Completed\n");

	amdgpu_coredump(adev, true, false, job);
}

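/*
 * Build the list of devices to dump (the whole XGMI hive when one is present,
 * otherwise just @adev) and run amdgpu_job_do_core_dump() on each of them,
 * holding the hive lock while the dumps are taken.
 */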
static void amdgpu_job_core_dump(struct amdgpu_device *adev,
				 struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle = NULL;
	struct amdgpu_device *tmp_adev = NULL;
	struct amdgpu_hive_info *hive = NULL;

	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);
	/*
	 * Reuse the logic in amdgpu_device_gpu_recover() to build the list of
	 * devices for the core dump.
	 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list, &device_list);
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* Do the coredump for each device */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
		amdgpu_job_do_core_dump(tmp_adev, job);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}
}

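/*
 * Scheduler timeout handler: called by the DRM GPU scheduler when a job
 * exceeds its timeout. Takes a core dump (except under SR-IOV), then tries
 * progressively heavier recovery: soft ring recovery, a per-ring reset and
 * finally a full GPU reset via amdgpu_device_gpu_recover().
 */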
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
	struct amdgpu_job *job = to_amdgpu_job(s_job);
	struct amdgpu_task_info *ti;
	struct amdgpu_device *adev = ring->adev;
	int idx;
	int r;

	if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
		dev_info(adev->dev, "%s - device unplugged skipping recovery on scheduler:%s",
			 __func__, s_job->sched->name);

		/* Effectively the job is aborted as the device is gone */
		return DRM_GPU_SCHED_STAT_ENODEV;
	}

	adev->job_hang = true;

	/*
	 * Do the coredump immediately after a job timeout to get a very
	 * close dump/snapshot/representation of the GPU's current error status.
	 * Skip it for SRIOV, since VF FLR will be triggered by the host driver
	 * before the job timeout.
	 */
	if (!amdgpu_sriov_vf(adev))
		amdgpu_job_core_dump(adev, job);

	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
			s_job->sched->name);
		goto exit;
	}

	dev_err(adev->dev, "ring %s timeout, signaled seq=%u, emitted seq=%u\n",
		job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
		ring->fence_drv.sync_seq);

	ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
	if (ti) {
		dev_err(adev->dev,
			"Process information: process %s pid %d thread %s pid %d\n",
			ti->process_name, ti->tgid, ti->task_name, ti->pid);
		amdgpu_vm_put_task_info(ti);
	}

	dma_fence_set_error(&s_job->s_fence->finished, -ETIME);

	/* Attempt a per-ring reset first */
	if (amdgpu_gpu_recovery &&
	    ring->funcs->reset) {
		/*
		 * Stop the scheduler, but don't mess with the bad job yet
		 * because if the ring reset fails we'll fall back to a full
		 * GPU reset.
		 */
		drm_sched_wqueue_stop(&ring->sched);
		r = amdgpu_ring_reset(ring, job->vmid);
		if (!r) {
			if (amdgpu_ring_sched_ready(ring))
				drm_sched_stop(&ring->sched, s_job);
			atomic_inc(&ring->adev->gpu_reset_counter);
			amdgpu_fence_driver_force_completion(ring);
			if (amdgpu_ring_sched_ready(ring))
				drm_sched_start(&ring->sched);
			goto exit;
		}
	}

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		reset_context.src = AMDGPU_RESET_SRC_JOB;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

		/*
		 * Skip the coredump during recovery: we already took one right
		 * after the timeout, so a second dump would be redundant.
		 */
		set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
			dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);
	} else {
		drm_sched_suspend_timeout(&ring->sched);
		if (amdgpu_sriov_vf(adev))
			adev->virt.tdr_debug = true;
	}

exit:
	adev->job_hang = false;
	drm_dev_exit(idx);
	return DRM_GPU_SCHED_STAT_NOMINAL;
}

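/*
 * Allocate an amdgpu_job with room for @num_ibs IBs and, when an entity is
 * given, initialize the embedded drm_sched_job for it.
 */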
int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
		     struct drm_sched_entity *entity, void *owner,
		     unsigned int num_ibs, struct amdgpu_job **job)
{
	if (num_ibs == 0)
		return -EINVAL;

	*job = kzalloc(struct_size(*job, ibs, num_ibs), GFP_KERNEL);
	if (!*job)
		return -ENOMEM;

	/*
	 * Initialize the scheduler to at least some ring so that we always
	 * have a pointer to adev.
	 */
	(*job)->base.sched = &adev->rings[0]->sched;
	(*job)->vm = vm;

	amdgpu_sync_create(&(*job)->explicit_sync);
	(*job)->generation = amdgpu_vm_generation(adev, vm);
	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;

	if (!entity)
		return 0;

	return drm_sched_job_init(&(*job)->base, entity, 1, owner);
}

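/*
 * Convenience wrapper around amdgpu_job_alloc() that also allocates a single
 * IB of @size bytes from the given IB pool.
 */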
int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
			     struct drm_sched_entity *entity, void *owner,
			     size_t size, enum amdgpu_ib_pool_type pool_type,
			     struct amdgpu_job **job)
{
	int r;

	r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job);
	if (r)
		return r;

	(*job)->num_ibs = 1;
	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
	if (r) {
		if (entity)
			drm_sched_job_cleanup(&(*job)->base);
		kfree(*job);
	}

	return r;
}

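/* Record the GDS, GWS and OA allocations the job will use, in page units. */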
void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds,
			      struct amdgpu_bo *gws, struct amdgpu_bo *oa)
{
	if (gds) {
		job->gds_base = amdgpu_bo_gpu_offset(gds) >> PAGE_SHIFT;
		job->gds_size = amdgpu_bo_size(gds) >> PAGE_SHIFT;
	}
	if (gws) {
		job->gws_base = amdgpu_bo_gpu_offset(gws) >> PAGE_SHIFT;
		job->gws_size = amdgpu_bo_size(gws) >> PAGE_SHIFT;
	}
	if (oa) {
		job->oa_base = amdgpu_bo_gpu_offset(oa) >> PAGE_SHIFT;
		job->oa_size = amdgpu_bo_size(oa) >> PAGE_SHIFT;
	}
}

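/*
 * Release the IBs of a job, passing the finished fence (or the embedded
 * hardware fence) along so the IB memory is not reused before the job has
 * completed.
 */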
void amdgpu_job_free_resources(struct amdgpu_job *job)
{
	struct dma_fence *f;
	unsigned i;

	/* Check if any fences were initialized */
	if (job->base.s_fence && job->base.s_fence->finished.ops)
		f = &job->base.s_fence->finished;
	else if (job->hw_fence.base.ops)
		f = &job->hw_fence.base;
	else
		f = NULL;

	for (i = 0; i < job->num_ibs; ++i)
		amdgpu_ib_free(NULL, &job->ibs[i], f);
}

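/* drm_sched free_job callback: drop the scheduler bookkeeping and the job. */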
static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
{
	struct amdgpu_job *job = to_amdgpu_job(s_job);

	drm_sched_job_cleanup(s_job);

	amdgpu_sync_free(&job->explicit_sync);

	/* Only put the hw fence if it was initialized as an embedded fence */
	if (!job->hw_fence.base.ops)
		kfree(job);
	else
		dma_fence_put(&job->hw_fence.base);
}

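/*
 * Make @job part of the gang led by @leader by storing the leader's scheduled
 * fence as the job's gang_submit fence.
 */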
void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
				struct amdgpu_job *leader)
{
	struct dma_fence *fence = &leader->base.s_fence->scheduled;

	WARN_ON(job->gang_submit);

	/*
	 * Don't add a reference when we are the gang leader to avoid a
	 * circular dependency.
	 */
	if (job != leader)
		dma_fence_get(fence);
	job->gang_submit = fence;
}

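/*
 * Release a job without going through the scheduler's free_job callback:
 * clean up the drm_sched_job if it was initialized, the sync object, the gang
 * fence and the job itself.
 */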
void amdgpu_job_free(struct amdgpu_job *job)
{
	if (job->base.entity)
		drm_sched_job_cleanup(&job->base);

	amdgpu_job_free_resources(job);
	amdgpu_sync_free(&job->explicit_sync);
	if (job->gang_submit != &job->base.s_fence->scheduled)
		dma_fence_put(job->gang_submit);

	if (!job->hw_fence.base.ops)
		kfree(job);
	else
		dma_fence_put(&job->hw_fence.base);
}

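/*
 * Arm the job and push it to its scheduler entity; returns a reference to the
 * job's finished fence.
 */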
struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job)
{
	struct dma_fence *f;

	drm_sched_job_arm(&job->base);
	f = dma_fence_get(&job->base.s_fence->finished);
	amdgpu_job_free_resources(job);
	drm_sched_entity_push_job(&job->base);

	return f;
}

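/*
 * Bypass the scheduler and write the job's IBs to @ring immediately,
 * returning the hardware fence through @fence.
 */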
int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
			     struct dma_fence **fence)
{
	int r;

	job->base.sched = &ring->sched;
	r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, fence);

	if (r)
		return r;

	amdgpu_job_free(job);
	return 0;
}

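/*
 * drm_sched prepare_job callback: return a fence the scheduler must wait for
 * before running the job, e.g. the gang switch fence or the fence for
 * grabbing a VMID.
 */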
static struct dma_fence *
amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
		       struct drm_sched_entity *s_entity)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
	struct amdgpu_job *job = to_amdgpu_job(sched_job);
	struct dma_fence *fence = NULL;
	int r;

	r = drm_sched_entity_error(s_entity);
	if (r)
		goto error;

	if (!fence && job->gang_submit)
		fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);

	while (!fence && job->vm && !job->vmid) {
		r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
		if (r) {
			dev_err(ring->adev->dev, "Error getting VM ID (%d)\n", r);
			goto error;
		}
	}

	return fence;

error:
	dma_fence_set_error(&job->base.s_fence->finished, r);
	return NULL;
}

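/*
 * drm_sched run_job callback: write the job's IBs to the ring, unless the
 * finished fence already carries an error (e.g. VRAM lost or a resubmitted
 * gang), and return the hardware fence.
 */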
static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
{
	struct amdgpu_ring *ring = to_amdgpu_ring(sched_job->sched);
	struct amdgpu_device *adev = ring->adev;
	struct dma_fence *fence = NULL, *finished;
	struct amdgpu_job *job;
	int r = 0;

	job = to_amdgpu_job(sched_job);
	finished = &job->base.s_fence->finished;

	trace_amdgpu_sched_run_job(job);

	/* Skip job if VRAM is lost and never resubmit gangs */
	if (job->generation != amdgpu_vm_generation(adev, job->vm) ||
	    (job->job_run_counter && job->gang_submit))
		dma_fence_set_error(finished, -ECANCELED);

	if (finished->error < 0) {
		dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)",
			ring->name);
	} else {
		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
				       &fence);
		if (r)
			dev_err(adev->dev,
				"Error scheduling IBs (%d) in ring(%s)", r,
				ring->name);
	}

	job->job_run_counter++;
	amdgpu_job_free_resources(job);

	fence = r ? ERR_PTR(r) : fence;
	return fence;
}

#define to_drm_sched_job(sched_job)		\
		container_of((sched_job), struct drm_sched_job, queue_node)

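/*
 * Signal and fail (-EHWPOISON) every job on @sched: first the jobs still
 * queued on the entities, then the jobs already pushed to the hardware.
 */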
void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *s_job;
	struct drm_sched_entity *s_entity = NULL;
	int i;

	/* Signal all jobs not yet scheduled */
	for (i = DRM_SCHED_PRIORITY_KERNEL; i < sched->num_rqs; i++) {
		struct drm_sched_rq *rq = sched->sched_rq[i];
		spin_lock(&rq->lock);
		list_for_each_entry(s_entity, &rq->entities, list) {
			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
				struct drm_sched_fence *s_fence = s_job->s_fence;

				dma_fence_signal(&s_fence->scheduled);
				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
				dma_fence_signal(&s_fence->finished);
			}
		}
		spin_unlock(&rq->lock);
	}

	/* Signal all jobs already scheduled to HW */
	list_for_each_entry(s_job, &sched->pending_list, list) {
		struct drm_sched_fence *s_fence = s_job->s_fence;

		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
		dma_fence_signal(&s_fence->finished);
	}
}

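/* Scheduler callbacks amdgpu registers with the DRM GPU scheduler. */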
const struct drm_sched_backend_ops amdgpu_sched_ops = {
	.prepare_job = amdgpu_job_prepare_job,
	.run_job = amdgpu_job_run,
	.timedout_job = amdgpu_job_timedout,
	.free_job = amdgpu_job_free_cb
};