aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
diff options
context:
space:
mode:
authorKent Russell <[email protected]>2021-10-19 14:05:07 +0000
committerAlex Deucher <[email protected]>2021-10-28 18:26:12 +0000
commit68daadf3d673568bb7122b1683fd8b0e27c55d9b (patch)
tree2f561e2389f17379a788c4035df738417098e207 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
parentdrm/amdgpu: Warn when bad pages approaches 90% threshold (diff)
downloadkernel-68daadf3d673568bb7122b1683fd8b0e27c55d9b.tar.gz
kernel-68daadf3d673568bb7122b1683fd8b0e27c55d9b.zip
drm/amdgpu: Add kernel parameter support for ignoring bad page threshold
When a GPU hits the bad_page_threshold, it will not be initialized by the amdgpu driver. This means that the table cannot be cleared, nor can information gathering be performed (getting serial number, BDF, etc). If the bad_page_threshold kernel parameter is set to -2, continue to initialize the GPU, while printing a warning to dmesg that this action has been done v2: squash in Luben's fix to restore RAS info reporting Cc: Luben Tuikov <[email protected]> Cc: Mukul Joshi <[email protected]> Signed-off-by: Kent Russell <[email protected]> Acked-by: Felix Kuehling <[email protected]> Reviewed-by: Luben Tuikov <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c15
1 files changed, 11 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 3978152e5ccc..05117eda105b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1105,11 +1105,18 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
res = amdgpu_ras_eeprom_correct_header_tag(control,
RAS_TABLE_HDR_VAL);
} else {
- *exceed_err_limit = true;
- dev_err(adev->dev,
- "RAS records:%d exceed threshold:%d, "
- "GPU will not be initialized. Replace this GPU or increase the threshold",
+ dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
control->ras_num_recs, ras->bad_page_cnt_threshold);
+ if (amdgpu_bad_page_threshold == -2) {
+ dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -2.");
+ res = 0;
+ } else {
+ *exceed_err_limit = true;
+ dev_err(adev->dev,
+ "RAS records:%d exceed threshold:%d, "
+ "GPU will not be initialized. Replace this GPU or increase the threshold",
+ control->ras_num_recs, ras->bad_page_cnt_threshold);
+ }
}
} else {
DRM_INFO("Creating a new EEPROM table");