From a43996573ad531ad1def11f0ecf5fdad361348a6 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Fri, 5 Feb 2021 18:48:01 -0500 Subject: drm/amdgpu: Rename misspelled function Instead of fixing the spelling in amdgpu_ras_eeprom_process_recods(), rename it to, amdgpu_ras_eeprom_xfer(), to look similar to other I2C and protocol transfer (read/write) functions. Also to keep the column span to within reason by using a shorter name. Change the "num" function parameter from "int" to "const u32" since it is the number of items (records) to xfer, i.e. their count, which cannot be a negative number. Also swap the order of parameters, keeping the pointer to records and their number next to each other, while the direction now becomes the last parameter. Cc: Jean Delvare Cc: Alexander Deucher Cc: Andrey Grodzovsky Cc: Lijo Lazar Cc: Stanley Yang Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Alexander Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c13b02caf8c3..ba7eea90b36f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1817,10 +1817,10 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) save_count = data->count - control->num_recs; /* only new entries are saved */ if (save_count > 0) { - if (amdgpu_ras_eeprom_process_recods(control, - &data->bps[control->num_recs], - true, - save_count)) { + if (amdgpu_ras_eeprom_xfer(control, + &data->bps[control->num_recs], + save_count, + true)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } @@ -1850,8 +1850,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) if (!bps) return -ENOMEM; - if (amdgpu_ras_eeprom_process_recods(control, bps, false, - control->num_recs)) { + if (amdgpu_ras_eeprom_xfer(control, bps, control->num_recs, false)) { dev_err(adev->dev, "Failed to load EEPROM table records!"); ret = -EIO; goto out; -- cgit From 1fab841ff63d2b94673a46682098d86d67b195e2 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Thu, 11 Mar 2021 11:20:15 -0500 Subject: drm/amdgpu: RAS xfer to read/write Wrap amdgpu_ras_eeprom_xfer(..., bool write), into amdgpu_ras_eeprom_read() and amdgpu_ras_eeprom_write(), as that makes reading and understanding the code clearer. Cc: Jean Delvare Cc: Alexander Deucher Cc: Andrey Grodzovsky Cc: Lijo Lazar Cc: Stanley Yang Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Alexander Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 24 +++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 8 +++++--- 3 files changed, 28 insertions(+), 13 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ba7eea90b36f..b0a0cae24a84 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1817,10 +1817,9 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) save_count = data->count - control->num_recs; /* only new entries are saved */ if (save_count > 0) { - if (amdgpu_ras_eeprom_xfer(control, - &data->bps[control->num_recs], - save_count, - true)) { + if (amdgpu_ras_eeprom_write(control, + &data->bps[control->num_recs], + save_count)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } @@ -1850,7 +1849,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) if (!bps) return -ENOMEM; - if (amdgpu_ras_eeprom_xfer(control, bps, control->num_recs, false)) { + if (amdgpu_ras_eeprom_read(control, bps, control->num_recs)) { dev_err(adev->dev, "Failed to load EEPROM table records!"); ret = -EIO; goto out; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 9e3fbc44b4bc..550a31953d2d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -432,9 +432,9 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) return false; } -int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *records, - const u32 num, bool write) +static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *records, + const u32 num, bool write) { int i, ret = 0; unsigned char *buffs, *buff; @@ -554,6 +554,20 @@ free_buff: return ret == num ? 0 : -EIO; } +int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *records, + const u32 num) +{ + return amdgpu_ras_eeprom_xfer(control, records, num, false); +} + +int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *records, + const u32 num) +{ + return amdgpu_ras_eeprom_xfer(control, records, num, true); +} + inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) { return RAS_MAX_RECORD_NUM; @@ -574,13 +588,13 @@ void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control) recs[i].retired_page = i; } - if (!amdgpu_ras_eeprom_xfer(control, recs, 1, true)) { + if (!amdgpu_ras_eeprom_write(control, recs, 1)) { memset(recs, 0, sizeof(*recs) * 1); control->next_addr = RAS_RECORD_START; - if (!amdgpu_ras_eeprom_xfer(control, recs, 1, false)) { + if (!amdgpu_ras_eeprom_read(control, recs)) { for (i = 0; i < 1; i++) DRM_INFO("rec.address :0x%llx, rec.retired_page :%llu", recs[i].address, recs[i].retired_page); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 6a1bd527bce5..fa9c509a8e2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -82,9 +82,11 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev); -int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *records, - const u32 num, bool write); +int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *records, const u32 num); + +int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *records, const u32 num); inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void); -- cgit From cf696091d38b61ff9e4e4d592522a2daf5e3637e Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Thu, 11 Mar 2021 10:34:28 -0500 Subject: drm/amdgpu: Return result fix in RAS The low level EEPROM write method, doesn't return 1, but the number of bytes written. Thus do not compare to 1, instead, compare to greater than 0 for success. Other cleanup: if the lower layers returned -errno, then return that, as opposed to overwriting the error code with one-fits-all -EINVAL. For instance, some return -EAGAIN. Cc: Jean Delvare Cc: Alexander Deucher Cc: Andrey Grodzovsky Cc: Lijo Lazar Cc: Stanley Yang Cc: Hawking Zhang Signed-off-by: Luben Tuikov Reviewed-by: Alexander Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c | 3 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +++++++++++++--------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 2 +- drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c | 3 +-- 4 files changed, 16 insertions(+), 14 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c index a5a87affedab..a4815af111ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c @@ -105,8 +105,7 @@ static int __amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, int r; u16 len; - r = 0; - for ( ; buf_size > 0; + for (r = 0; buf_size > 0; buf_size -= len, eeprom_addr += len, eeprom_buf += len) { /* Set the EEPROM address we want to write to/read from. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b0a0cae24a84..6ef3cfea6e6e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -355,8 +355,9 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * to see which blocks support RAS on a particular asic. * */ -static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf, - size_t size, loff_t *pos) +static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, + const char __user *buf, + size_t size, loff_t *pos) { struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; struct ras_debug_if data; @@ -370,7 +371,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data); if (ret) - return -EINVAL; + return ret; if (data.op == 3) { ret = amdgpu_reserve_page_direct(adev, data.inject.address); @@ -439,21 +440,24 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * * will reset EEPROM table to 0 entries. * */ -static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf, - size_t size, loff_t *pos) +static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, + const char __user *buf, + size_t size, loff_t *pos) { struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; int ret; ret = amdgpu_ras_eeprom_reset_table( - &(amdgpu_ras_get_context(adev)->eeprom_control)); + &(amdgpu_ras_get_context(adev)->eeprom_control)); - if (ret == 1) { + if (ret > 0) { + /* Something was written to EEPROM. + */ amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; return size; } else { - return -EIO; + return ret; } } @@ -1994,7 +1998,7 @@ free: kfree(*data); con->eh_data = NULL; out: - dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); + dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); /* * Except error threshold exceeding case, other failure cases in this diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 17cea35275e4..dc48c5563980 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -335,7 +335,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, ret = amdgpu_ras_eeprom_reset_table(control); } - return ret == 1 ? 0 : -EIO; + return ret > 0 ? 0 : -EIO; } static void __encode_table_record_to_buff(struct amdgpu_ras_eeprom_control *control, diff --git a/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c b/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c index 650352567566..7f48ee020bc0 100644 --- a/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c +++ b/drivers/gpu/drm/amd/amdgpu/smu_v11_0_i2c.c @@ -222,7 +222,7 @@ static uint32_t smu_v11_0_i2c_transmit(struct i2c_adapter *control, u32 numbytes, u32 i2c_flag) { struct amdgpu_device *adev = to_amdgpu_device(control); - uint32_t bytes_sent, reg, ret = 0; + u32 bytes_sent, reg, ret = I2C_OK; unsigned long timeout_counter; bytes_sent = 0; @@ -290,7 +290,6 @@ static uint32_t smu_v11_0_i2c_transmit(struct i2c_adapter *control, } ret = smu_v11_0_i2c_poll_tx_status(control); - Err: /* Any error, no point in proceeding */ if (ret != I2C_OK) { -- cgit From e4e6a58935eed66c4ea39c95bcb954c7db68d272 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Sat, 27 Mar 2021 03:57:49 -0400 Subject: drm/amdgpu: Use explicit cardinality for clarity RAS_MAX_RECORD_NUM may mean the maximum record number, as in the maximum house number on your street, or it may mean the maximum number of records, as in the count of records, which is also a number. To make this distinction whether the number is ordinal (index) or cardinal (count), rename this macro to RAS_MAX_RECORD_COUNT. This makes it easy to understand what it refers to, especially when we compute quantities such as, how many records do we have left in the table, especially when there are so many other numbers, quantities and numerical macros around. Also rename the long, amdgpu_ras_eeprom_get_record_max_length() to the more succinct and clear, amdgpu_ras_eeprom_max_record_count(). When computing the threshold, which also deals with counts, i.e. "how many", use cardinal "max_eeprom_records_count", than the quantitative "max_eeprom_records_len". Simplify the logic here and there, as well. Cc: Guchun Chen Cc: John Clements Cc: Hawking Zhang Cc: Alexander Deucher Signed-off-by: Luben Tuikov Acked-by: Alexander Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 50 +++++++++++--------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 8 ++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 +- 4 files changed, 30 insertions(+), 39 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 71beb0db0125..b514b7bbc0f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -870,11 +870,10 @@ MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legac module_param_named(reset_method, amdgpu_reset_method, int, 0444); /** - * DOC: bad_page_threshold (int) - * Bad page threshold is to specify the threshold value of faulty pages - * detected by RAS ECC, that may result in GPU entering bad status if total - * faulty pages by ECC exceed threshold value and leave it for user's further - * check. + * DOC: bad_page_threshold (int) Bad page threshold is specifies the + * threshold value of faulty pages detected by RAS ECC, which may + * result in the GPU entering bad status when the number of total + * faulty pages by ECC exceeds the threshold value. */ MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)"); module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6ef3cfea6e6e..79afff19ff24 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -71,8 +71,8 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) -/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ -#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL) +/* typical ECC bad page rate is 1 bad page per 100MB VRAM */ +#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, @@ -1841,27 +1841,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) { struct amdgpu_ras_eeprom_control *control = - &adev->psp.ras.ras->eeprom_control; - struct eeprom_table_record *bps = NULL; - int ret = 0; + &adev->psp.ras.ras->eeprom_control; + struct eeprom_table_record *bps; + int ret; /* no bad page record, skip eeprom access */ - if (!control->num_recs || (amdgpu_bad_page_threshold == 0)) - return ret; + if (control->num_recs == 0 || amdgpu_bad_page_threshold == 0) + return 0; bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); if (!bps) return -ENOMEM; - if (amdgpu_ras_eeprom_read(control, bps, control->num_recs)) { + ret = amdgpu_ras_eeprom_read(control, bps, control->num_recs); + if (ret) dev_err(adev->dev, "Failed to load EEPROM table records!"); - ret = -EIO; - goto out; - } - - ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); + else + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); -out: kfree(bps); return ret; } @@ -1901,11 +1898,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, } static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, - uint32_t max_length) + uint32_t max_count) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - int tmp_threshold = amdgpu_bad_page_threshold; - u64 val; /* * Justification of value bad_page_cnt_threshold in ras structure @@ -1926,18 +1921,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, * take no effect. */ - if (tmp_threshold < -1) - tmp_threshold = -1; - else if (tmp_threshold > max_length) - tmp_threshold = max_length; + if (amdgpu_bad_page_threshold < 0) { + u64 val = adev->gmc.mc_vram_size; - if (tmp_threshold == -1) { - val = adev->gmc.mc_vram_size; - do_div(val, RAS_BAD_PAGE_RATE); + do_div(val, RAS_BAD_PAGE_COVER); con->bad_page_cnt_threshold = min(lower_32_bits(val), - max_length); + max_count); } else { - con->bad_page_cnt_threshold = tmp_threshold; + con->bad_page_cnt_threshold = min_t(int, max_count, + amdgpu_bad_page_threshold); } } @@ -1945,7 +1937,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; - uint32_t max_eeprom_records_len = 0; + u32 max_eeprom_records_count = 0; bool exc_err_limit = false; int ret; @@ -1965,8 +1957,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->adev = adev; - max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); - amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); + amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); /* Todo: During test the SMU might fail to read the eeprom through I2C * when the GPU is pending on XGMI reset during probe time diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 54ef31594acc..21e1e59e4857 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -54,7 +54,7 @@ #define RAS_TBL_SIZE_BYTES (256 * 1024) #define RAS_HDR_START 0 #define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE) -#define RAS_MAX_RECORD_NUM ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ +#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ / RAS_TABLE_RECORD_SIZE) #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev @@ -532,7 +532,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, * TODO - Check the assumption is correct */ control->num_recs += num; - control->num_recs %= RAS_MAX_RECORD_NUM; + control->num_recs %= RAS_MAX_RECORD_COUNT; control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num; if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES) control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + @@ -568,9 +568,9 @@ int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, return amdgpu_ras_eeprom_xfer(control, records, num, true); } -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) +inline uint32_t amdgpu_ras_eeprom_max_record_count(void) { - return RAS_MAX_RECORD_NUM; + return RAS_MAX_RECORD_COUNT; } /* Used for testing if bugs encountered */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 4906ed9fb8cd..504729b80537 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -88,7 +88,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *records, const u32 num); -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void); +inline uint32_t amdgpu_ras_eeprom_max_record_count(void); void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control); -- cgit From 0686627b3fb2718bb0a6a0e1cd2d0e3dcbe97623 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Wed, 16 Jun 2021 13:09:37 -0400 Subject: drm/amdgpu: Some renames Qualify with "ras_". Use kernel's own--don't redefine your own. Cc: Alexander Deucher Cc: Andrey Grodzovsky Signed-off-by: Luben Tuikov Reviewed-by: Alexander Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 +++++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 30 +++++++++++++------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 25 +++++++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 2 +- 4 files changed, 43 insertions(+), 30 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 79afff19ff24..9371be383525 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1818,11 +1818,11 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) control = &con->eeprom_control; data = con->eh_data; - save_count = data->count - control->num_recs; + save_count = data->count - control->ras_num_recs; /* only new entries are saved */ if (save_count > 0) { if (amdgpu_ras_eeprom_write(control, - &data->bps[control->num_recs], + &data->bps[control->ras_num_recs], save_count)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; @@ -1846,18 +1846,18 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) int ret; /* no bad page record, skip eeprom access */ - if (control->num_recs == 0 || amdgpu_bad_page_threshold == 0) + if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) return 0; - bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); + bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL); if (!bps) return -ENOMEM; - ret = amdgpu_ras_eeprom_read(control, bps, control->num_recs); + ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); if (ret) dev_err(adev->dev, "Failed to load EEPROM table records!"); else - ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); + ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs); kfree(bps); return ret; @@ -1974,13 +1974,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) if (exc_err_limit || ret) goto free; - if (con->eeprom_control.num_recs) { + if (con->eeprom_control.ras_num_recs) { ret = amdgpu_ras_load_bad_pages(adev); if (ret) goto free; if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num) - adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.num_recs); + adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 8b60fba9f835..34da00ef8369 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -246,10 +246,10 @@ static int amdgpu_ras_eeprom_correct_header_tag( memset(buf, 0, RAS_TABLE_HEADER_SIZE); - mutex_lock(&control->tbl_mutex); + mutex_lock(&control->ras_tbl_mutex); hdr->header = header; ret = __write_table_header(control, buf); - mutex_unlock(&control->tbl_mutex); + mutex_unlock(&control->ras_tbl_mutex); return ret; } @@ -260,7 +260,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; int ret = 0; - mutex_lock(&control->tbl_mutex); + mutex_lock(&control->ras_tbl_mutex); hdr->header = RAS_TABLE_HDR_VAL; hdr->version = RAS_TABLE_VER; @@ -271,7 +271,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) control->next_addr = RAS_RECORD_START; ret = __write_table_header(control, buf); - mutex_unlock(&control->tbl_mutex); + mutex_unlock(&control->ras_tbl_mutex); return ret; @@ -298,7 +298,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, if (!__get_eeprom_i2c_addr(adev, control)) return -EINVAL; - mutex_init(&control->tbl_mutex); + mutex_init(&control->ras_tbl_mutex); /* Read/Create table header from EEPROM address 0 */ ret = amdgpu_eeprom_read(&adev->pm.smu_i2c, @@ -312,17 +312,17 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, __decode_table_header_from_buf(hdr, buf); if (hdr->header == RAS_TABLE_HDR_VAL) { - control->num_recs = (hdr->tbl_size - RAS_TABLE_HEADER_SIZE) / + control->ras_num_recs = (hdr->tbl_size - RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE; control->tbl_byte_sum = __calc_hdr_byte_sum(control); control->next_addr = RAS_RECORD_START; DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", - control->num_recs); + control->ras_num_recs); } else if ((hdr->header == RAS_TABLE_HDR_BAD) && (amdgpu_bad_page_threshold != 0)) { - if (ras->bad_page_cnt_threshold > control->num_recs) { + if (ras->bad_page_cnt_threshold > control->ras_num_recs) { dev_info(adev->dev, "Using one valid bigger bad page " "threshold and correcting eeprom header tag.\n"); ret = amdgpu_ras_eeprom_correct_header_tag(control, @@ -452,7 +452,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, if (!bufs) return -ENOMEM; - mutex_lock(&control->tbl_mutex); + mutex_lock(&control->ras_tbl_mutex); /* * If saved bad pages number exceeds the bad page threshold for @@ -466,10 +466,10 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, * further check. */ if (write && (amdgpu_bad_page_threshold != 0) && - ((control->num_recs + num) >= ras->bad_page_cnt_threshold)) { + ((control->ras_num_recs + num) >= ras->bad_page_cnt_threshold)) { dev_warn(adev->dev, "Saved bad pages(%d) reaches threshold value(%d).\n", - control->num_recs + num, ras->bad_page_cnt_threshold); + control->ras_num_recs + num, ras->bad_page_cnt_threshold); control->tbl_hdr.header = RAS_TABLE_HDR_BAD; } @@ -531,12 +531,12 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, * * TODO - Check the assumption is correct */ - control->num_recs += num; - control->num_recs %= RAS_MAX_RECORD_COUNT; + control->ras_num_recs += num; + control->ras_num_recs %= RAS_MAX_RECORD_COUNT; control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num; if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES) control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + - control->num_recs * RAS_TABLE_RECORD_SIZE; + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; __update_tbl_checksum(control, records, num); __write_table_header(control, bufs); @@ -549,7 +549,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, free_buf: kfree(bufs); - mutex_unlock(&control->tbl_mutex); + mutex_unlock(&control->ras_tbl_mutex); return ret == num ? 0 : -EIO; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 504729b80537..9e1e7656b7bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -40,14 +40,27 @@ struct amdgpu_ras_eeprom_table_header { uint32_t first_rec_offset; uint32_t tbl_size; uint32_t checksum; -}__attribute__((__packed__)); +} __packed; struct amdgpu_ras_eeprom_control { struct amdgpu_ras_eeprom_table_header tbl_hdr; - u32 i2c_address; /* Base I2C 19-bit memory address */ + + /* Base I2C EEPPROM 19-bit memory address, + * where the table is located. For more information, + * see top of amdgpu_eeprom.c. + */ + u32 i2c_address; + uint32_t next_addr; - unsigned int num_recs; - struct mutex tbl_mutex; + + /* Number of records in the table. + */ + unsigned int ras_num_recs; + + /* Protect table access via this mutex. + */ + struct mutex ras_tbl_mutex; + u8 tbl_byte_sum; }; @@ -74,10 +87,10 @@ struct eeprom_table_record { unsigned char mem_channel; unsigned char mcumc_id; -}__attribute__((__packed__)); +} __packed; int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit); + bool *exceed_err_limit); int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index f4489773715e..0c7c56a91b25 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -134,7 +134,7 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, amdgpu_ras_save_bad_pages(adev); if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num) - adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.num_recs); + adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs); } amdgpu_ras_reset_gpu(adev); -- cgit From 63d4c081a556a1e1f200411ad1e34a51965f1048 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Tue, 6 Apr 2021 07:07:54 -0400 Subject: drm/amdgpu: Optimize EEPROM RAS table I/O Split functionality between read and write, which simplifies the code and exposes areas of optimization and more or less complexity, and take advantage of that. Read and write the table in one go; use a separate stage to decode or encode the data, as opposed to on the fly, which keeps the I2C bus busy. Use a single read/write to read/write the table or at most two if the number of records we're reading/writing wraps around. Check the check-sum of a table in EEPROM on init. Update the checksum at the same time as when updating the table header signature, when the threshold was increased on boot. Take advantage of arithmetic modulo 256, that is, use a byte!, to greatly simplify checksum arithmetic. Cc: Alexander Deucher Cc: Andrey Grodzovsky Signed-off-by: Luben Tuikov Acked-by: Alexander Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c | 20 +- drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h | 23 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 844 +++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 32 +- 5 files changed, 620 insertions(+), 307 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c index a4815af111ed..4c3c65a5acae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.c @@ -176,8 +176,8 @@ static int __amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, * * Returns the number of bytes read/written; -errno on error. */ -int amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, - u8 *eeprom_buf, u16 buf_size, bool read) +static int amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, + u8 *eeprom_buf, u16 buf_size, bool read) { const struct i2c_adapter_quirks *quirks = i2c_adap->quirks; u16 limit; @@ -221,3 +221,19 @@ int amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, return res; } } + +int amdgpu_eeprom_read(struct i2c_adapter *i2c_adap, + u32 eeprom_addr, u8 *eeprom_buf, + u16 bytes) +{ + return amdgpu_eeprom_xfer(i2c_adap, eeprom_addr, eeprom_buf, bytes, + true); +} + +int amdgpu_eeprom_write(struct i2c_adapter *i2c_adap, + u32 eeprom_addr, u8 *eeprom_buf, + u16 bytes) +{ + return amdgpu_eeprom_xfer(i2c_adap, eeprom_addr, eeprom_buf, bytes, + false); +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h index 966b434f0de2..6935adb2be1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_eeprom.h @@ -26,23 +26,12 @@ #include -int amdgpu_eeprom_xfer(struct i2c_adapter *i2c_adap, u32 eeprom_addr, - u8 *eeprom_buf, u16 bytes, bool read); +int amdgpu_eeprom_read(struct i2c_adapter *i2c_adap, + u32 eeprom_addr, u8 *eeprom_buf, + u16 bytes); -static inline int amdgpu_eeprom_read(struct i2c_adapter *i2c_adap, - u32 eeprom_addr, u8 *eeprom_buf, - u16 bytes) -{ - return amdgpu_eeprom_xfer(i2c_adap, eeprom_addr, eeprom_buf, bytes, - true); -} - -static inline int amdgpu_eeprom_write(struct i2c_adapter *i2c_adap, - u32 eeprom_addr, u8 *eeprom_buf, - u16 bytes) -{ - return amdgpu_eeprom_xfer(i2c_adap, eeprom_addr, eeprom_buf, bytes, - false); -} +int amdgpu_eeprom_write(struct i2c_adapter *i2c_adap, + u32 eeprom_addr, u8 *eeprom_buf, + u16 bytes); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9371be383525..fd63e5f6d204 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -451,7 +451,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, ret = amdgpu_ras_eeprom_reset_table( &(amdgpu_ras_get_context(adev)->eeprom_control)); - if (ret > 0) { + if (!ret) { /* Something was written to EEPROM. */ amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; @@ -1821,9 +1821,9 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) save_count = data->count - control->ras_num_recs; /* only new entries are saved */ if (save_count > 0) { - if (amdgpu_ras_eeprom_write(control, - &data->bps[control->ras_num_recs], - save_count)) { + if (amdgpu_ras_eeprom_append(control, + &data->bps[control->ras_num_recs], + save_count)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 7522d2ca5b03..dc4a845a3240 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -52,11 +52,27 @@ /* Assume 2-Mbit size EEPROM and take up the whole space. */ #define RAS_TBL_SIZE_BYTES (256 * 1024) -#define RAS_HDR_START 0 +#define RAS_TABLE_START 0 +#define RAS_HDR_START RAS_TABLE_START #define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE) #define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ / RAS_TABLE_RECORD_SIZE) +/* Given a zero-based index of an EEPROM RAS record, yields the EEPROM + * offset off of RAS_TABLE_START. That is, this is something you can + * add to control->i2c_address, and then tell I2C layer to read + * from/write to there. _N is the so called absolute index, + * because it starts right after the table header. + */ +#define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \ + (_N) * RAS_TABLE_RECORD_SIZE) + +#define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \ + (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE) + +#define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \ + RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE) + #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev static bool __is_ras_eeprom_supported(struct amdgpu_device *adev) @@ -117,10 +133,11 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev, return true; } -static void __encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr, - unsigned char *buf) +static void +__encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header *hdr, + unsigned char *buf) { - uint32_t *pp = (uint32_t *) buf; + u32 *pp = (uint32_t *)buf; pp[0] = cpu_to_le32(hdr->header); pp[1] = cpu_to_le32(hdr->version); @@ -129,10 +146,11 @@ static void __encode_table_header_to_buf(struct amdgpu_ras_eeprom_table_header * pp[4] = cpu_to_le32(hdr->checksum); } -static void __decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr, - unsigned char *buf) +static void +__decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header *hdr, + unsigned char *buf) { - u32 *pp = (uint32_t *) buf; + u32 *pp = (uint32_t *)buf; hdr->header = le32_to_cpu(pp[0]); hdr->version = le32_to_cpu(pp[1]); @@ -141,124 +159,89 @@ static void __decode_table_header_from_buf(struct amdgpu_ras_eeprom_table_header hdr->checksum = le32_to_cpu(pp[4]); } -static int __write_table_header(struct amdgpu_ras_eeprom_control *control, - unsigned char *buf) +static int __write_table_header(struct amdgpu_ras_eeprom_control *control) { - int ret = 0; + u8 buf[RAS_TABLE_HEADER_SIZE]; struct amdgpu_device *adev = to_amdgpu_device(control); + int res; + memset(buf, 0, sizeof(buf)); __encode_table_header_to_buf(&control->tbl_hdr, buf); /* i2c may be unstable in gpu reset */ down_read(&adev->reset_sem); - ret = amdgpu_eeprom_write(&adev->pm.smu_i2c, - control->i2c_address + RAS_HDR_START, + res = amdgpu_eeprom_write(&adev->pm.smu_i2c, + control->i2c_address + + control->ras_header_offset, buf, RAS_TABLE_HEADER_SIZE); up_read(&adev->reset_sem); - if (ret < 1) - DRM_ERROR("Failed to write EEPROM table header, ret:%d", ret); + if (res < 0) { + DRM_ERROR("Failed to write EEPROM table header:%d", res); + } else if (res < RAS_TABLE_HEADER_SIZE) { + DRM_ERROR("Short write:%d out of %d\n", + res, RAS_TABLE_HEADER_SIZE); + res = -EIO; + } else { + res = 0; + } - return ret; + return res; } static u8 __calc_hdr_byte_sum(const struct amdgpu_ras_eeprom_control *control) { - int i; - u8 hdr_sum = 0; - u8 *p; + int ii; + u8 *pp, csum; size_t sz; /* Header checksum, skip checksum field in the calculation */ sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum); - p = (u8 *) &control->tbl_hdr; - for (i = 0; i < sz; i++, p++) - hdr_sum += *p; - - return hdr_sum; -} - -static u8 __calc_recs_byte_sum(const struct eeprom_table_record *record, - const int num) -{ - int i, j; - u8 tbl_sum = 0; - - if (!record) - return 0; - - /* Records checksum */ - for (i = 0; i < num; i++) { - u8 *p = (u8 *) &record[i]; - - for (j = 0; j < sizeof(*record); j++, p++) - tbl_sum += *p; - } + pp = (u8 *) &control->tbl_hdr; + csum = 0; + for (ii = 0; ii < sz; ii++, pp++) + csum += *pp; - return tbl_sum; -} - -static inline u8 -__calc_tbl_byte_sum(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *records, int num) -{ - return __calc_hdr_byte_sum(control) + - __calc_recs_byte_sum(records, num); -} - -static void __update_tbl_checksum(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *records, int num) -{ - u8 v; - - control->tbl_byte_sum = __calc_tbl_byte_sum(control, records, num); - /* Avoid 32-bit sign extension. */ - v = -control->tbl_byte_sum; - control->tbl_hdr.checksum = v; -} - -static bool __verify_tbl_checksum(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *records, - int num) -{ - u8 result; - - control->tbl_byte_sum = __calc_tbl_byte_sum(control, records, num); - - result = (u8)control->tbl_hdr.checksum + control->tbl_byte_sum; - if (result) { - DRM_WARN("RAS table checksum mismatch: stored:0x%02X wants:0x%02hhX", - control->tbl_hdr.checksum, - -control->tbl_byte_sum); - return false; - } - - return true; + return csum; } static int amdgpu_ras_eeprom_correct_header_tag( struct amdgpu_ras_eeprom_control *control, uint32_t header) { - unsigned char buf[RAS_TABLE_HEADER_SIZE]; struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; - int ret = 0; + u8 *hh; + int res; + u8 csum; - memset(buf, 0, RAS_TABLE_HEADER_SIZE); + csum = -hdr->checksum; + hh = (void *) &hdr->header; + csum -= (hh[0] + hh[1] + hh[2] + hh[3]); + hh = (void *) &header; + csum += hh[0] + hh[1] + hh[2] + hh[3]; + csum = -csum; mutex_lock(&control->ras_tbl_mutex); hdr->header = header; - ret = __write_table_header(control, buf); + hdr->checksum = csum; + res = __write_table_header(control); mutex_unlock(&control->ras_tbl_mutex); - return ret; + return res; } +/** + * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table + * @control: pointer to control structure + * + * Reset the contents of the header of the RAS EEPROM table. + * Return 0 on success, -errno on error. + */ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) { - unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; - int ret = 0; + u8 csum; + int res; mutex_lock(&control->ras_tbl_mutex); @@ -267,83 +250,23 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) hdr->first_rec_offset = RAS_RECORD_START; hdr->tbl_size = RAS_TABLE_HEADER_SIZE; - __update_tbl_checksum(control, NULL, 0); - control->next_addr = RAS_RECORD_START; - ret = __write_table_header(control, buf); - - mutex_unlock(&control->ras_tbl_mutex); - - return ret; - -} - -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit) -{ - int ret = 0; - struct amdgpu_device *adev = to_amdgpu_device(control); - unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; - struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - - *exceed_err_limit = false; - - if (!__is_ras_eeprom_supported(adev)) - return 0; + csum = __calc_hdr_byte_sum(control); + csum = -csum; + hdr->checksum = csum; + res = __write_table_header(control); - /* Verify i2c adapter is initialized */ - if (!adev->pm.smu_i2c.algo) - return -ENOENT; + control->ras_num_recs = 0; + control->ras_fri = 0; - if (!__get_eeprom_i2c_addr(adev, control)) - return -EINVAL; - - mutex_init(&control->ras_tbl_mutex); - - /* Read/Create table header from EEPROM address 0 */ - ret = amdgpu_eeprom_read(&adev->pm.smu_i2c, - control->i2c_address + RAS_HDR_START, - buf, RAS_TABLE_HEADER_SIZE); - if (ret < 1) { - DRM_ERROR("Failed to read EEPROM table header, ret:%d", ret); - return ret; - } - - __decode_table_header_from_buf(hdr, buf); - - if (hdr->header == RAS_TABLE_HDR_VAL) { - control->ras_num_recs = (hdr->tbl_size - RAS_TABLE_HEADER_SIZE) / - RAS_TABLE_RECORD_SIZE; - control->tbl_byte_sum = __calc_hdr_byte_sum(control); - control->next_addr = RAS_RECORD_START; - - DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", - control->ras_num_recs); - - } else if ((hdr->header == RAS_TABLE_HDR_BAD) && - (amdgpu_bad_page_threshold != 0)) { - if (ras->bad_page_cnt_threshold > control->ras_num_recs) { - dev_info(adev->dev, "Using one valid bigger bad page " - "threshold and correcting eeprom header tag.\n"); - ret = amdgpu_ras_eeprom_correct_header_tag(control, - RAS_TABLE_HDR_VAL); - } else { - *exceed_err_limit = true; - dev_err(adev->dev, "Exceeding the bad_page_threshold parameter, " - "disabling the GPU.\n"); - } - } else { - DRM_INFO("Creating new EEPROM table"); - - ret = amdgpu_ras_eeprom_reset_table(control); - } + mutex_unlock(&control->ras_tbl_mutex); - return ret > 0 ? 0 : -EIO; + return res; } -static void __encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *record, - unsigned char *buf) +static void +__encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *record, + unsigned char *buf) { __le64 tmp = 0; int i = 0; @@ -368,9 +291,10 @@ static void __encode_table_record_to_buf(struct amdgpu_ras_eeprom_control *contr memcpy(buf + i, &tmp, 6); } -static void __decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *record, - unsigned char *buf) +static void +__decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *record, + unsigned char *buf) { __le64 tmp = 0; int i = 0; @@ -395,22 +319,6 @@ static void __decode_table_record_from_buf(struct amdgpu_ras_eeprom_control *con record->retired_page = (le64_to_cpu(tmp) & 0xffffffffffff); } -/* - * When reaching end of EEPROM memory jump back to 0 record address - */ -static uint32_t __correct_eeprom_dest_address(uint32_t curr_address) -{ - u32 next_address = curr_address + RAS_TABLE_RECORD_SIZE; - - /* When all EEPROM memory used jump back to 0 address */ - if (next_address >= RAS_TBL_SIZE_BYTES) { - DRM_INFO("Reached end of EEPROM memory, wrap around to 0."); - return RAS_RECORD_START; - } - - return curr_address; -} - bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -427,148 +335,530 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) { dev_warn(adev->dev, "This GPU is in BAD status."); - dev_warn(adev->dev, "Please retire it or setting one bigger " - "threshold value when reloading driver.\n"); + dev_warn(adev->dev, "Please retire it or set a larger " + "threshold value when reloading driver.\n"); return true; } return false; } -static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *records, - const u32 num, bool write) +/** + * __amdgpu_ras_eeprom_write -- write indexed from buffer to EEPROM + * @control: pointer to control structure + * @buf: pointer to buffer containing data to write + * @fri: start writing at this index + * @num: number of records to write + * + * The caller must hold the table mutex in @control. + * Return 0 on success, -errno otherwise. + */ +static int __amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, + u8 *buf, const u32 fri, const u32 num) { - int i, ret = 0; - unsigned char *bufs, *buf; - struct eeprom_table_record *record; struct amdgpu_device *adev = to_amdgpu_device(control); - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + u32 buf_size; + int res; - if (!__is_ras_eeprom_supported(adev)) - return 0; + /* i2c may be unstable in gpu reset */ + down_read(&adev->reset_sem); + buf_size = num * RAS_TABLE_RECORD_SIZE; + res = amdgpu_eeprom_write(&adev->pm.smu_i2c, + control->i2c_address + + RAS_INDEX_TO_OFFSET(control, fri), + buf, buf_size); + up_read(&adev->reset_sem); + if (res < 0) { + DRM_ERROR("Writing %d EEPROM table records error:%d", + num, res); + } else if (res < buf_size) { + /* Short write, return error. + */ + DRM_ERROR("Wrote %d records out of %d", + res / RAS_TABLE_RECORD_SIZE, num); + res = -EIO; + } else { + res = 0; + } + + return res; +} - bufs = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); - if (!bufs) +static int +amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *record, + const u32 num) +{ + u32 a, b, i; + u8 *buf, *pp; + int res; + + buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); + if (!buf) return -ENOMEM; - mutex_lock(&control->ras_tbl_mutex); + /* Encode all of them in one go. + */ + pp = buf; + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) + __encode_table_record_to_buf(control, &record[i], pp); + + /* a, first record index to write into. + * b, last record index to write into. + * a = first index to read (fri) + number of records in the table, + * b = a + @num - 1. + * Let N = control->ras_max_num_record_count, then we have, + * case 0: 0 <= a <= b < N, + * just append @num records starting at a; + * case 1: 0 <= a < N <= b, + * append (N - a) records starting at a, and + * append the remainder, b % N + 1, starting at 0. + * case 2: 0 <= fri < N <= a <= b, then modulo N we get two subcases, + * case 2a: 0 <= a <= b < N + * append num records starting at a; and fix fri if b overwrote it, + * and since a <= b, if b overwrote it then a must've also, + * and if b didn't overwrite it, then a didn't also. + * case 2b: 0 <= b < a < N + * write num records starting at a, which wraps around 0=N + * and overwrite fri unconditionally. Now from case 2a, + * this means that b eclipsed fri to overwrite it and wrap + * around 0 again, i.e. b = 2N+r pre modulo N, so we unconditionally + * set fri = b + 1 (mod N). + * Now, since fri is updated in every case, except the trivial case 0, + * the number of records present in the table after writing, is, + * num_recs - 1 = b - fri (mod N), and we take the positive value, + * by adding an arbitrary multiple of N before taking the modulo N + * as shown below. + */ + a = control->ras_fri + control->ras_num_recs; + b = a + num - 1; + if (b < control->ras_max_record_count) { + res = __amdgpu_ras_eeprom_write(control, buf, a, num); + } else if (a < control->ras_max_record_count) { + u32 g0, g1; + + g0 = control->ras_max_record_count - a; + g1 = b % control->ras_max_record_count + 1; + res = __amdgpu_ras_eeprom_write(control, buf, a, g0); + if (res) + goto Out; + res = __amdgpu_ras_eeprom_write(control, + buf + g0 * RAS_TABLE_RECORD_SIZE, + 0, g1); + if (res) + goto Out; + if (g1 > control->ras_fri) + control->ras_fri = g1 % control->ras_max_record_count; + } else { + a %= control->ras_max_record_count; + b %= control->ras_max_record_count; + + if (a <= b) { + /* Note that, b - a + 1 = num. */ + res = __amdgpu_ras_eeprom_write(control, buf, a, num); + if (res) + goto Out; + if (b >= control->ras_fri) + control->ras_fri = (b + 1) % control->ras_max_record_count; + } else { + u32 g0, g1; + + /* b < a, which means, we write from + * a to the end of the table, and from + * the start of the table to b. + */ + g0 = control->ras_max_record_count - a; + g1 = b + 1; + res = __amdgpu_ras_eeprom_write(control, buf, a, g0); + if (res) + goto Out; + res = __amdgpu_ras_eeprom_write(control, + buf + g0 * RAS_TABLE_RECORD_SIZE, + 0, g1); + if (res) + goto Out; + control->ras_fri = g1 % control->ras_max_record_count; + } + } + control->ras_num_recs = 1 + (control->ras_max_record_count + b + - control->ras_fri) + % control->ras_max_record_count; +Out: + kfree(buf); + return res; +} - /* - * If saved bad pages number exceeds the bad page threshold for - * the whole VRAM, update table header to mark the BAD GPU tag - * and schedule one ras recovery after eeprom write is done, - * this can avoid the missing for latest records. - * - * This new header will be picked up and checked in the bootup - * by ras recovery, which may break bootup process to notify - * user this GPU is in bad state and to retire such GPU for - * further check. +static int +amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + u8 *buf, *pp, csum; + u32 buf_size; + int res; + + /* Modify the header if it exceeds. */ - if (write && (amdgpu_bad_page_threshold != 0) && - ((control->ras_num_recs + num) >= ras->bad_page_cnt_threshold)) { + if (amdgpu_bad_page_threshold != 0 && + control->ras_num_recs >= ras->bad_page_cnt_threshold) { dev_warn(adev->dev, - "Saved bad pages(%d) reaches threshold value(%d).\n", - control->ras_num_recs + num, ras->bad_page_cnt_threshold); + "Saved bad pages %d reaches threshold value %d\n", + control->ras_num_recs, ras->bad_page_cnt_threshold); control->tbl_hdr.header = RAS_TABLE_HDR_BAD; } - /* In case of overflow just start from beginning to not lose newest records */ - if (write && - (control->next_addr + - RAS_TABLE_RECORD_SIZE * num >= RAS_TBL_SIZE_BYTES)) - control->next_addr = RAS_RECORD_START; + control->tbl_hdr.version = RAS_TABLE_VER; + control->tbl_hdr.first_rec_offset = RAS_INDEX_TO_OFFSET(control, control->ras_fri); + control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; + control->tbl_hdr.checksum = 0; + + buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE; + buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); + if (!buf) { + DRM_ERROR("allocating memory for table of size %d bytes failed\n", + control->tbl_hdr.tbl_size); + res = -ENOMEM; + goto Out; + } - /* - * TODO Currently makes EEPROM writes for each record, this creates - * internal fragmentation. Optimized the code to do full page write of - * 256b + down_read(&adev->reset_sem); + res = amdgpu_eeprom_read(&adev->pm.smu_i2c, + control->i2c_address + + control->ras_record_offset, + buf, buf_size); + up_read(&adev->reset_sem); + if (res < 0) { + DRM_ERROR("EEPROM failed reading records:%d\n", + res); + goto Out; + } else if (res < buf_size) { + DRM_ERROR("EEPROM read %d out of %d bytes\n", + res, buf_size); + res = -EIO; + goto Out; + } + + /* Recalc the checksum. */ - for (i = 0; i < num; i++) { - buf = &bufs[i * RAS_TABLE_RECORD_SIZE]; - record = &records[i]; + csum = 0; + for (pp = buf; pp < buf + buf_size; pp++) + csum += *pp; + + csum += __calc_hdr_byte_sum(control); + /* avoid sign extension when assigning to "checksum" */ + csum = -csum; + control->tbl_hdr.checksum = csum; + res = __write_table_header(control); +Out: + kfree(buf); + return res; +} + +/** + * amdgpu_ras_eeprom_append -- append records to the EEPROM RAS table + * @control: pointer to control structure + * @record: array of records to append + * @num: number of records in @record array + * + * Append @num records to the table, calculate the checksum and write + * the table back to EEPROM. The maximum number of records that + * can be appended is between 1 and control->ras_max_record_count, + * regardless of how many records are already stored in the table. + * + * Return 0 on success or if EEPROM is not supported, -errno on error. + */ +int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *record, + const u32 num) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + int res; + + if (!__is_ras_eeprom_supported(adev)) + return 0; - control->next_addr = __correct_eeprom_dest_address(control->next_addr); + if (num == 0) { + DRM_ERROR("will not append 0 records\n"); + return -EINVAL; + } else if (num > control->ras_max_record_count) { + DRM_ERROR("cannot append %d records than the size of table %d\n", + num, control->ras_max_record_count); + return -EINVAL; + } + + mutex_lock(&control->ras_tbl_mutex); - /* EEPROM table content is stored in LE format */ - if (write) - __encode_table_record_to_buf(control, record, buf); + res = amdgpu_ras_eeprom_append_table(control, record, num); + if (!res) + res = amdgpu_ras_eeprom_update_header(control); - /* i2c may be unstable in gpu reset */ - down_read(&adev->reset_sem); - ret = amdgpu_eeprom_xfer(&adev->pm.smu_i2c, - control->i2c_address + control->next_addr, - buf, RAS_TABLE_RECORD_SIZE, !write); - up_read(&adev->reset_sem); + mutex_unlock(&control->ras_tbl_mutex); + return res; +} - if (ret < 1) { - DRM_ERROR("Failed to process EEPROM table records, ret:%d", ret); +/** + * __amdgpu_ras_eeprom_read -- read indexed from EEPROM into buffer + * @control: pointer to control structure + * @buf: pointer to buffer to read into + * @fri: first record index, start reading at this index, absolute index + * @num: number of records to read + * + * The caller must hold the table mutex in @control. + * Return 0 on success, -errno otherwise. + */ +static int __amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, + u8 *buf, const u32 fri, const u32 num) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + u32 buf_size; + int res; - /* TODO Restore prev next EEPROM address ? */ - goto free_buf; - } - /* - * The destination EEPROM address might need to be corrected to account - * for page or entire memory wrapping + /* i2c may be unstable in gpu reset */ + down_read(&adev->reset_sem); + buf_size = num * RAS_TABLE_RECORD_SIZE; + res = amdgpu_eeprom_read(&adev->pm.smu_i2c, + control->i2c_address + + RAS_INDEX_TO_OFFSET(control, fri), + buf, buf_size); + up_read(&adev->reset_sem); + if (res < 0) { + DRM_ERROR("Reading %d EEPROM table records error:%d", + num, res); + } else if (res < buf_size) { + /* Short read, return error. */ - control->next_addr += RAS_TABLE_RECORD_SIZE; + DRM_ERROR("Read %d records out of %d", + res / RAS_TABLE_RECORD_SIZE, num); + res = -EIO; + } else { + res = 0; } - if (!write) { - for (i = 0; i < num; i++) { - buf = &bufs[i * RAS_TABLE_RECORD_SIZE]; - record = &records[i]; + return res; +} - __decode_table_record_from_buf(control, record, buf); - } +/** + * amdgpu_ras_eeprom_read -- read EEPROM + * @control: pointer to control structure + * @record: array of records to read into + * @num: number of records in @record + * + * Reads num records from the RAS table in EEPROM and + * writes the data into @record array. + * + * Returns 0 on success, -errno on error. + */ +int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *record, + const u32 num) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + int i, res; + u8 *buf, *pp; + u32 g0, g1; + + if (!__is_ras_eeprom_supported(adev)) + return 0; + + if (num == 0) { + DRM_ERROR("will not read 0 records\n"); + return -EINVAL; + } else if (num > control->ras_num_recs) { + DRM_ERROR("too many records to read:%d available:%d\n", + num, control->ras_num_recs); + return -EINVAL; } - if (write) { - /* - * Update table header with size and CRC and account for table - * wrap around where the assumption is that we treat it as empty - * table - * - * TODO - Check the assumption is correct - */ - control->ras_num_recs += num; - control->ras_num_recs %= RAS_MAX_RECORD_COUNT; - control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num; - if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES) - control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + - control->ras_num_recs * RAS_TABLE_RECORD_SIZE; - - __update_tbl_checksum(control, records, num); - __write_table_header(control, bufs); - } else if (!__verify_tbl_checksum(control, records, num)) { - DRM_WARN("EEPROM Table checksum mismatch!"); - /* TODO Uncomment when EEPROM read/write is relliable */ - /* ret = -EIO; */ + buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Determine how many records to read, from the first record + * index, fri, to the end of the table, and from the beginning + * of the table, such that the total number of records is + * @num, and we handle wrap around when fri > 0 and + * fri + num > RAS_MAX_RECORD_COUNT. + * + * First we compute the index of the last element + * which would be fetched from each region, + * g0 is in [fri, fri + num - 1], and + * g1 is in [0, RAS_MAX_RECORD_COUNT - 1]. + * Then, if g0 < RAS_MAX_RECORD_COUNT, the index of + * the last element to fetch, we set g0 to _the number_ + * of elements to fetch, @num, since we know that the last + * indexed to be fetched does not exceed the table. + * + * If, however, g0 >= RAS_MAX_RECORD_COUNT, then + * we set g0 to the number of elements to read + * until the end of the table, and g1 to the number of + * elements to read from the beginning of the table. + */ + g0 = control->ras_fri + num - 1; + g1 = g0 % control->ras_max_record_count; + if (g0 < control->ras_max_record_count) { + g0 = num; + g1 = 0; + } else { + g0 = control->ras_max_record_count - control->ras_fri; + g1 += 1; + } + + mutex_lock(&control->ras_tbl_mutex); + res = __amdgpu_ras_eeprom_read(control, buf, control->ras_fri, g0); + if (res) + goto Out; + if (g1) { + res = __amdgpu_ras_eeprom_read(control, + buf + g0 * RAS_TABLE_RECORD_SIZE, + 0, g1); + if (res) + goto Out; } -free_buf: - kfree(bufs); + res = 0; + /* Read up everything? Then transform. + */ + pp = buf; + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) + __decode_table_record_from_buf(control, &record[i], pp); +Out: + kfree(buf); mutex_unlock(&control->ras_tbl_mutex); - return ret == num ? 0 : -EIO; + return res; } -int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *records, - const u32 num) +inline uint32_t amdgpu_ras_eeprom_max_record_count(void) { - return amdgpu_ras_eeprom_xfer(control, records, num, false); + return RAS_MAX_RECORD_COUNT; } -int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *records, - const u32 num) +/** + * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum + * @control: pointer to control structure + * + * Check the checksum of the stored in EEPROM RAS table. + * + * Return 0 if the checksum is correct, + * positive if it is not correct, and + * -errno on I/O error. + */ +static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control) { - return amdgpu_ras_eeprom_xfer(control, records, num, true); + struct amdgpu_device *adev = to_amdgpu_device(control); + int res; + u8 csum, *buf, *pp; + u32 buf_size; + + buf_size = RAS_TABLE_HEADER_SIZE + + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; + buf = kzalloc(buf_size, GFP_KERNEL); + if (!buf) { + DRM_ERROR("Out of memory checking RAS table checksum.\n"); + return -ENOMEM; + } + + res = amdgpu_eeprom_read(&adev->pm.smu_i2c, + control->i2c_address + + control->ras_header_offset, + buf, buf_size); + if (res < buf_size) { + DRM_ERROR("Partial read for checksum, res:%d\n", res); + /* On partial reads, return -EIO. + */ + if (res >= 0) + res = -EIO; + goto Out; + } + + csum = 0; + for (pp = buf; pp < buf + buf_size; pp++) + csum += *pp; +Out: + kfree(buf); + return res < 0 ? res : csum; } -inline uint32_t amdgpu_ras_eeprom_max_record_count(void) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit) { - return RAS_MAX_RECORD_COUNT; + struct amdgpu_device *adev = to_amdgpu_device(control); + unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; + struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + int res; + + *exceed_err_limit = false; + + if (!__is_ras_eeprom_supported(adev)) + return 0; + + /* Verify i2c adapter is initialized */ + if (!adev->pm.smu_i2c.algo) + return -ENOENT; + + if (!__get_eeprom_i2c_addr(adev, control)) + return -EINVAL; + + control->ras_header_offset = RAS_HDR_START; + control->ras_record_offset = RAS_RECORD_START; + control->ras_max_record_count = RAS_MAX_RECORD_COUNT; + mutex_init(&control->ras_tbl_mutex); + + /* Read the table header from EEPROM address */ + res = amdgpu_eeprom_read(&adev->pm.smu_i2c, + control->i2c_address + control->ras_header_offset, + buf, RAS_TABLE_HEADER_SIZE); + if (res < RAS_TABLE_HEADER_SIZE) { + DRM_ERROR("Failed to read EEPROM table header, res:%d", res); + return res >= 0 ? -EIO : res; + } + + __decode_table_header_from_buf(hdr, buf); + + control->ras_num_recs = RAS_NUM_RECS(hdr); + control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset); + + if (hdr->header == RAS_TABLE_HDR_VAL) { + DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", + control->ras_num_recs); + res = __verify_ras_table_checksum(control); + if (res) + DRM_ERROR("RAS table incorrect checksum or error:%d\n", + res); + } else if (hdr->header == RAS_TABLE_HDR_BAD && + amdgpu_bad_page_threshold != 0) { + res = __verify_ras_table_checksum(control); + if (res) + DRM_ERROR("RAS Table incorrect checksum or error:%d\n", + res); + if (ras->bad_page_cnt_threshold > control->ras_num_recs) { + /* This means that, the threshold was increased since + * the last time the system was booted, and now, + * ras->bad_page_cnt_threshold - control->num_recs > 0, + * so that at least one more record can be saved, + * before the page count threshold is reached. + */ + dev_info(adev->dev, + "records:%d threshold:%d, resetting " + "RAS table header signature", + control->ras_num_recs, + ras->bad_page_cnt_threshold); + res = amdgpu_ras_eeprom_correct_header_tag(control, + RAS_TABLE_HDR_VAL); + } else { + *exceed_err_limit = true; + dev_err(adev->dev, + "RAS records:%d exceed threshold:%d, " + "maybe retire this GPU?", + control->ras_num_recs, ras->bad_page_cnt_threshold); + } + } else { + DRM_INFO("Creating a new EEPROM table"); + + res = amdgpu_ras_eeprom_reset_table(control); + } + + return res < 0 ? res : 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 67a7ec3e6c22..e26af82ea63a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -28,7 +28,7 @@ struct amdgpu_device; -enum amdgpu_ras_eeprom_err_type{ +enum amdgpu_ras_eeprom_err_type { AMDGPU_RAS_EEPROM_ERR_PLACE_HOLDER, AMDGPU_RAS_EEPROM_ERR_RECOVERABLE, AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE @@ -51,17 +51,34 @@ struct amdgpu_ras_eeprom_control { */ u32 i2c_address; - uint32_t next_addr; + /* The byte offset off of @i2c_address + * where the table header is found, + * and where the records start--always + * right after the header. + */ + u32 ras_header_offset; + u32 ras_record_offset; /* Number of records in the table. */ - unsigned int ras_num_recs; + u32 ras_num_recs; + + /* First record index to read, 0-based. + * Range is [0, num_recs-1]. This is + * an absolute index, starting right after + * the table header. + */ + u32 ras_fri; + + /* Maximum possible number of records + * we could store, i.e. the maximum capacity + * of the table. + */ + u32 ras_max_record_count; /* Protect table access via this mutex. */ struct mutex ras_tbl_mutex; - - u8 tbl_byte_sum; }; /* @@ -91,6 +108,7 @@ struct eeprom_table_record { int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, bool *exceed_err_limit); + int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev); @@ -98,8 +116,8 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev); int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *records, const u32 num); -int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, - struct eeprom_table_record *records, const u32 num); +int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, + struct eeprom_table_record *records, const u32 num); inline uint32_t amdgpu_ras_eeprom_max_record_count(void); -- cgit From c65b0805e779196ba07c2cb29e7f71777e81009d Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Thu, 8 Apr 2021 11:34:26 -0400 Subject: drm/amdgpu: RAS EEPROM table is now in debugfs Add "ras_eeprom_size" file in debugfs, which reports the maximum size allocated to the RAS table in EEROM, as the number of bytes and the number of records it could store. For instance, $cat /sys/kernel/debug/dri/0/ras/ras_eeprom_size 262144 bytes or 10921 records $_ Add "ras_eeprom_table" file in debugfs, which dumps the RAS table stored EEPROM, in a formatted way. For instance, $cat ras_eeprom_table Signature Version FirstOffs Size Checksum 0x414D4452 0x00010000 0x00000014 0x000000EC 0x000000DA Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage 0 0x00014 ue 0x00 0x00000000607608DC 0x000000000000 0x00 0x00 0x000000000000 1 0x0002C ue 0x00 0x00000000607608DC 0x000000001000 0x00 0x00 0x000000000001 2 0x00044 ue 0x00 0x00000000607608DC 0x000000002000 0x00 0x00 0x000000000002 3 0x0005C ue 0x00 0x00000000607608DC 0x000000003000 0x00 0x00 0x000000000003 4 0x00074 ue 0x00 0x00000000607608DC 0x000000004000 0x00 0x00 0x000000000004 5 0x0008C ue 0x00 0x00000000607608DC 0x000000005000 0x00 0x00 0x000000000005 6 0x000A4 ue 0x00 0x00000000607608DC 0x000000006000 0x00 0x00 0x000000000006 7 0x000BC ue 0x00 0x00000000607608DC 0x000000007000 0x00 0x00 0x000000000007 8 0x000D4 ue 0x00 0x00000000607608DD 0x000000008000 0x00 0x00 0x000000000008 $_ Cc: Alexander Deucher Cc: Andrey Grodzovsky Cc: John Clements Cc: Hawking Zhang Cc: Xinhui Pan Signed-off-by: Luben Tuikov Acked-by: Alexander Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 241 ++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 10 +- 4 files changed, 252 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index fd63e5f6d204..53a54ff8edc3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -404,9 +404,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, /* umc ce/ue error injection for a bad page is not allowed */ if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && amdgpu_ras_check_bad_page(adev, data.inject.address)) { - dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked " - "as bad before error injection!\n", - data.inject.address); + dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " + "already been marked as bad!\n", + data.inject.address); break; } @@ -1301,6 +1301,12 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device * &con->bad_page_cnt_threshold); debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); + debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev, + &amdgpu_ras_debugfs_eeprom_size_ops); + con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table", + S_IRUGO, dir, adev, + &amdgpu_ras_debugfs_eeprom_table_ops); + amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control); /* * After one uncorrectable error happens, usually GPU recovery will diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 256cea5d34f2..283afd791db1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -318,6 +318,7 @@ struct amdgpu_ras { /* sysfs */ struct device_attribute features_attr; struct bin_attribute badpages_attr; + struct dentry *de_ras_eeprom_table; /* block array */ struct ras_manager *objs; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index dc4a845a3240..677e379f5fb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -27,6 +27,8 @@ #include #include "atom.h" #include "amdgpu_eeprom.h" +#include +#include #define EEPROM_I2C_MADDR_VEGA20 0x0 #define EEPROM_I2C_MADDR_ARCTURUS 0x40000 @@ -70,6 +72,13 @@ #define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \ (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE) +/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off + * of "fri", return the absolute record index off of the end of + * the table header. + */ +#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \ + (_C)->ras_max_record_count) + #define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \ RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE) @@ -77,13 +86,10 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device *adev) { - if ((adev->asic_type == CHIP_VEGA20) || - (adev->asic_type == CHIP_ARCTURUS) || - (adev->asic_type == CHIP_SIENNA_CICHLID) || - (adev->asic_type == CHIP_ALDEBARAN)) - return true; - - return false; + return adev->asic_type == CHIP_VEGA20 || + adev->asic_type == CHIP_ARCTURUS || + adev->asic_type == CHIP_SIENNA_CICHLID || + adev->asic_type == CHIP_ALDEBARAN; } static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev, @@ -258,6 +264,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) control->ras_num_recs = 0; control->ras_fri = 0; + amdgpu_ras_debugfs_set_ret_size(control); + mutex_unlock(&control->ras_tbl_mutex); return res; @@ -591,6 +599,8 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, res = amdgpu_ras_eeprom_append_table(control, record, num); if (!res) res = amdgpu_ras_eeprom_update_header(control); + if (!res) + amdgpu_ras_debugfs_set_ret_size(control); mutex_unlock(&control->ras_tbl_mutex); return res; @@ -734,6 +744,223 @@ inline uint32_t amdgpu_ras_eeprom_max_record_count(void) return RAS_MAX_RECORD_COUNT; } +static ssize_t +amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL; + u8 data[50]; + int res; + + if (!size) + return size; + + if (!ras || !control) { + res = snprintf(data, sizeof(data), "Not supported\n"); + } else { + res = snprintf(data, sizeof(data), "%d bytes or %d records\n", + RAS_TBL_SIZE_BYTES, control->ras_max_record_count); + } + + if (*pos >= res) + return 0; + + res -= *pos; + res = min_t(size_t, res, size); + + if (copy_to_user(buf, &data[*pos], res)) + return -EINVAL; + + *pos += res; + + return res; +} + +const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = { + .owner = THIS_MODULE, + .read = amdgpu_ras_debugfs_eeprom_size_read, + .write = NULL, + .llseek = default_llseek, +}; + +static const char *tbl_hdr_str = " Signature Version FirstOffs Size Checksum\n"; +static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n"; +#define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1) +static const char *rec_hdr_str = "Index Offset ErrType Bank/CU TimeStamp Offs/Addr MemChl MCUMCID RetiredPage\n"; +static const char *rec_hdr_fmt = "%5d 0x%05X %7s 0x%02X 0x%016llX 0x%012llX 0x%02X 0x%02X 0x%012llX\n"; +#define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1) + +static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = { + "ignore", + "re", + "ue", +}; + +static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control) +{ + return strlen(tbl_hdr_str) + tbl_hdr_fmt_size + + strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs; +} + +void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control) +{ + struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras, + eeprom_control); + struct dentry *de = ras->de_ras_eeprom_table; + + if (de) + d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control); +} + +static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control; + const size_t orig_size = size; + int res = -EINVAL; + size_t data_len; + + mutex_lock(&control->ras_tbl_mutex); + + /* We want *pos - data_len > 0, which means there's + * bytes to be printed from data. + */ + data_len = strlen(tbl_hdr_str); + if (*pos < data_len) { + data_len -= *pos; + data_len = min_t(size_t, data_len, size); + if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len)) + goto Out; + buf += data_len; + size -= data_len; + *pos += data_len; + } + + data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size; + if (*pos < data_len && size > 0) { + u8 data[tbl_hdr_fmt_size + 1]; + loff_t lpos; + + snprintf(data, sizeof(data), tbl_hdr_fmt, + control->tbl_hdr.header, + control->tbl_hdr.version, + control->tbl_hdr.first_rec_offset, + control->tbl_hdr.tbl_size, + control->tbl_hdr.checksum); + + data_len -= *pos; + data_len = min_t(size_t, data_len, size); + lpos = *pos - strlen(tbl_hdr_str); + if (copy_to_user(buf, &data[lpos], data_len)) + goto Out; + buf += data_len; + size -= data_len; + *pos += data_len; + } + + data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str); + if (*pos < data_len && size > 0) { + loff_t lpos; + + data_len -= *pos; + data_len = min_t(size_t, data_len, size); + lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size; + if (copy_to_user(buf, &rec_hdr_str[lpos], data_len)) + goto Out; + buf += data_len; + size -= data_len; + *pos += data_len; + } + + data_len = amdgpu_ras_debugfs_table_size(control); + if (*pos < data_len && size > 0) { + u8 dare[RAS_TABLE_RECORD_SIZE]; + u8 data[rec_hdr_fmt_size + 1]; + /* Find the starting record index + */ + int s = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size - + strlen(rec_hdr_str)) / rec_hdr_fmt_size; + int r = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size - + strlen(rec_hdr_str)) % rec_hdr_fmt_size; + struct eeprom_table_record record; + + for ( ; size > 0 && s < control->ras_num_recs; s++) { + u32 ai = RAS_RI_TO_AI(control, s); + /* Read a single record + */ + res = __amdgpu_ras_eeprom_read(control, dare, ai, 1); + if (res) + goto Out; + __decode_table_record_from_buf(control, &record, dare); + snprintf(data, sizeof(data), rec_hdr_fmt, + s, + RAS_INDEX_TO_OFFSET(control, ai), + record_err_type_str[record.err_type], + record.bank, + record.ts, + record.offset, + record.mem_channel, + record.mcumc_id, + record.retired_page); + + data_len = min_t(size_t, rec_hdr_fmt_size - r, size); + if (copy_to_user(buf, &data[r], data_len)) + return -EINVAL; + buf += data_len; + size -= data_len; + *pos += data_len; + r = 0; + } + } + res = 0; +Out: + mutex_unlock(&control->ras_tbl_mutex); + return res < 0 ? res : orig_size - size; +} + +static ssize_t +amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL; + u8 data[81]; + int res; + + if (!size) + return size; + + if (!ras || !control) { + res = snprintf(data, sizeof(data), "Not supported\n"); + if (*pos >= res) + return 0; + + res -= *pos; + res = min_t(size_t, res, size); + + if (copy_to_user(buf, &data[*pos], res)) + return -EINVAL; + + *pos += res; + + return res; + } else { + return amdgpu_ras_debugfs_table_read(f, buf, size, pos); + } +} + +const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = { + .owner = THIS_MODULE, + .read = amdgpu_ras_debugfs_eeprom_table_read, + .write = NULL, + .llseek = default_llseek, +}; + /** * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum * @control: pointer to control structure diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index e26af82ea63a..f95fc61b3021 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -29,9 +29,10 @@ struct amdgpu_device; enum amdgpu_ras_eeprom_err_type { - AMDGPU_RAS_EEPROM_ERR_PLACE_HOLDER, + AMDGPU_RAS_EEPROM_ERR_NA, AMDGPU_RAS_EEPROM_ERR_RECOVERABLE, - AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE + AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE, + AMDGPU_RAS_EEPROM_ERR_COUNT, }; struct amdgpu_ras_eeprom_table_header { @@ -121,4 +122,9 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, inline uint32_t amdgpu_ras_eeprom_max_record_count(void); +void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control); + +extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops; +extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops; + #endif // _AMDGPU_RAS_EEPROM_H -- cgit From 1d9d2ca85b32605ac9c74c8fa42d0c1cfbe019d4 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Thu, 29 Apr 2021 20:15:42 -0400 Subject: drm/amdgpu: Fix koops when accessing RAS EEPROM Debugfs RAS EEPROM files are available when the ASIC supports RAS, and when the debugfs is enabled, an also when "ras_enable" module parameter is set to 0. However in this case, we get a kernel oops when accessing some of the "ras_..." controls in debugfs. The reason for this is that struct amdgpu_ras::adev is unset. This commit sets it, thus enabling access to those facilities. Note that this facilitates EEPROM access and not necessarily RAS features or functionality. Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Alexander Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 53a54ff8edc3..875874ea745e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1947,11 +1947,20 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) bool exc_err_limit = false; int ret; - if (adev->ras_enabled && con) - data = &con->eh_data; - else + if (!con) + return 0; + + /* Allow access to RAS EEPROM via debugfs, when the ASIC + * supports RAS and debugfs is enabled, but when + * adev->ras_enabled is unset, i.e. when "ras_enable" + * module parameter is set to 0. + */ + con->adev = adev; + + if (!adev->ras_enabled) return 0; + data = &con->eh_data; *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); if (!*data) { ret = -ENOMEM; @@ -1961,7 +1970,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); - con->adev = adev; max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); -- cgit From 4d9f771e111ee0144338c1012a90f1762220141a Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Fri, 2 Jul 2021 18:35:14 -0400 Subject: drm/amdgpu: Return error if no RAS In amdgpu_ras_query_error_count() return an error if the device doesn't support RAS. This prevents that function from having to always set the values of the integer pointers (if set), and thus prevents function side effects--always to have to set values of integers if integer pointers set, regardless of whether RAS is supported or not--with this change this side effect is mitigated. Also, if no pointers are set, don't count, since we've no way of reporting the counts. Also, give this function a kernel-doc. Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Reported-by: Tom Rix Fixes: a46751fbcde505 ("drm/amdgpu: Fix RAS function interface") Signed-off-by: Luben Tuikov Reviewed-by: Alexander Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 +++++++++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 ++-- 2 files changed, 38 insertions(+), 17 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 875874ea745e..194f7ccfbf94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -813,7 +813,7 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, /* query/inject/cure begin */ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, - struct ras_query_if *info) + struct ras_query_if *info) { struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); struct ras_err_data err_data = {0, 0, 0, NULL}; @@ -1047,17 +1047,32 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, return ret; } -/* get the total error counts on all IPs */ -void amdgpu_ras_query_error_count(struct amdgpu_device *adev, - unsigned long *ce_count, - unsigned long *ue_count) +/** + * amdgpu_ras_query_error_count -- Get error counts of all IPs + * adev: pointer to AMD GPU device + * ce_count: pointer to an integer to be set to the count of correctible errors. + * ue_count: pointer to an integer to be set to the count of uncorrectible + * errors. + * + * If set, @ce_count or @ue_count, count and return the corresponding + * error counts in those integer pointers. Return 0 if the device + * supports RAS. Return -EOPNOTSUPP if the device doesn't support RAS. + */ +int amdgpu_ras_query_error_count(struct amdgpu_device *adev, + unsigned long *ce_count, + unsigned long *ue_count) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; unsigned long ce, ue; if (!adev->ras_enabled || !con) - return; + return -EOPNOTSUPP; + + /* Don't count since no reporting. + */ + if (!ce_count && !ue_count) + return 0; ce = 0; ue = 0; @@ -1065,9 +1080,11 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev, struct ras_query_if info = { .head = obj->head, }; + int res; - if (amdgpu_ras_query_error_status(adev, &info)) - return; + res = amdgpu_ras_query_error_status(adev, &info); + if (res) + return res; ce += info.ce_count; ue += info.ue_count; @@ -1078,6 +1095,8 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev, if (ue_count) *ue_count = ue; + + return 0; } /* query/inject/cure end */ @@ -2145,9 +2164,10 @@ static void amdgpu_ras_counte_dw(struct work_struct *work) /* Cache new values. */ - amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); - atomic_set(&con->ras_ce_count, ce_count); - atomic_set(&con->ras_ue_count, ue_count); + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { + atomic_set(&con->ras_ce_count, ce_count); + atomic_set(&con->ras_ue_count, ue_count); + } pm_runtime_mark_last_busy(dev->dev); Out: @@ -2320,9 +2340,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, /* Those are the cached values at init. */ - amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); - atomic_set(&con->ras_ce_count, ce_count); - atomic_set(&con->ras_ue_count, ue_count); + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { + atomic_set(&con->ras_ce_count, ce_count); + atomic_set(&con->ras_ue_count, ue_count); + } return 0; cleanup: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 283afd791db1..4d9c63f2f377 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -491,9 +491,9 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, void amdgpu_ras_resume(struct amdgpu_device *adev); void amdgpu_ras_suspend(struct amdgpu_device *adev); -void amdgpu_ras_query_error_count(struct amdgpu_device *adev, - unsigned long *ce_count, - unsigned long *ue_count); +int amdgpu_ras_query_error_count(struct amdgpu_device *adev, + unsigned long *ce_count, + unsigned long *ue_count); /* error handling functions */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, -- cgit