RE: [PATCH 1/4] drm/amd: Add detailed GFXOFF stats to debugfs
From: Quan, Evan
Date: Mon Jul 25 2022 - 06:27:21 EST
[AMD Official Use Only - General]
Using "uint64_t" instead of "uint32_t" for the entry counter may be better.
BR
Evan
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of
> André Almeida
> Sent: Saturday, July 23, 2022 4:34 AM
> To: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Koenig, Christian
> <Christian.Koenig@xxxxxxx>; Pan, Xinhui <Xinhui.Pan@xxxxxxx>; David
> Airlie <airlied@xxxxxxxx>; Daniel Vetter <daniel@xxxxxxxx>; Zhang, Hawking
> <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Kuehling,
> Felix <Felix.Kuehling@xxxxxxx>; Xiao, Jack <Jack.Xiao@xxxxxxx>; amd-
> gfx@xxxxxxxxxxxxxxxxxxxxx; dri-devel@xxxxxxxxxxxxxxxxxxxxx; linux-
> kernel@xxxxxxxxxxxxxxx; StDenis, Tom <Tom.StDenis@xxxxxxx>; Siqueira,
> Rodrigo <Rodrigo.Siqueira@xxxxxxx>
> Cc: André Almeida <andrealmeid@xxxxxxxxxx>; kernel-dev@xxxxxxxxxx
> Subject: [PATCH 1/4] drm/amd: Add detailed GFXOFF stats to debugfs
>
> Add debugfs interface to log GFXOFF statistics:
>
> - Read amdgpu_gfxoff_count to get the total GFXOFF entry count at the
> time of query since system power-up
>
> - Write 1 to amdgpu_gfxoff_residency to start logging, and 0 to stop.
> Read it to get average GFXOFF residency % multiplied by 100
> during the last logging interval.
>
> Both features are designed to keep the values persistent between
> suspends.
>
> Signed-off-by: André Almeida <andrealmeid@xxxxxxxxxx>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 168
> ++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 39 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 6 +
> drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 45 +++++
> drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 3 +
> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 34 +++-
> drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 22 +++
> drivers/gpu/drm/amd/pm/swsmu/smu_internal.h | 3 +
> 9 files changed, 321 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index e2eec985adb3..edf90a9ba980 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -1042,6 +1042,157 @@ static ssize_t amdgpu_debugfs_gpr_read(struct
> file *f, char __user *buf,
> return r;
> }
>
> +/**
> + * amdgpu_debugfs_gfxoff_residency_read - Read GFXOFF residency
> + *
> + * @f: open file handle
> + * @buf: User buffer to store read data in
> + * @size: Number of bytes to read
> + * @pos: Offset to seek to
> + *
> + * Read the last residency value logged. It doesn't auto update, one needs
> to
> + * stop logging before getting the current value.
> + */
> +static ssize_t amdgpu_debugfs_gfxoff_residency_read(struct file *f, char
> __user *buf,
> + size_t size, loff_t *pos)
> +{
> + struct amdgpu_device *adev = file_inode(f)->i_private;
> + ssize_t result = 0;
> + int r;
> +
> + if (size & 0x3 || *pos & 0x3)
> + return -EINVAL;
> +
> + r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> + if (r < 0) {
> + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> + return r;
> + }
> +
> + while (size) {
> + uint32_t value;
> +
> + r = amdgpu_get_gfx_off_residency(adev, &value);
> + if (r)
> + goto out;
> +
> + r = put_user(value, (uint32_t *)buf);
> + if (r)
> + goto out;
> +
> + result += 4;
> + buf += 4;
> + *pos += 4;
> + size -= 4;
> + }
> +
> + r = result;
> +out:
> + pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
> + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> +
> + return r;
> +}
> +
> +/**
> + * amdgpu_debugfs_gfxoff_residency_write - Log GFXOFF Residency
> + *
> + * @f: open file handle
> + * @buf: User buffer to write data from
> + * @size: Number of bytes to write
> + * @pos: Offset to seek to
> + *
> + * Write a 32-bit non-zero to start logging; write a 32-bit zero to stop
> + */
> +static ssize_t amdgpu_debugfs_gfxoff_residency_write(struct file *f, const
> char __user *buf,
> + size_t size, loff_t *pos)
> +{
> + struct amdgpu_device *adev = file_inode(f)->i_private;
> + ssize_t result = 0;
> + int r;
> +
> + if (size & 0x3 || *pos & 0x3)
> + return -EINVAL;
> +
> + r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> + if (r < 0) {
> + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> + return r;
> + }
> +
> + while (size) {
> + u32 value;
> +
> + r = get_user(value, (uint32_t *)buf);
> + if (r)
> + goto out;
> +
> + amdgpu_set_gfx_off_residency(adev, value ? true : false);
> +
> + result += 4;
> + buf += 4;
> + *pos += 4;
> + size -= 4;
> + }
> +
> + r = result;
> +out:
> + pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
> + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> +
> + return r;
> +}
> +
> +
> +/**
> + * amdgpu_debugfs_gfxoff_count_read - Read GFXOFF entry count
> + *
> + * @f: open file handle
> + * @buf: User buffer to store read data in
> + * @size: Number of bytes to read
> + * @pos: Offset to seek to
> + */
> +static ssize_t amdgpu_debugfs_gfxoff_count_read(struct file *f, char
> __user *buf,
> + size_t size, loff_t *pos)
> +{
> + struct amdgpu_device *adev = file_inode(f)->i_private;
> + ssize_t result = 0;
> + int r;
> +
> + if (size & 0x3 || *pos & 0x3)
> + return -EINVAL;
> +
> + r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> + if (r < 0) {
> + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> + return r;
> + }
> +
> + while (size) {
> + u32 value;
> +
> + r = amdgpu_get_gfx_off_entrycount(adev, &value);
> + if (r)
> + goto out;
> +
> + r = put_user(value, (uint32_t *)buf);
> + if (r)
> + goto out;
> +
> + result += 4;
> + buf += 4;
> + *pos += 4;
> + size -= 4;
> + }
> +
> + r = result;
> +out:
> + pm_runtime_mark_last_busy(adev_to_drm(adev)->dev);
> + pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
> +
> + return r;
> +}
> +
> /**
> * amdgpu_debugfs_gfxoff_write - Enable/disable GFXOFF
> *
> @@ -1249,6 +1400,19 @@ static const struct file_operations
> amdgpu_debugfs_gfxoff_status_fops = {
> .llseek = default_llseek
> };
>
> +static const struct file_operations amdgpu_debugfs_gfxoff_count_fops = {
> + .owner = THIS_MODULE,
> + .read = amdgpu_debugfs_gfxoff_count_read,
> + .llseek = default_llseek
> +};
> +
> +static const struct file_operations amdgpu_debugfs_gfxoff_residency_fops
> = {
> + .owner = THIS_MODULE,
> + .read = amdgpu_debugfs_gfxoff_residency_read,
> + .write = amdgpu_debugfs_gfxoff_residency_write,
> + .llseek = default_llseek
> +};
> +
> static const struct file_operations *debugfs_regs[] = {
> &amdgpu_debugfs_regs_fops,
> &amdgpu_debugfs_regs2_fops,
> @@ -1261,6 +1425,8 @@ static const struct file_operations *debugfs_regs[]
> = {
> &amdgpu_debugfs_gpr_fops,
> &amdgpu_debugfs_gfxoff_fops,
> &amdgpu_debugfs_gfxoff_status_fops,
> + &amdgpu_debugfs_gfxoff_count_fops,
> + &amdgpu_debugfs_gfxoff_residency_fops,
> };
>
> static const char *debugfs_regs_names[] = {
> @@ -1275,6 +1441,8 @@ static const char *debugfs_regs_names[] = {
> "amdgpu_gpr",
> "amdgpu_gfxoff",
> "amdgpu_gfxoff_status",
> + "amdgpu_gfxoff_count",
> + "amdgpu_gfxoff_residency",
> };
>
> /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index b79ee4ffb879..15a95bc2c211 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3576,6 +3576,8 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
> INIT_WORK(&adev->xgmi_reset_work,
> amdgpu_device_xgmi_reset_func);
>
> adev->gfx.gfx_off_req_count = 1;
> + adev->gfx.gfx_off_residency = 0;
> + adev->gfx.gfx_off_entrycount = 0;
> adev->pm.ac_power = power_supply_is_system_supplied() > 0;
>
> atomic_set(&adev->throttling_logging_enabled, 1);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 222d3d7ea076..3675c1b899db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -610,6 +610,45 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device
> *adev, bool enable)
> mutex_unlock(&adev->gfx.gfx_off_mutex);
> }
>
> +int amdgpu_set_gfx_off_residency(struct amdgpu_device *adev, bool
> value)
> +{
> + int r = 0;
> +
> + mutex_lock(&adev->gfx.gfx_off_mutex);
> +
> + r = amdgpu_dpm_set_residency_gfxoff(adev, value);
> +
> + mutex_unlock(&adev->gfx.gfx_off_mutex);
> +
> + return r;
> +}
> +
> +int amdgpu_get_gfx_off_residency(struct amdgpu_device *adev, u32
> *value)
> +{
> + int r = 0;
> +
> + mutex_lock(&adev->gfx.gfx_off_mutex);
> +
> + r = amdgpu_dpm_get_residency_gfxoff(adev, value);
> +
> + mutex_unlock(&adev->gfx.gfx_off_mutex);
> +
> + return r;
> +}
> +
> +int amdgpu_get_gfx_off_entrycount(struct amdgpu_device *adev, u32
> *value)
> +{
> + int r = 0;
> +
> + mutex_lock(&adev->gfx.gfx_off_mutex);
> +
> + r = amdgpu_dpm_get_entrycount_gfxoff(adev, value);
> +
> + mutex_unlock(&adev->gfx.gfx_off_mutex);
> +
> + return r;
> +}
> +
> int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t
> *value)
> {
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 23a696d38390..f06e979e2565 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -336,6 +336,8 @@ struct amdgpu_gfx {
> struct mutex gfx_off_mutex;
> uint32_t gfx_off_req_count; /* default 1, enable gfx off:
> dec 1, disable gfx off: add 1 */
> struct delayed_work gfx_off_delay_work;
> + uint32_t gfx_off_residency;
> + uint32_t gfx_off_entrycount;
>
> /* pipe reservation */
> struct mutex pipe_reserve_mutex;
> @@ -407,6 +409,10 @@ bool amdgpu_gfx_is_me_queue_enabled(struct
> amdgpu_device *adev, int me,
> void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable);
> int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, uint32_t
> *value);
> int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct
> ras_common_if *ras_block);
> +void amdgpu_gfx_ras_fini(struct amdgpu_device *adev);
> +int amdgpu_get_gfx_off_entrycount(struct amdgpu_device *adev, u32
> *value);
> +int amdgpu_get_gfx_off_residency(struct amdgpu_device *adev, u32
> *residency);
> +int amdgpu_set_gfx_off_residency(struct amdgpu_device *adev, bool
> value);
> int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
> void *err_data,
> struct amdgpu_iv_entry *entry);
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> index 956b6ce81c84..df87d0768fd7 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> @@ -668,6 +668,51 @@ int amdgpu_dpm_wait_for_event(struct
> amdgpu_device *adev,
> return ret;
> }
>
> +int amdgpu_dpm_set_residency_gfxoff(struct amdgpu_device *adev, bool
> value)
> +{
> + struct smu_context *smu = adev->powerplay.pp_handle;
> + int ret = 0;
> +
> + if (!is_support_sw_smu(adev))
> + return -EOPNOTSUPP;
> +
> + mutex_lock(&adev->pm.mutex);
> + ret = smu_set_residency_gfxoff(smu, value);
> + mutex_unlock(&adev->pm.mutex);
> +
> + return ret;
> +}
> +
> +int amdgpu_dpm_get_residency_gfxoff(struct amdgpu_device *adev, u32
> *value)
> +{
> + struct smu_context *smu = adev->powerplay.pp_handle;
> + int ret = 0;
> +
> + if (!is_support_sw_smu(adev))
> + return -EOPNOTSUPP;
> +
> + mutex_lock(&adev->pm.mutex);
> + ret = smu_get_residency_gfxoff(smu, value);
> + mutex_unlock(&adev->pm.mutex);
> +
> + return ret;
> +}
> +
> +int amdgpu_dpm_get_entrycount_gfxoff(struct amdgpu_device *adev,
> u32 *value)
> +{
> + struct smu_context *smu = adev->powerplay.pp_handle;
> + int ret = 0;
> +
> + if (!is_support_sw_smu(adev))
> + return -EOPNOTSUPP;
> +
> + mutex_lock(&adev->pm.mutex);
> + ret = smu_get_entrycount_gfxoff(smu, value);
> + mutex_unlock(&adev->pm.mutex);
> +
> + return ret;
> +}
> +
> int amdgpu_dpm_get_status_gfxoff(struct amdgpu_device *adev, uint32_t
> *value)
> {
> struct smu_context *smu = adev->powerplay.pp_handle;
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> index 65624d091ed2..83a83e93037c 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> @@ -435,6 +435,9 @@ int amdgpu_dpm_set_soft_freq_range(struct
> amdgpu_device *adev,
> int amdgpu_dpm_write_watermarks_table(struct amdgpu_device *adev);
> int amdgpu_dpm_wait_for_event(struct amdgpu_device *adev, enum
> smu_event_type event,
> uint64_t event_arg);
> +int amdgpu_dpm_get_residency_gfxoff(struct amdgpu_device *adev, u32
> *value);
> +int amdgpu_dpm_set_residency_gfxoff(struct amdgpu_device *adev, bool
> value);
> +int amdgpu_dpm_get_entrycount_gfxoff(struct amdgpu_device *adev,
> u32 *value);
> int amdgpu_dpm_get_status_gfxoff(struct amdgpu_device *adev, uint32_t
> *value);
> uint64_t amdgpu_dpm_get_thermal_throttling_counter(struct
> amdgpu_device *adev);
> void amdgpu_dpm_gfx_state_change(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index fd79b213fab4..cfc3b9d749bf 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -90,6 +90,30 @@ static int smu_sys_set_pp_feature_mask(void *handle,
> return smu_set_pp_feature_mask(smu, new_mask);
> }
>
> +int smu_set_residency_gfxoff(struct smu_context *smu, bool value)
> +{
> + if (!smu->ppt_funcs->set_gfx_off_residency)
> + return -EINVAL;
> +
> + return smu_set_gfx_off_residency(smu, value);
> +}
> +
> +int smu_get_residency_gfxoff(struct smu_context *smu, u32 *value)
> +{
> + if (!smu->ppt_funcs->get_gfx_off_residency)
> + return -EINVAL;
> +
> + return smu_get_gfx_off_residency(smu, value);
> +}
> +
> +int smu_get_entrycount_gfxoff(struct smu_context *smu, u32 *value)
> +{
> + if (!smu->ppt_funcs->get_gfx_off_entrycount)
> + return -EINVAL;
> +
> + return smu_get_gfx_off_entrycount(smu, value);
> +}
> +
> int smu_get_status_gfxoff(struct smu_context *smu, uint32_t *value)
> {
> if (!smu->ppt_funcs->get_gfx_off_status)
> @@ -1573,7 +1597,7 @@ static int smu_suspend(void *handle)
> {
> struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> struct smu_context *smu = adev->powerplay.pp_handle;
> - int ret;
> + int ret, count;
>
> if (amdgpu_sriov_vf(adev)&& !amdgpu_sriov_is_pp_one_vf(adev))
> return 0;
> @@ -1591,6 +1615,14 @@ static int smu_suspend(void *handle)
>
> smu_set_gfx_cgpg(smu, false);
>
> + /*
> + * pwfw resets entrycount when device is suspended, so we save
> the
> + * last value to be used when we resume to keep it consistent
> + */
> + ret = smu_get_entrycount_gfxoff(smu, &count);
> + if (!ret)
> + adev->gfx.gfx_off_entrycount = count;
> +
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index b81c657c7386..9827075b768e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -1111,6 +1111,22 @@ struct pptable_funcs {
> */
> uint32_t (*get_gfx_off_status)(struct smu_context *smu);
>
> + /**
> + * @get_gfx_off_entrycount: total GFXOFF entry count at the time of
> + * query since system power-up
> + */
> + u32 (*get_gfx_off_entrycount)(struct smu_context *smu, uint32_t
> *entrycount);
> +
> + /**
> + * @set_gfx_off_residency: set 1 to start logging, 0 to stop logging
> + */
> + u32 (*set_gfx_off_residency)(struct smu_context *smu, bool start);
> +
> + /**
> + * @get_gfx_off_residency: Average GFXOFF residency % during the
> logging interval
> + */
> + u32 (*get_gfx_off_residency)(struct smu_context *smu, uint32_t
> *residency);
> +
> /**
> * @register_irq_handler: Register interupt request handlers.
> */
> @@ -1454,6 +1470,12 @@ int smu_set_ac_dc(struct smu_context *smu);
>
> int smu_allow_xgmi_power_down(struct smu_context *smu, bool en);
>
> +int smu_get_entrycount_gfxoff(struct smu_context *smu, u32 *value);
> +
> +int smu_get_residency_gfxoff(struct smu_context *smu, u32 *value);
> +
> +int smu_set_residency_gfxoff(struct smu_context *smu, bool value);
> +
> int smu_get_status_gfxoff(struct smu_context *smu, uint32_t *value);
>
> int smu_handle_passthrough_sbr(struct smu_context *smu, bool enable);
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h
> b/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h
> index 7469bbfce1fb..ceb13c838067 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_internal.h
> @@ -47,6 +47,9 @@
> #define smu_notify_memory_pool_location(smu)
> smu_ppt_funcs(notify_memory_pool_location, 0, smu)
> #define smu_gfx_off_control(smu, enable)
> smu_ppt_funcs(gfx_off_control, 0, smu, enable)
> #define smu_get_gfx_off_status(smu)
> smu_ppt_funcs(get_gfx_off_status, 0, smu)
> +#define smu_get_gfx_off_entrycount(smu, value)
> smu_ppt_funcs(get_gfx_off_entrycount, 0, smu,
> value)
> +#define smu_get_gfx_off_residency(smu, value)
> smu_ppt_funcs(get_gfx_off_residency, 0, smu,
> value)
> +#define smu_set_gfx_off_residency(smu, value)
> smu_ppt_funcs(set_gfx_off_residency, 0, smu,
> value)
> #define smu_set_last_dcef_min_deep_sleep_clk(smu)
> smu_ppt_funcs(set_last_dcef_min_deep_sleep_clk, 0, smu)
> #define smu_system_features_control(smu, en)
> smu_ppt_funcs(system_features_control, 0, smu, en)
> #define smu_init_max_sustainable_clocks(smu)
> smu_ppt_funcs(init_max_sustainable_clocks, 0, smu)
> --
> 2.37.1