Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

From: Michal Hocko
Date: Wed Apr 17 2019 - 04:45:07 EST


Hi,
I do not see http://lkml.kernel.org/r/1554348617-12897-1-git-send-email-huangzhaoyang@xxxxxxxxx
discussion reaching a conclusion to change the current workingset
implementation. Therefore, is there any reason to post a new version of
the patch? If yes, it would be really great to see a short summary about
how this version is different from the previous one and how all the
review feedback has been addressed.

On Wed 17-04-19 15:47:26, Zhaoyang Huang wrote:
> From: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>
>
> This patch introduce timestamp into workingset's entry and judge if the page
> is active or inactive via active_file/refault_ratio instead of refault distance.
>
> The original thought is coming from the logs we got from trace_printk in this
> patch, we can find about 1/5 of the file pages' refault are under the
> scenario[1],which will be counted as inactive as they have a long refault distance
> in between access. However, we can also know from the time information that the
> page refault quickly as comparing to the average refault time which is calculated
> by the number of active file and refault ratio. We want to save these kinds of
> pages from evicted earlier as it used to be. The refault ratio is the value
> which can reflect lru's average file access frequency and also can be deemed as a
> prediction of future.
>
> The patch is tested on an android system and reduce 30% of page faults, while
> 60% of the pages remain the original status as (refault_distance < active_file)
> indicates. Pages status got from ftrace during the test can refer to [2].
>
> [1]
> system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 pre_secs 97
> HwBinder:922 workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 101 pre_secs 99
>
> [2]
> WKST_ACT[0]: original--INACTIVE commit--ACTIVE
> WKST_ACT[1]: original--ACTIVE commit--ACTIVE
> WKST_INACT[0]: original--INACTIVE commit--INACTIVE
> WKST_INACT[1]: original--ACTIVE commit--INACTIVE
>
> Signed-off-by: Zhaoyang Huang <huangzhaoyang@xxxxxxxxx>
> ---
> include/linux/mmzone.h | 1 +
> mm/workingset.c | 120 +++++++++++++++++++++++++++++++++++++++++++++----
> 2 files changed, 112 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 32699b2..6f30673 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -240,6 +240,7 @@ struct lruvec {
> atomic_long_t inactive_age;
> /* Refaults at the time of last reclaim cycle */
> unsigned long refaults;
> + atomic_long_t refaults_ratio;
> #ifdef CONFIG_MEMCG
> struct pglist_data *pgdat;
> #endif
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 40ee02c..66c177b 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -160,6 +160,21 @@
> MEM_CGROUP_ID_SHIFT)
> #define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
>
> +#ifdef CONFIG_64BIT
> +#define EVICTION_SECS_POS_SHIFT 20
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#ifndef CONFIG_MEMCG
> +#define EVICTION_SECS_POS_SHIFT 12
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#define EVICTION_SECS_POS_SHIFT 0
> +#define EVICTION_SECS_SHRINK_SHIFT 0
> +#define NO_SECS_IN_WORKINGSET
> +#endif
> +#endif
> /*
> * Eviction timestamps need to be able to cover the full range of
> * actionable refaults. However, bits are tight in the radix tree
> @@ -169,10 +184,54 @@
> * evictions into coarser buckets by shaving off lower timestamp bits.
> */
> static unsigned int bucket_order __read_mostly;
> -
> +#ifdef NO_SECS_IN_WORKINGSET
> +static void pack_secs(unsigned long *peviction) { }
> +static unsigned int unpack_secs(unsigned long entry) {return 0; }
> +#else
> +/*
> + * Shrink the timestamp according to its value and store it together
> + * with the shrink size in the entry.
> + */
> +static void pack_secs(unsigned long *peviction)
> +{
> + unsigned int secs;
> + unsigned long eviction;
> + int order;
> + int secs_shrink_size;
> + struct timespec ts;
> +
> + get_monotonic_boottime(&ts);
> + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
> + order = get_count_order(secs);
> + secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
> + ? 0 : (order - EVICTION_SECS_POS_SHIFT);
> +
> + eviction = *peviction;
> + eviction = (eviction << EVICTION_SECS_POS_SHIFT)
> + | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
> + eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf);
> + *peviction = eviction;
> +}
> +/*
> + * Unpack the second from the entry and restore the value according to the
> + * shrink size.
> + */
> +static unsigned int unpack_secs(unsigned long entry)
> +{
> + unsigned int secs;
> + int secs_shrink_size;
> +
> + secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
> + entry >>= EVICTION_SECS_SHRINK_SHIFT;
> + secs = entry & EVICTION_SECS_POS_MASK;
> + secs = secs << secs_shrink_size;
> + return secs;
> +}
> +#endif
> static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
> {
> eviction >>= bucket_order;
> + pack_secs(&eviction);
> eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
> eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
> eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
> @@ -181,20 +240,24 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
> }
>
> static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
> - unsigned long *evictionp)
> + unsigned long *evictionp, unsigned int *prev_secs)
> {
> unsigned long entry = (unsigned long)shadow;
> int memcgid, nid;
> + unsigned int secs;
>
> entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
> nid = entry & ((1UL << NODES_SHIFT) - 1);
> entry >>= NODES_SHIFT;
> memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
> entry >>= MEM_CGROUP_ID_SHIFT;
> + secs = unpack_secs(entry);
> + entry >>= (EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT);
>
> *memcgidp = memcgid;
> *pgdat = NODE_DATA(nid);
> *evictionp = entry << bucket_order;
> + *prev_secs = secs;
> }
>
> /**
> @@ -242,9 +305,22 @@ bool workingset_refault(void *shadow)
> unsigned long refault;
> struct pglist_data *pgdat;
> int memcgid;
> +#ifndef NO_SECS_IN_WORKINGSET
> + unsigned long avg_refault_time;
> + unsigned long refault_time;
> + int tradition;
> + unsigned int prev_secs;
> + unsigned int secs;
> + unsigned long refaults_ratio;
> +#endif
> + struct timespec ts;
> + /*
> + convert jiffies to second
> + */
> + get_monotonic_boottime(&ts);
> + secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
>
> - unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
> -
> + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_secs);
> rcu_read_lock();
> /*
> * Look up the memcg associated with the stored ID. It might
> @@ -288,14 +364,37 @@ bool workingset_refault(void *shadow)
> * list is not a problem.
> */
> refault_distance = (refault - eviction) & EVICTION_MASK;
> -
> inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
> -
> - if (refault_distance <= active_file) {
> +#ifndef NO_SECS_IN_WORKINGSET
> + refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
> + atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
> + refault_time = secs - prev_secs;
> + avg_refault_time = active_file / refaults_ratio;
> + tradition = !!(refault_distance < active_file);
> + if (refault_time <= avg_refault_time) {
> +#else
> + if (refault_distance < active_file) {
> +#endif
> inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
> +#ifndef NO_SECS_IN_WORKINGSET
> + trace_printk("WKST_ACT[%d]:rft_dis %ld, act_file %ld \
> + rft_ratio %ld rft_time %ld avg_rft_time %ld \
> + refault %ld eviction %ld secs %d pre_secs %d\n",
> + tradition, refault_distance, active_file,
> + refaults_ratio, refault_time, avg_refault_time,
> + refault, eviction, secs, prev_secs);
> +#endif
> rcu_read_unlock();
> return true;
> }
> +#ifndef NO_SECS_IN_WORKINGSET
> + trace_printk("WKST_INACT[%d]:rft_dis %ld, act_file %ld \
> + rft_ratio %ld rft_time %ld avg_rft_time %ld \
> + refault %ld eviction %ld secs %d pre_secs %d\n",
> + tradition, refault_distance, active_file,
> + refaults_ratio, refault_time, avg_refault_time,
> + refault, eviction, secs, prev_secs);
> +#endif
> rcu_read_unlock();
> return false;
> }
> @@ -513,7 +612,9 @@ static int __init workingset_init(void)
> unsigned int max_order;
> int ret;
>
> - BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
> + BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
> + + EVICTION_SECS_POS_SHIFT
> + + EVICTION_SECS_SHRINK_SHIFT));
> /*
> * Calculate the eviction bucket size to cover the longest
> * actionable refault distance, which is currently half of
> @@ -521,7 +622,8 @@ static int __init workingset_init(void)
> * some more pages at runtime, so keep working with up to
> * double the initial memory by using totalram_pages as-is.
> */
> - timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
> + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
> + - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
> max_order = fls_long(totalram_pages - 1);
> if (max_order > timestamp_bits)
> bucket_order = max_order - timestamp_bits;
> --
> 1.9.1

--
Michal Hocko
SUSE Labs