On Sun, Apr 18, 2021 at 08:00:31PM -0400, Waiman Long wrote:
Currently, the object stock structure caches either reclaimable vmstat
bytes or unreclaimable vmstat bytes, but not both. The hit rate can be
improved if both types of vmstat data can be cached, especially on
single-node systems.

This patch supports the caching of both types of vmstat data, though at
the expense of a slightly increased complexity in the caching code. For
large objects (>= PAGE_SIZE), the vmstat array is updated directly
without going through the stock caching step.
On a 2-socket Cascade Lake server with instrumentation enabled, the
call counts of __mod_objcg_state() and mod_objcg_state() and the
resulting miss rates (%age = __mod_objcg_state/mod_objcg_state) are
shown in the tables below.
Initial bootup:

  Kernel          __mod_objcg_state   mod_objcg_state    %age
  ------          -----------------   ---------------    ----
  Before patch           634400            3243830       19.6%
  After patch            419810            3182424       13.2%

Parallel kernel build:

  Kernel          __mod_objcg_state   mod_objcg_state    %age
  ------          -----------------   ---------------    ----
  Before patch         24329265          142512465       17.1%
  After patch          24051721          142445825       16.9%
There was a decrease in the miss rate after the initial system bootup.
However, the miss rate for the parallel kernel build remained about the
same, probably because most of the touched kmemcache objects were
reclaimable inodes and dentries.
Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
mm/memcontrol.c | 79 +++++++++++++++++++++++++++++++------------------
1 file changed, 51 insertions(+), 28 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c13502eab282..a6dd18f6d8a8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2212,8 +2212,8 @@ struct obj_stock {
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
- int vmstat_idx;
- int vmstat_bytes;
+ int reclaimable_bytes; /* NR_SLAB_RECLAIMABLE_B */
+ int unreclaimable_bytes; /* NR_SLAB_UNRECLAIMABLE_B */

How about

	int nr_slab_reclaimable_b;
	int nr_slab_unreclaimable_b;

so you don't need the comments?

#else
	int dummy[0];
#endif
@@ -3217,40 +3217,56 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
unsigned long flags;
- struct obj_stock *stock = get_obj_stock(&flags);
+ struct obj_stock *stock;
+ int *bytes, *alt_bytes, alt_idx;
+
+ /*
+ * Directly update vmstat array for big object.
+ */
+ if (unlikely(abs(nr) >= PAGE_SIZE))
+ goto update_vmstat;

This looks like an optimization independent of the vmstat item split?

It may not be that helpful. I am going to take it out in the next version.

+	stock = get_obj_stock(&flags);
+ if (idx == NR_SLAB_RECLAIMABLE_B) {
+ bytes = &stock->reclaimable_bytes;
+ alt_bytes = &stock->unreclaimable_bytes;
+ alt_idx = NR_SLAB_UNRECLAIMABLE_B;
+ } else {
+ bytes = &stock->unreclaimable_bytes;
+ alt_bytes = &stock->reclaimable_bytes;
+ alt_idx = NR_SLAB_RECLAIMABLE_B;
+ }
/*
- * Save vmstat data in stock and skip vmstat array update unless
- * accumulating over a page of vmstat data or when pgdat or idx
+ * Try to save vmstat data in stock and skip vmstat array update
+ * unless accumulating over a page of vmstat data or when pgdat
* changes.
*/
if (stock->cached_objcg != objcg) {
/* Output the current data as is */
- } else if (!stock->vmstat_bytes) {
- /* Save the current data */
- stock->vmstat_bytes = nr;
- stock->vmstat_idx = idx;
- stock->cached_pgdat = pgdat;
- nr = 0;
- } else if ((stock->cached_pgdat != pgdat) ||
- (stock->vmstat_idx != idx)) {
- /* Output the cached data & save the current data */
- swap(nr, stock->vmstat_bytes);
- swap(idx, stock->vmstat_idx);
+ } else if (stock->cached_pgdat != pgdat) {
+ /* Save the current data and output cached data, if any */
+ swap(nr, *bytes);
swap(pgdat, stock->cached_pgdat);
+ if (*alt_bytes) {
+ __mod_objcg_state(objcg, pgdat, alt_idx,
+ *alt_bytes);
+ *alt_bytes = 0;
+ }

As per the other email, I really don't think optimizing the pgdat
switch (in a percpu cache) is worth this level of complexity.

} else {
- stock->vmstat_bytes += nr;
- if (abs(stock->vmstat_bytes) > PAGE_SIZE) {
- nr = stock->vmstat_bytes;
- stock->vmstat_bytes = 0;
+ *bytes += nr;
+ if (abs(*bytes) > PAGE_SIZE) {
+ nr = *bytes;
+ *bytes = 0;
} else {
nr = 0;
}
}
- if (nr)
- __mod_objcg_state(objcg, pgdat, idx, nr);
-
put_obj_stock(flags);
+ if (!nr)
+ return;
+update_vmstat:
+ __mod_objcg_state(objcg, pgdat, idx, nr);
}
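
Purely for illustration (not code from this thread): the less clever
handling of a pgdat change argued for above would presumably just flush
whatever is cached to the old node and let the current update be written
out directly at the end of the function, roughly:

	} else if (stock->cached_pgdat != pgdat) {
		/* Flush the cached data to the old pgdat first */
		struct pglist_data *oldpg = stock->cached_pgdat;

		if (stock->reclaimable_bytes) {
			__mod_objcg_state(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
					  stock->reclaimable_bytes);
			stock->reclaimable_bytes = 0;
		}
		if (stock->unreclaimable_bytes) {
			__mod_objcg_state(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
					  stock->unreclaimable_bytes);
			stock->unreclaimable_bytes = 0;
		}
		stock->cached_pgdat = pgdat;
		/* nr is left as is and goes out via __mod_objcg_state() below */
	}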
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -3303,12 +3319,19 @@ static void drain_obj_stock(struct obj_stock *stock)
/*
* Flush the vmstat data in current stock
*/
- if (stock->vmstat_bytes) {
- __mod_objcg_state(old, stock->cached_pgdat, stock->vmstat_idx,
- stock->vmstat_bytes);
+ if (stock->reclaimable_bytes || stock->unreclaimable_bytes) {
+ int bytes;
+
+ if ((bytes = stock->reclaimable_bytes))
+ __mod_objcg_state(old, stock->cached_pgdat,
+ NR_SLAB_RECLAIMABLE_B, bytes);
+ if ((bytes = stock->unreclaimable_bytes))
+ __mod_objcg_state(old, stock->cached_pgdat,
+ NR_SLAB_UNRECLAIMABLE_B, bytes);

The int bytes indirection isn't necessary. It's easier to read even
with the extra lines required to repeat the long stock member names,
because that is quite a common pattern (if (stuff) frob(stuff)).

OK, I will eliminate the bytes variable.
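
For reference, dropping the local variable in the flush above would
presumably leave something like this (just a sketch of the
"if (stuff) frob(stuff)" form, not code from the thread):

	if (stock->reclaimable_bytes)
		__mod_objcg_state(old, stock->cached_pgdat,
				  NR_SLAB_RECLAIMABLE_B,
				  stock->reclaimable_bytes);
	if (stock->unreclaimable_bytes)
		__mod_objcg_state(old, stock->cached_pgdat,
				  NR_SLAB_UNRECLAIMABLE_B,
				  stock->unreclaimable_bytes);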

__mod_objcg_state() also each time does rcu_read_lock() toggling and a
memcg lookup that could be batched, which I think is further proof
that it should just be inlined here.

I am also thinking that eliminating the unnecessary
rcu_read_lock/rcu_read_unlock pairs may help performance a bit. However,
that will be done in another patch after I have done more performance
testing. I am not going to bother with that in this series.
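
As a sketch of the batching being suggested, an open-coded flush could
resolve the memcg and lruvec once and push both counters inside a single
RCU section. This assumes __mod_objcg_state() is essentially an
obj_cgroup_memcg()/mem_cgroup_lruvec() lookup plus
__mod_memcg_lruvec_state() under rcu_read_lock(); treat the exact
helpers as an assumption:

	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	/* reached only when there is cached vmstat data, as in the hunk above */
	rcu_read_lock();
	memcg = obj_cgroup_memcg(old);
	lruvec = mem_cgroup_lruvec(memcg, stock->cached_pgdat);
	if (stock->reclaimable_bytes)
		__mod_memcg_lruvec_state(lruvec, NR_SLAB_RECLAIMABLE_B,
					 stock->reclaimable_bytes);
	if (stock->unreclaimable_bytes)
		__mod_memcg_lruvec_state(lruvec, NR_SLAB_UNRECLAIMABLE_B,
					 stock->unreclaimable_bytes);
	rcu_read_unlock();

That would be one rcu_read_lock()/rcu_read_unlock() pair and one memcg
lookup for both items instead of one per item.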