[PATCH] zram: remove global tb_lock by using lock-free CAS

From: Weijie Yang
Date: Mon May 05 2014 - 00:02:42 EST


Currently, we use the rwlock tb_lock to protect concurrent access to the
whole zram meta table. However, in the actual access pattern there is
only a small chance that upper-layer users touch the same table[index]
concurrently, so the current lock granularity is too coarse.

This patch adds an atomic state field to every table[index] to track
whether the entry is in use. Taking the state with a CAS operation still
serializes concurrent access to the same table[index], while accesses to
different entries can proceed in parallel, allowing maximum concurrency.
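
For reference, the per-entry locking boils down to the sketch below. The
helpers zram_entry_lock()/zram_entry_unlock() are hypothetical names used
only for illustration; the patch open-codes this loop at each call site:

        /* Spin until this entry's state flips from IDLE to ACCESS. */
        static inline void zram_entry_lock(struct zram_meta *meta, u32 index)
        {
                while (atomic_cmpxchg(&meta->table[index].state,
                                      IDLE, ACCESS) != IDLE)
                        cpu_relax();
        }

        /* Return the entry to IDLE so other users can claim it. */
        static inline void zram_entry_unlock(struct zram_meta *meta, u32 index)
        {
                atomic_set(&meta->table[index].state, IDLE);
        }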

On 64-bit systems this does not increase the meta table memory overhead,
and on 32-bit systems with a 4K page size it adds about 1MB of overhead
per 1GB of zram. So it is cost-efficient.
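
To spell out the arithmetic: 1GB of zram with 4K pages means
1GB / 4KB = 262,144 table entries, and one extra 4-byte atomic_t per
entry is 262,144 * 4B = 1MB on 32-bit. On typical 64-bit ABIs the
unsigned long handle already forces 8-byte alignment of struct table,
so the new atomic_t fits into padding that was previously wasted.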

Test results:
(x86-64 Intel Core2 Q8400, 4GB system memory, Ubuntu 12.04,
kernel v3.15.0-rc3, zram 1GB with max_comp_streams=4 and LZO;
each figure below is the average of 5 runs, in KB/s)

iozone -t 4 -R -r 16K -s 200M -I +Z

Test               base          lock-free      ratio
------------------------------------------------------
Initial write       1348017.60    1424141.62    +5.6%
Rewrite             1520189.16    1652504.81    +8.7%
Read                8294445.45   11404668.35   +37.5%
Re-read             8134448.83   11555483.75   +42.1%
Reverse Read        6748717.97    8394478.17   +24.4%
Stride read         7220276.66    9372229.95   +29.8%
Random read         7133010.06    9187221.90   +28.8%
Mixed workload      4056980.71    5843370.85   +44.0%
Random write        1470106.17    1608947.04    +9.4%
Pwrite              1259493.72    1311055.32    +4.1%
Pread               4247583.17    4652056.11    +9.5%

Signed-off-by: Weijie Yang <weijie.yang@xxxxxxxxxxx>
---

This patch is based on the linux-next tree, commit b5c8d48bf8f42

drivers/block/zram/zram_drv.c | 41 ++++++++++++++++++++++++++---------------
drivers/block/zram/zram_drv.h | 5 ++++-
2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 48eccb3..8b70945
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -255,7 +255,6 @@ static struct zram_meta *zram_meta_alloc(u64 disksize)
                 goto free_table;
         }

-        rwlock_init(&meta->tb_lock);
         return meta;

 free_table:
@@ -339,12 +338,14 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
         unsigned long handle;
         u16 size;

-        read_lock(&meta->tb_lock);
+        while (atomic_cmpxchg(&meta->table[index].state, IDLE, ACCESS) != IDLE)
+                cpu_relax();
+
         handle = meta->table[index].handle;
         size = meta->table[index].size;

         if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
-                read_unlock(&meta->tb_lock);
+                atomic_set(&meta->table[index].state, IDLE);
                 clear_page(mem);
                 return 0;
         }
@@ -355,7 +356,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
         else
                 ret = zcomp_decompress(zram->comp, cmem, size, mem);
         zs_unmap_object(meta->mem_pool, handle);
-        read_unlock(&meta->tb_lock);
+        atomic_set(&meta->table[index].state, IDLE);

         /* Should NEVER happen. Return bio error if it does. */
         if (unlikely(ret)) {
@@ -376,14 +377,16 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
         struct zram_meta *meta = zram->meta;
         page = bvec->bv_page;

-        read_lock(&meta->tb_lock);
+        while (atomic_cmpxchg(&meta->table[index].state, IDLE, ACCESS) != IDLE)
+                cpu_relax();
+
         if (unlikely(!meta->table[index].handle) ||
                         zram_test_flag(meta, index, ZRAM_ZERO)) {
-                read_unlock(&meta->tb_lock);
+                atomic_set(&meta->table[index].state, IDLE);
                 handle_zero_page(bvec);
                 return 0;
         }
-        read_unlock(&meta->tb_lock);
+        atomic_set(&meta->table[index].state, IDLE);

         if (is_partial_io(bvec))
                 /* Use a temporary buffer to decompress the page */
@@ -461,10 +464,13 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
         if (page_zero_filled(uncmem)) {
                 kunmap_atomic(user_mem);
                 /* Free memory associated with this sector now. */
-                write_lock(&zram->meta->tb_lock);
+                while (atomic_cmpxchg(&meta->table[index].state,
+                                IDLE, ACCESS) != IDLE)
+                        cpu_relax();
+
                 zram_free_page(zram, index);
                 zram_set_flag(meta, index, ZRAM_ZERO);
-                write_unlock(&zram->meta->tb_lock);
+                atomic_set(&meta->table[index].state, IDLE);

                 atomic64_inc(&zram->stats.zero_pages);
                 ret = 0;
@@ -514,12 +520,13 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
          * Free memory associated with this sector
          * before overwriting unused sectors.
          */
-        write_lock(&zram->meta->tb_lock);
+        while (atomic_cmpxchg(&meta->table[index].state, IDLE, ACCESS) != IDLE)
+                cpu_relax();
         zram_free_page(zram, index);

         meta->table[index].handle = handle;
         meta->table[index].size = clen;
-        write_unlock(&zram->meta->tb_lock);
+        atomic_set(&meta->table[index].state, IDLE);

         /* Update stats */
         atomic64_add(clen, &zram->stats.compr_data_size);
@@ -560,6 +567,7 @@ static void zram_bio_discard(struct zram *zram, u32 index,
                              int offset, struct bio *bio)
 {
         size_t n = bio->bi_iter.bi_size;
+        struct zram_meta *meta = zram->meta;

         /*
          * zram manages data in physical block size units. Because logical block
@@ -584,9 +592,11 @@ static void zram_bio_discard(struct zram *zram, u32 index,
                  * Discard request can be large so the lock hold times could be
                  * lengthy. So take the lock once per page.
                  */
-                write_lock(&zram->meta->tb_lock);
+                while (atomic_cmpxchg(&meta->table[index].state,
+                                IDLE, ACCESS) != IDLE)
+                        cpu_relax();
                 zram_free_page(zram, index);
-                write_unlock(&zram->meta->tb_lock);
+                atomic_set(&meta->table[index].state, IDLE);
                 index++;
                 n -= PAGE_SIZE;
         }
@@ -804,9 +814,10 @@ static void zram_slot_free_notify(struct block_device *bdev,
         zram = bdev->bd_disk->private_data;
         meta = zram->meta;

-        write_lock(&meta->tb_lock);
+        while (atomic_cmpxchg(&meta->table[index].state, IDLE, ACCESS) != IDLE)
+                cpu_relax();
         zram_free_page(zram, index);
-        write_unlock(&meta->tb_lock);
+        atomic_set(&meta->table[index].state, IDLE);
         atomic64_inc(&zram->stats.notify_free);
 }

diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 7f21c14..76b2bb5
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -61,9 +61,13 @@ enum zram_pageflags {

 /*-- Data structures */

+#define IDLE    0
+#define ACCESS  1
+
 /* Allocated for each disk page */
 struct table {
         unsigned long handle;
+        atomic_t state;
         u16 size;       /* object size (excluding header) */
         u8 flags;
 } __aligned(4);
@@ -81,7 +85,6 @@ struct zram_stats {
 };

 struct zram_meta {
-        rwlock_t tb_lock;       /* protect table */
         struct table *table;
         struct zs_pool *mem_pool;
 };
--
1.7.10.4

