[6/9] pohmelfs: distributed locking and cache coherency protocol.
From: Evgeniy Polyakov
Date: Fri Dec 26 2008 - 09:30:51 EST
POHMELFS utilizes writeback cache, which is built on top of MO(E)SI-like
coherency protocol. This patch includes its implementation and cache
object processing helpers (like allocation and completion callbacks).
POHMELFS uses scalable cached read/write locking. No additional requests
are performed if lock is granted to the filesystem. The same protocol
is used by the server to on-demand flushing of the client's cache (for
example when server wants to update local data).
Signed-off-by: Evgeniy Polyakov <zbr@xxxxxxxxxxx>
diff --git a/fs/pohmelfs/lock.c b/fs/pohmelfs/lock.c
new file mode 100644
index 0000000..47e7562
--- /dev/null
+++ b/fs/pohmelfs/lock.c
@@ -0,0 +1,186 @@
+/*
+ * 2007+ Copyright (c) Evgeniy Polyakov <zbr@xxxxxxxxxxx>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+
+#include "netfs.h"
+
+static int pohmelfs_send_lock_trans(struct pohmelfs_inode *pi,
+ u64 id, u64 start, u32 size, int type)
+{
+ struct inode *inode = &pi->vfs_inode;
+ struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb);
+ struct netfs_trans *t;
+ struct netfs_cmd *cmd;
+ int path_len, err;
+ void *data;
+ struct netfs_lock *l;
+ int isize = (type & POHMELFS_LOCK_GRAB) ? 0 : sizeof(struct netfs_inode_info);
+
+ mutex_lock(&psb->path_lock);
+ err = pohmelfs_path_length(pi);
+ mutex_unlock(&psb->path_lock);
+
+ if (err < 0)
+ goto err_out_exit;
+
+ path_len = err;
+
+ err = -ENOMEM;
+ t = netfs_trans_alloc(psb, path_len + sizeof(struct netfs_lock) + isize, 0, 0);
+ if (!t)
+ goto err_out_exit;
+
+ cmd = netfs_trans_current(t);
+ data = cmd + 1;
+ l = data + path_len;
+
+ cmd->cmd = NETFS_LOCK;
+ cmd->start = 0;
+ cmd->id = id;
+ cmd->size = sizeof(struct netfs_lock) + path_len + isize;
+ cmd->ext = path_len;
+ cmd->csize = 0;
+
+ netfs_convert_cmd(cmd);
+
+ mutex_lock(&psb->path_lock);
+ err = pohmelfs_construct_path_string(pi, cmd+1, path_len);
+ mutex_unlock(&psb->path_lock);
+ if (err < 0)
+ goto err_out_free;
+
+ l->start = start;
+ l->size = size;
+ l->type = type;
+ l->ino = pi->ino;
+
+ netfs_convert_lock(l);
+
+ if (isize) {
+ struct netfs_inode_info *info = (struct netfs_inode_info *)(l + 1);
+
+ info->mode = inode->i_mode;
+ info->nlink = inode->i_nlink;
+ info->uid = inode->i_uid;
+ info->gid = inode->i_gid;
+ info->blocks = inode->i_blocks;
+ info->rdev = inode->i_rdev;
+ info->size = inode->i_size;
+ info->version = inode->i_version;
+
+ netfs_convert_inode_info(info);
+ }
+
+ netfs_trans_update(cmd, t, path_len + sizeof(struct netfs_lock) + isize);
+
+ return netfs_trans_finish(t, psb);
+
+err_out_free:
+ netfs_trans_free(t);
+err_out_exit:
+ return err;
+}
+
+int pohmelfs_data_lock(struct pohmelfs_inode *pi, u64 start, u32 size, int type)
+{
+ struct pohmelfs_sb *psb = POHMELFS_SB(pi->vfs_inode.i_sb);
+ struct pohmelfs_mcache *m;
+ int err = -ENOMEM;
+ struct iattr iattr;
+ struct inode *inode = &pi->vfs_inode;
+
+ dprintk("%s: %p: ino: %llu, start: %llu, size: %u, "
+ "type: %d, locked as: %d, owned: %d.\n",
+ __func__, &pi->vfs_inode, pi->ino,
+ start, size, type, pi->lock_type,
+ !!test_bit(NETFS_INODE_OWNED, &pi->state));
+
+ if (test_bit(NETFS_INODE_OWNED, &pi->state) && (type == pi->lock_type))
+ return 0;
+
+ if ((type == POHMELFS_READ_LOCK) && (pi->lock_type == POHMELFS_WRITE_LOCK))
+ return 0;
+
+ clear_bit(NETFS_INODE_OWNED, &pi->state);
+ clear_bit(NETFS_INODE_REMOTE_SYNCED, &pi->state);
+
+ m = pohmelfs_mcache_alloc(psb, start, size, NULL);
+ if (IS_ERR(m))
+ return PTR_ERR(m);
+
+ err = pohmelfs_send_lock_trans(pi, m->gen, start, size,
+ type | POHMELFS_LOCK_GRAB);
+ if (err)
+ goto err_out_put;
+
+ err = wait_for_completion_timeout(&m->complete, psb->mcache_timeout);
+ if (err)
+ err = m->err;
+ else
+ err = -ETIMEDOUT;
+
+ dprintk("%s: %p: ino: %llu, mgen: %llu, start: %llu, size: %u, err: %d.\n",
+ __func__, &pi->vfs_inode, pi->ino, m->gen, start, size, err);
+
+ if (err && (err != -ENOENT))
+ goto err_out_put;
+
+ if (!err) {
+ netfs_convert_inode_info(&m->info);
+
+ iattr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE | ATTR_ATIME;
+ iattr.ia_mode = m->info.mode;
+ iattr.ia_uid = m->info.uid;
+ iattr.ia_gid = m->info.gid;
+ iattr.ia_size = m->info.size;
+ iattr.ia_atime = CURRENT_TIME;
+
+ err = pohmelfs_setattr_raw(inode, &iattr);
+ if (!err) {
+ struct dentry *dentry = d_find_alias(inode);
+ if (dentry) {
+ fsnotify_change(dentry, iattr.ia_valid);
+ dput(dentry);
+ }
+ }
+ }
+
+ pi->lock_type = type;
+ set_bit(NETFS_INODE_OWNED, &pi->state);
+
+ pohmelfs_mcache_put(psb, m);
+
+ return 0;
+
+err_out_put:
+ pohmelfs_mcache_put(psb, m);
+ return err;
+}
+
+int pohmelfs_data_unlock(struct pohmelfs_inode *pi, u64 start, u32 size, int type)
+{
+ dprintk("%s: %p: ino: %llu, start: %llu, size: %u, type: %d.\n",
+ __func__, &pi->vfs_inode, pi->ino, start, size, type);
+ pi->lock_type = 0;
+ clear_bit(NETFS_INODE_OWNED, &pi->state);
+ clear_bit(NETFS_INODE_REMOTE_SYNCED, &pi->state);
+ return pohmelfs_send_lock_trans(pi, pi->ino, start, size, type);
+}
diff --git a/fs/pohmelfs/mcache.c b/fs/pohmelfs/mcache.c
new file mode 100644
index 0000000..2e2d36e
--- /dev/null
+++ b/fs/pohmelfs/mcache.c
@@ -0,0 +1,171 @@
+/*
+ * 2007+ Copyright (c) Evgeniy Polyakov <zbr@xxxxxxxxxxx>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mempool.h>
+
+#include "netfs.h"
+
+static struct kmem_cache *pohmelfs_mcache_cache;
+static mempool_t *pohmelfs_mcache_pool;
+
+static inline int pohmelfs_mcache_cmp(u64 gen, u64 new)
+{
+ if (gen < new)
+ return 1;
+ if (gen > new)
+ return -1;
+ return 0;
+}
+
+struct pohmelfs_mcache *pohmelfs_mcache_search(struct pohmelfs_sb *psb, u64 gen)
+{
+ struct rb_root *root = &psb->mcache_root;
+ struct rb_node *n = root->rb_node;
+ struct pohmelfs_mcache *tmp, *ret = NULL;
+ int cmp;
+
+ while (n) {
+ tmp = rb_entry(n, struct pohmelfs_mcache, mcache_entry);
+
+ cmp = pohmelfs_mcache_cmp(tmp->gen, gen);
+ if (cmp < 0)
+ n = n->rb_left;
+ else if (cmp > 0)
+ n = n->rb_right;
+ else {
+ ret = tmp;
+ pohmelfs_mcache_get(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int pohmelfs_mcache_insert(struct pohmelfs_sb *psb, struct pohmelfs_mcache *m)
+{
+ struct rb_root *root = &psb->mcache_root;
+ struct rb_node **n = &root->rb_node, *parent = NULL;
+ struct pohmelfs_mcache *ret = NULL, *tmp;
+ int cmp;
+
+ while (*n) {
+ parent = *n;
+
+ tmp = rb_entry(parent, struct pohmelfs_mcache, mcache_entry);
+
+ cmp = pohmelfs_mcache_cmp(tmp->gen, m->gen);
+ if (cmp < 0)
+ n = &parent->rb_left;
+ else if (cmp > 0)
+ n = &parent->rb_right;
+ else {
+ ret = tmp;
+ break;
+ }
+ }
+
+ if (ret)
+ return -EEXIST;
+
+ rb_link_node(&m->mcache_entry, parent, n);
+ rb_insert_color(&m->mcache_entry, root);
+
+ return 0;
+}
+
+static int pohmelfs_mcache_remove(struct pohmelfs_sb *psb, struct pohmelfs_mcache *m)
+{
+ if (m && m->mcache_entry.rb_parent_color) {
+ rb_erase(&m->mcache_entry, &psb->mcache_root);
+ m->mcache_entry.rb_parent_color = 0;
+ return 1;
+ }
+ return 0;
+}
+
+void pohmelfs_mcache_remove_locked(struct pohmelfs_sb *psb, struct pohmelfs_mcache *m)
+{
+ mutex_lock(&psb->mcache_lock);
+ pohmelfs_mcache_remove(psb, m);
+ mutex_unlock(&psb->mcache_lock);
+}
+
+struct pohmelfs_mcache *pohmelfs_mcache_alloc(struct pohmelfs_sb *psb, u64 start,
+ unsigned int size, void *data)
+{
+ struct pohmelfs_mcache *m;
+ int err = -ENOMEM;
+
+ m = mempool_alloc(pohmelfs_mcache_pool, GFP_KERNEL);
+ if (!m)
+ goto err_out_exit;
+
+ init_completion(&m->complete);
+ m->err = 0;
+ atomic_set(&m->refcnt, 1);
+ m->data = data;
+ m->start = start;
+ m->size = size;
+ m->gen = atomic_long_inc_return(&psb->mcache_gen);
+
+ mutex_lock(&psb->mcache_lock);
+ err = pohmelfs_mcache_insert(psb, m);
+ mutex_unlock(&psb->mcache_lock);
+ if (err)
+ goto err_out_free;
+
+ return m;
+
+err_out_free:
+ mempool_free(m, pohmelfs_mcache_pool);
+err_out_exit:
+ return ERR_PTR(err);
+}
+
+void pohmelfs_mcache_free(struct pohmelfs_sb *psb, struct pohmelfs_mcache *m)
+{
+ pohmelfs_mcache_remove_locked(psb, m);
+
+ mempool_free(m, pohmelfs_mcache_pool);
+}
+
+int __init pohmelfs_mcache_init(void)
+{
+ pohmelfs_mcache_cache = kmem_cache_create("pohmelfs_mcache_cache",
+ sizeof(struct pohmelfs_mcache),
+ 0, (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), NULL);
+ if (!pohmelfs_mcache_cache)
+ goto err_out_exit;
+
+ pohmelfs_mcache_pool = mempool_create_slab_pool(256, pohmelfs_mcache_cache);
+ if (!pohmelfs_mcache_pool)
+ goto err_out_free;
+
+ return 0;
+
+err_out_free:
+ kmem_cache_destroy(pohmelfs_mcache_cache);
+err_out_exit:
+ return -ENOMEM;
+}
+
+void pohmelfs_mcache_exit(void)
+{
+ mempool_destroy(pohmelfs_mcache_pool);
+ kmem_cache_destroy(pohmelfs_mcache_cache);
+}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/