[RFC 04/16] NOVA: Inode operations and structures

From: Steven Swanson
Date: Thu Aug 03 2017 - 03:49:00 EST


Nova maintains per-CPU inode tables, and inode numbers are striped across the
tables (i.e., inos 0, n, 2n,... on cpu 0; inos 1, n + 1, 2n + 1, ... on cpu 1).

The inodes themselves live in a set of linked lists (one per CPU) of 2MB
blocks. The last 8 bytes of each block point to the next block. Pointers to
the heads of these lists live in PMEM block INODE_TABLE0_START and are
replicated in PMEM block INODE_TABLE1_START. Additional space for inodes is
allocated on demand.

To allocate inodes, Nova maintains a per-CPU inuse_list in DRAM. Each list
holds an RB tree that tracks ranges of in-use (allocated) inode numbers.

Signed-off-by: Steven Swanson <swanson@xxxxxxxxxxx>
---
fs/nova/inode.c | 1467 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/nova/inode.h | 389 +++++++++++++++
2 files changed, 1856 insertions(+)
create mode 100644 fs/nova/inode.c
create mode 100644 fs/nova/inode.h

diff --git a/fs/nova/inode.c b/fs/nova/inode.c
new file mode 100644
index 000000000000..db001b7b5d4f
--- /dev/null
+++ b/fs/nova/inode.c
@@ -0,0 +1,1467 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * Inode methods (allocate/free/read/write).
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu <jix024@xxxxxxxxxxx>
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli <marco.stornelli@xxxxxxxxx>
+ * Copyright 2003 Sony Corporation
+ * Copyright 2003 Matsushita Electric Industrial Co., Ltd.
+ * 2003-2004 (c) MontaVista Software, Inc. , Steve Longerbeam
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/highuid.h>
+#include <linux/module.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/types.h>
+#include <linux/ratelimit.h>
+#include "nova.h"
+#include "inode.h"
+
+/* log2 of the block size, indexed by NOVA block type (2^12, 2^21, 2^30). */
+unsigned int blk_type_to_shift[NOVA_BLOCK_TYPE_MAX] = {12, 21, 30};
+/* Block size in bytes, indexed by NOVA block type; matches the shifts above. */
+uint32_t blk_type_to_size[NOVA_BLOCK_TYPE_MAX] = {0x1000, 0x200000, 0x40000000};
+
+/*
+ * Seed every per-CPU in-use inode tree with one range node covering the
+ * internal inode numbers [0, ceil(NOVA_NORMAL_INODE_START / cpus)], and set
+ * the in-use inode counter to NOVA_NORMAL_INODE_START.
+ *
+ * Returns 0 on success, -ENOMEM if a range node cannot be allocated, or the
+ * error from nova_insert_inodetree().  On failure the range nodes already
+ * inserted for earlier CPUs are removed and freed again.
+ */
+int nova_init_inode_inuse_list(struct super_block *sb)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_range_node *range_node;
+	struct inode_map *inode_map;
+	unsigned long range_high;
+	int i;
+	int ret;
+
+	sbi->s_inodes_used_count = NOVA_NORMAL_INODE_START;
+
+	/* Round up so that every reserved inode number is covered. */
+	range_high = NOVA_NORMAL_INODE_START / sbi->cpus;
+	if (NOVA_NORMAL_INODE_START % sbi->cpus)
+		range_high++;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		inode_map = &sbi->inode_maps[i];
+		range_node = nova_alloc_inode_node(sb);
+		if (range_node == NULL) {
+			ret = -ENOMEM;
+			goto unwind;
+		}
+
+		range_node->range_low = 0;
+		range_node->range_high = range_high;
+		nova_update_range_node_checksum(range_node);
+		ret = nova_insert_inodetree(sbi, range_node, i);
+		if (ret) {
+			nova_err(sb, "%s failed\n", __func__);
+			nova_free_inode_node(sb, range_node);
+			goto unwind;
+		}
+		inode_map->num_range_node_inode = 1;
+		inode_map->first_inode_range = range_node;
+	}
+
+	return 0;
+
+unwind:
+	/* Undo the CPUs [0, i) that were fully set up before the failure. */
+	while (--i >= 0) {
+		inode_map = &sbi->inode_maps[i];
+		range_node = inode_map->first_inode_range;
+		rb_erase(&range_node->node, &inode_map->inode_inuse_tree);
+		nova_free_inode_node(sb, range_node);
+		inode_map->first_inode_range = NULL;
+		/* assumes the map held no range nodes before this call */
+		inode_map->num_range_node_inode = 0;
+	}
+	return ret;
+}
+
+/*
+ * Allocate one zeroed log block for each CPU's inode table and record its
+ * offset in that table's log_head.
+ *
+ * @sih:     header describing the owner of the blocks (passed through to
+ *           nova_new_log_blocks()).
+ * @version: which copy of the inode table to set up; a non-zero version is
+ *           allocated from the tail, version 0 from the head.
+ *
+ * Returns 0 on success, -EINVAL if a table cannot be located, or -ENOSPC if
+ * a block cannot be allocated.
+ */
+static int nova_alloc_inode_table(struct super_block *sb,
+	struct nova_inode_info_header *sih, int version)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_table *inode_table;
+	unsigned long blocknr;
+	u64 block;
+	int allocated;
+	int i;
+
+	for (i = 0; i < sbi->cpus; i++) {
+		inode_table = nova_get_inode_table(sb, version, i);
+		if (!inode_table)
+			return -EINVAL;
+
+		/*
+		 * blocknr is read below (debug print and == 0 test) even when
+		 * the allocation fails, so it must not be left uninitialized.
+		 */
+		blocknr = 0;
+
+		/* Allocate replicate inodes from tail */
+		allocated = nova_new_log_blocks(sb, sih, &blocknr, 1,
+				ALLOC_INIT_ZERO, i,
+				version ? ALLOC_FROM_TAIL : ALLOC_FROM_HEAD);
+
+		nova_dbgv("%s: allocate log @ 0x%lx\n", __func__,
+				blocknr);
+		if (allocated != 1 || blocknr == 0)
+			return -ENOSPC;
+
+		block = nova_get_block_off(sb, blocknr, NOVA_BLOCK_TYPE_2M);
+		nova_memunlock_range(sb, inode_table, CACHELINE_SIZE);
+		inode_table->log_head = block;
+		nova_memlock_range(sb, inode_table, CACHELINE_SIZE);
+		nova_flush_buffer(inode_table, CACHELINE_SIZE, 0);
+	}
+
+	return 0;
+}
+
+/*
+ * Initialise the reserved inode NOVA_INODETABLE_INO and allocate the first
+ * block of every per-CPU inode table: one table copy normally, two when
+ * metadata_csum is set.
+ *
+ * Returns 0 on success or the error from nova_alloc_inode_table().
+ */
+int nova_init_inode_table(struct super_block *sb)
+{
+	struct nova_inode *pi = nova_get_inode_by_ino(sb, NOVA_INODETABLE_INO);
+	struct nova_inode_info_header sih;
+	int copies;
+	int version;
+	int err;
+
+	nova_memunlock_inode(sb, pi);
+	pi->i_mode = 0;
+	pi->i_uid = 0;
+	pi->i_gid = 0;
+	pi->i_links_count = cpu_to_le16(1);
+	pi->i_flags = 0;
+	pi->nova_ino = NOVA_INODETABLE_INO;
+
+	pi->i_blk_type = NOVA_BLOCK_TYPE_2M;
+	nova_memlock_inode(sb, pi);
+
+	/* Only the fields set here are filled in before sih is handed on. */
+	sih.ino = NOVA_INODETABLE_INO;
+	sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+
+	copies = metadata_csum ? 2 : 1;
+
+	for (version = 0; version < copies; version++) {
+		err = nova_alloc_inode_table(sb, &sih, version);
+		if (err)
+			return err;
+	}
+
+	PERSISTENT_BARRIER();
+	return 0;
+}
+
+/*
+ * Insert new_node into the in-use inode tree of the given CPU.
+ * Returns the result of nova_insert_range_node(); a non-zero result is
+ * also logged.
+ */
+inline int nova_insert_inodetree(struct nova_sb_info *sbi,
+	struct nova_range_node *new_node, int cpu)
+{
+	struct rb_root *root = &sbi->inode_maps[cpu].inode_inuse_tree;
+	int err = nova_insert_range_node(root, new_node);
+
+	if (err)
+		nova_dbg("ERROR: %s failed %d\n", __func__, err);
+
+	return err;
+}
+
+/*
+ * Look up the range node covering inode number ino.  The owning CPU is
+ * ino % cpus and the key searched for in that CPU's tree is ino / cpus.
+ * Returns whatever nova_find_range_node() returns; *ret_node is filled in
+ * by that helper.
+ */
+inline int nova_search_inodetree(struct nova_sb_info *sbi,
+	unsigned long ino, struct nova_range_node **ret_node)
+{
+	int owner = ino % sbi->cpus;
+	unsigned long key = ino / sbi->cpus;
+
+	return nova_find_range_node(sbi,
+			&sbi->inode_maps[owner].inode_inuse_tree,
+			key, ret_node);
+}
+
+/*
+ * Get the address in PMEM of an inode by inode number. Allocate additional
+ * block to store additional inodes if necessary.
+ *
+ * @ino:              inode number; numbers below NOVA_NORMAL_INODE_START are
+ *                    resolved via nova_get_reserved_inode_addr().
+ * @version:          which copy of the inode table to walk (non-zero copies
+ *                    are extended from the tail of the device).
+ * @pi_addr:          out: offset of the inode within PMEM.
+ * @extendable:       if non-zero, allocate a new block when the chain of
+ *                    table blocks ends before the inode's block is reached.
+ * @extend_alternate: if non-zero and metadata_csum is set, also extend table
+ *                    copy version + 1 whenever this copy was extended.
+ *
+ * Returns 0 on success, -EINVAL if the table or the required block does not
+ * exist (and may not be created), -ENOSPC or a negative allocator error if a
+ * new block cannot be allocated.
+ */
+int nova_get_inode_address(struct super_block *sb, u64 ino, int version,
+	u64 *pi_addr, int extendable, int extend_alternate)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_inode_info_header sih;
+	struct inode_table *inode_table;
+	unsigned int data_bits;
+	unsigned int num_inodes_bits;
+	u64 curr;
+	unsigned int superpage_count;
+	u64 alternate_pi_addr = 0;
+	u64 internal_ino;
+	int cpuid;
+	int extended = 0;
+	unsigned int index;
+	unsigned int i = 0;
+	unsigned long blocknr;
+	unsigned long curr_addr;
+	int allocated;
+
+	if (ino < NOVA_NORMAL_INODE_START) {
+		*pi_addr = nova_get_reserved_inode_addr(sb, ino);
+		return 0;
+	}
+
+	sih.ino = NOVA_INODETABLE_INO;
+	sih.i_blk_type = NOVA_BLOCK_TYPE_2M;
+	data_bits = blk_type_to_shift[sih.i_blk_type];
+	num_inodes_bits = data_bits - NOVA_INODE_BITS;
+
+	/* Inode numbers are striped across the per-CPU tables. */
+	cpuid = ino % sbi->cpus;
+	internal_ino = ino / sbi->cpus;
+
+	inode_table = nova_get_inode_table(sb, version, cpuid);
+	/* Same check as nova_alloc_inode_table(): the lookup can fail. */
+	if (!inode_table)
+		return -EINVAL;
+
+	/* Which block in the chain, and which slot inside that block. */
+	superpage_count = internal_ino >> num_inodes_bits;
+	index = internal_ino & ((1 << num_inodes_bits) - 1);
+
+	curr = inode_table->log_head;
+	if (curr == 0)
+		return -EINVAL;
+
+	for (i = 0; i < superpage_count; i++) {
+		if (curr == 0)
+			return -EINVAL;
+
+		curr_addr = (unsigned long)nova_get_block(sb, curr);
+		/* Next page pointer in the last 8 bytes of the superpage */
+		curr_addr += nova_inode_blk_size(&sih) - 8;
+		curr = *(u64 *)(curr_addr);
+
+		if (curr == 0) {
+			if (extendable == 0)
+				return -EINVAL;
+
+			extended = 1;
+
+			blocknr = 0;
+			allocated = nova_new_log_blocks(sb, &sih, &blocknr,
+				1, ALLOC_INIT_ZERO, cpuid,
+				version ? ALLOC_FROM_TAIL : ALLOC_FROM_HEAD);
+
+			/*
+			 * Never hand "0 blocks allocated" back to the caller:
+			 * 0 means success here and *pi_addr would be left
+			 * unset or bogus.
+			 */
+			if (allocated < 0)
+				return allocated;
+			if (allocated != 1 || blocknr == 0)
+				return -ENOSPC;
+
+			curr = nova_get_block_off(sb, blocknr,
+						NOVA_BLOCK_TYPE_2M);
+			/* Link the new block into the chain. */
+			nova_memunlock_range(sb, (void *)curr_addr,
+						CACHELINE_SIZE);
+			*(u64 *)(curr_addr) = curr;
+			nova_memlock_range(sb, (void *)curr_addr,
+						CACHELINE_SIZE);
+			/*
+			 * NOTE(review): only the 8-byte link was written but
+			 * NOVA_INODE_SIZE bytes are flushed - confirm the
+			 * intended length.
+			 */
+			nova_flush_buffer((void *)curr_addr,
+						NOVA_INODE_SIZE, 1);
+		}
+	}
+
+	/*
+	 * Extend alternate inode table.
+	 * NOTE(review): the result is ignored, so a failure to extend the
+	 * replica is not reported to the caller - confirm this is intended.
+	 */
+	if (extended && extend_alternate && metadata_csum)
+		nova_get_inode_address(sb, ino, version + 1,
+					&alternate_pi_addr, extendable, 0);
+
+	*pi_addr = curr + index * NOVA_INODE_SIZE;
+
+	return 0;
+}
+
+/*
+ * Resolve the PMEM offset of the replica copy of inode ino.
+ *
+ * With metadata_csum disabled an error is logged, *alter_pi_addr is left
+ * untouched and 0 is returned.  Otherwise returns 0 on success or the error
+ * from nova_get_inode_address() (table copy 1, no extension).
+ */
+int nova_get_alter_inode_address(struct super_block *sb, u64 ino,
+	u64 *alter_pi_addr)
+{
+	if (!metadata_csum) {
+		nova_err(sb, "Access alter inode when replica inode disabled\n");
+		return 0;
+	}
+
+	if (ino >= NOVA_NORMAL_INODE_START)
+		return nova_get_inode_address(sb, ino, 1, alter_pi_addr, 0, 0);
+
+	*alter_pi_addr = nova_get_alter_reserved_inode_addr(sb, ino);
+	return 0;
+}
+
+/*
+ * Remove write entries from sih->tree, starting at page offset
+ * start_blocknr and continuing until nova_find_next_entry() finds nothing
+ * further (or an entry fails checksum verification).
+ *
+ * @last_blocknr: only reported in the debug message; it does not bound the
+ *                walk.
+ * @delete_nvmm:  if true, each run of page offsets that map to the same
+ *                entry is handed to nova_free_old_entry().
+ * @delete_dead, @epoch_id: passed through to nova_free_old_entry().
+ *
+ * Returns the number of page offsets handed to nova_free_old_entry()
+ * (0 when delete_nvmm is false).
+ */
+int nova_delete_file_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, unsigned long start_blocknr,
+	unsigned long last_blocknr, bool delete_nvmm, bool delete_dead,
+	u64 epoch_id)
+{
+	/* Initialised: its value is read when entryc is seeded below. */
+	struct nova_file_write_entry *entry = NULL;
+	struct nova_file_write_entry *entryc, entry_copy;
+	struct nova_file_write_entry *old_entry = NULL;
+	unsigned long pgoff = start_blocknr;
+	unsigned long old_pgoff = 0;
+	unsigned int num_free = 0;
+	int freed = 0;
+	void *ret;
+	timing_t delete_time;
+
+	NOVA_START_TIMING(delete_file_tree_t, delete_time);
+
+	/* With checksums on, entries are verified into entry_copy. */
+	entryc = (metadata_csum == 0) ? entry : &entry_copy;
+
+	/* Handle EOF blocks */
+	do {
+		entry = radix_tree_lookup(&sih->tree, pgoff);
+		if (entry) {
+			ret = radix_tree_delete(&sih->tree, pgoff);
+			BUG_ON(!ret || ret != entry);
+			if (entry != old_entry) {
+				/* A new entry starts: flush the previous run. */
+				if (old_entry && delete_nvmm) {
+					nova_free_old_entry(sb, sih,
+							old_entry, old_pgoff,
+							num_free, delete_dead,
+							epoch_id);
+					freed += num_free;
+				}
+
+				old_entry = entry;
+				old_pgoff = pgoff;
+				num_free = 1;
+			} else {
+				num_free++;
+			}
+			pgoff++;
+		} else {
+			/* We are finding a hole. Jump to the next entry. */
+			entry = nova_find_next_entry(sb, sih, pgoff);
+			if (!entry)
+				break;
+
+			if (metadata_csum == 0)
+				entryc = entry;
+			else if (!nova_verify_entry_csum(sb, entry, entryc))
+				break;
+
+			/* Always advance by at least one page offset. */
+			pgoff++;
+			pgoff = pgoff > entryc->pgoff ? pgoff : entryc->pgoff;
+		}
+	} while (1);
+
+	/* Flush the final run. */
+	if (old_entry && delete_nvmm) {
+		nova_free_old_entry(sb, sih, old_entry, old_pgoff,
+					num_free, delete_dead, epoch_id);
+		freed += num_free;
+	}
+
+	nova_dbgv("Inode %lu: delete file tree from pgoff %lu to %lu, %d blocks freed\n",
+			sih->ino, start_blocknr, last_blocknr, freed);
+
+	NOVA_END_TIMING(delete_file_tree_t, delete_time);
+	return freed;
+}
+
+/*
+ * Drop the in-DRAM index of an inode without freeing anything in NVMM.
+ *
+ * Regular files: returns the value of nova_delete_file_tree() called with
+ * delete_nvmm = false.  Directories: deletes the directory tree and
+ * returns 1.  Any other file type: returns 0.
+ */
+static int nova_free_dram_resource(struct super_block *sb,
+	struct nova_inode_info_header *sih)
+{
+	unsigned long last;
+
+	if (S_ISREG(sih->i_mode)) {
+		last = nova_get_last_blocknr(sb, sih);
+		return nova_delete_file_tree(sb, sih, 0, last,
+						false, false, 0);
+	}
+
+	if (S_ISDIR(sih->i_mode)) {
+		nova_delete_dir_tree(sb, sih);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Clear NOVA_EOFBLOCKS_FL in the persistent inode when it is set and
+ * i_size plus one block exceeds the space described by sih->i_blocks.
+ * The inode checksum and the alternate inode are updated inside the same
+ * memunlock/memlock window.
+ */
+static inline void check_eof_blocks(struct super_block *sb,
+ struct nova_inode *pi, struct inode *inode,
+ struct nova_inode_info_header *sih)
+{
+ if ((pi->i_flags & cpu_to_le32(NOVA_EOFBLOCKS_FL)) &&
+ (inode->i_size + sb->s_blocksize) > (sih->i_blocks
+ << sb->s_blocksize_bits)) {
+ nova_memunlock_inode(sb, pi);
+ pi->i_flags &= cpu_to_le32(~NOVA_EOFBLOCKS_FL);
+ nova_update_inode_checksum(pi);
+ nova_update_alter_inode(sb, inode, pi);
+ nova_memlock_inode(sb, pi);
+ }
+}
+
+/*
+ * Release the data blocks of an inode that lie between byte offsets start
+ * and end.  The first block touched is the one at or after start (rounded
+ * up to the inode's block size); the last is the block containing end - 1.
+ * mtime and ctime are always updated, even when nothing is freed.
+ */
+static void nova_truncate_file_blocks(struct inode *inode, loff_t start,
+	loff_t end, u64 epoch_id)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+	unsigned long first, last;
+	int blocks_per_unit;
+	int released;
+
+	inode->i_mtime = inode->i_ctime = current_time(inode);
+
+	nova_dbg_verbose("truncate: pi %p iblocks %lx %llx %llx %llx\n", pi,
+			 sih->i_blocks, start, end, pi->i_size);
+
+	if (end == 0)
+		return;
+
+	first = (start + (1UL << data_bits) - 1) >> data_bits;
+	last = (end - 1) >> data_bits;
+	if (first > last)
+		return;
+
+	released = nova_delete_file_tree(sb, sih, first, last,
+					 true, false, epoch_id);
+
+	/* Convert freed data blocks into filesystem-block units. */
+	blocks_per_unit = 1 << (data_bits - sb->s_blocksize_bits);
+	inode->i_blocks -= (released * blocks_per_unit);
+	sih->i_blocks = inode->i_blocks;
+
+	/* Check for the flag EOFBLOCKS is still valid after the set size */
+	check_eof_blocks(sb, pi, inode, sih);
+}
+
+/* search the radix tree to find hole or data
+ * in the specified range
+ * Input:
+ * first_blocknr: first block in the specified range
+ * last_blocknr: last_blocknr in the specified range
+ * @data_found: indicates whether data blocks were found
+ * @hole_found: indicates whether a hole was found
+ * hole: whether we are looking for a hole or data
+ *
+ * Returns the number of page offsets stepped over while the condition
+ * "no hole found yet, or not looking for a hole" held.  The walk stops
+ * early when data is found and hole == 0, or when an entry fails checksum
+ * verification.
+ */
+static int nova_lookup_hole_in_range(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	unsigned long first_blocknr, unsigned long last_blocknr,
+	int *data_found, int *hole_found, int hole)
+{
+	/* Initialised: its value is read when entryc is seeded below. */
+	struct nova_file_write_entry *entry = NULL;
+	struct nova_file_write_entry *entryc, entry_copy;
+	unsigned long blocks = 0;
+	unsigned long pgoff, old_pgoff;
+
+	/* With checksums on, entries are verified into entry_copy. */
+	entryc = (metadata_csum == 0) ? entry : &entry_copy;
+
+	pgoff = first_blocknr;
+	while (pgoff <= last_blocknr) {
+		old_pgoff = pgoff;
+		entry = radix_tree_lookup(&sih->tree, pgoff);
+		if (entry) {
+			*data_found = 1;
+			if (!hole)
+				goto done;
+			pgoff++;
+		} else {
+			*hole_found = 1;
+			entry = nova_find_next_entry(sb, sih, pgoff);
+			pgoff++;
+			if (entry) {
+				if (metadata_csum == 0)
+					entryc = entry;
+				else if (!nova_verify_entry_csum(sb, entry,
+								entryc))
+					goto done;
+
+				/* Skip the hole, but never past the range. */
+				pgoff = pgoff > entryc->pgoff ?
+					pgoff : entryc->pgoff;
+				if (pgoff > last_blocknr)
+					pgoff = last_blocknr + 1;
+			}
+		}
+
+		if (!*hole_found || !hole)
+			blocks += pgoff - old_pgoff;
+	}
+done:
+	return blocks;
+}
+
+/*
+ * copy persistent state to struct inode
+ *
+ * Mode, block count and size are taken from the in-DRAM header (si->header);
+ * ownership, generation, flags, timestamps and link count are read from the
+ * persistent inode at pi_addr.
+ *
+ * Returns 0 on success, the error from nova_get_reference() if the
+ * persistent inode cannot be read, or -ESTALE if the inode is deleted.
+ * On failure the VFS inode is marked bad.
+ */
+static int nova_read_inode(struct super_block *sb, struct inode *inode,
+	u64 pi_addr)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode *pi, fake_pi;
+	struct nova_inode_info_header *sih = &si->header;
+	int ret;
+
+	ret = nova_get_reference(sb, pi_addr, &fake_pi,
+			(void **)&pi, sizeof(struct nova_inode));
+	if (ret) {
+		nova_dbg("%s: read pi @ 0x%llx failed\n",
+				__func__, pi_addr);
+		goto bad_inode;
+	}
+
+	inode->i_mode = sih->i_mode;
+	i_uid_write(inode, le32_to_cpu(pi->i_uid));
+	i_gid_write(inode, le32_to_cpu(pi->i_gid));
+	inode->i_generation = le32_to_cpu(pi->i_generation);
+	nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
+
+	/* check if the inode is active. */
+	if (inode->i_mode == 0 || pi->deleted == 1) {
+		/* this inode is deleted */
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
+	inode->i_blocks = sih->i_blocks;
+	inode->i_mapping->a_ops = &nova_aops_dax;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_op = &nova_file_inode_operations;
+		if (inplace_data_updates && wprotect == 0)
+			inode->i_fop = &nova_dax_file_operations;
+		else
+			inode->i_fop = &nova_wrap_file_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op = &nova_dir_inode_operations;
+		inode->i_fop = &nova_dir_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &nova_symlink_inode_operations;
+		break;
+	default:
+		inode->i_op = &nova_special_inode_operations;
+		init_special_inode(inode, inode->i_mode,
+				   le32_to_cpu(pi->dev.rdev));
+		break;
+	}
+
+	/*
+	 * Update size and time after rebuild the tree.
+	 * sih->i_size is a host-endian DRAM value (it is compared and shifted
+	 * directly elsewhere in this file), so no byte-order conversion.
+	 */
+	inode->i_size = sih->i_size;
+	inode->i_atime.tv_sec = (__s32)le32_to_cpu(pi->i_atime);
+	inode->i_ctime.tv_sec = (__s32)le32_to_cpu(pi->i_ctime);
+	inode->i_mtime.tv_sec = (__s32)le32_to_cpu(pi->i_mtime);
+	inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec =
+				 inode->i_ctime.tv_nsec = 0;
+	set_nlink(inode, le16_to_cpu(pi->i_links_count));
+	return 0;
+
+bad_inode:
+	make_bad_inode(inode);
+	return ret;
+}
+
+/*
+ * Mirror the VFS inode flags S_SYNC, S_APPEND, S_IMMUTABLE, S_NOATIME and
+ * S_DIRSYNC into the corresponding FS_*_FL bits of pi->i_flags.  All other
+ * bits of pi->i_flags are preserved.
+ */
+static void nova_get_inode_flags(struct inode *inode, struct nova_inode *pi)
+{
+	const unsigned int mirrored = FS_SYNC_FL | FS_APPEND_FL |
+			FS_IMMUTABLE_FL | FS_NOATIME_FL | FS_DIRSYNC_FL;
+	unsigned int vfs_flags = inode->i_flags;
+	unsigned int persisted = le32_to_cpu(pi->i_flags) & ~mirrored;
+
+	persisted |= (vfs_flags & S_SYNC) ? FS_SYNC_FL : 0;
+	persisted |= (vfs_flags & S_APPEND) ? FS_APPEND_FL : 0;
+	persisted |= (vfs_flags & S_IMMUTABLE) ? FS_IMMUTABLE_FL : 0;
+	persisted |= (vfs_flags & S_NOATIME) ? FS_NOATIME_FL : 0;
+	persisted |= (vfs_flags & S_DIRSYNC) ? FS_DIRSYNC_FL : 0;
+
+	pi->i_flags = cpu_to_le32(persisted);
+}
+
+/*
+ * Fill a persistent inode from a freshly set up VFS inode: ownership, mode,
+ * link count, size, timestamps (seconds only) and generation are copied,
+ * the log pointers and deletion state are zeroed, and the flag bits are
+ * synchronised via nova_get_inode_flags().  The device number is stored
+ * only for character and block devices.
+ */
+static void nova_init_inode(struct inode *inode, struct nova_inode *pi)
+{
+	/* A new inode starts with empty primary and alternate logs. */
+	pi->log_head = 0;
+	pi->log_tail = 0;
+	pi->alter_log_head = 0;
+	pi->alter_log_tail = 0;
+
+	/* ... and is neither deleted nor tied to a deletion epoch. */
+	pi->deleted = 0;
+	pi->delete_epoch_id = 0;
+
+	/* Attributes taken from the VFS inode. */
+	pi->i_mode = cpu_to_le16(inode->i_mode);
+	pi->i_links_count = cpu_to_le16(inode->i_nlink);
+	pi->i_uid = cpu_to_le32(i_uid_read(inode));
+	pi->i_gid = cpu_to_le32(i_gid_read(inode));
+	pi->i_size = cpu_to_le64(inode->i_size);
+	pi->i_generation = cpu_to_le32(inode->i_generation);
+
+	pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	pi->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	pi->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+
+	nova_get_inode_flags(inode, pi);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		pi->dev.rdev = cpu_to_le32(inode->i_rdev);
+}
+
+/*
+ * Pick the next free inode number on the given CPU's map by growing the
+ * first in-use range by one.  If that closes the gap to the following
+ * range, the two ranges are merged and the second node is freed.
+ *
+ * On success stores the global inode number (internal number * cpus +
+ * cpuid) in *ino and returns 0.  Returns -EIO if a range node fails its
+ * checksum, or -ENOSPC if the first range cannot grow.
+ */
+static int nova_alloc_unused_inode(struct super_block *sb, int cpuid,
+	unsigned long *ino)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_map *map = &sbi->inode_maps[cpuid];
+	struct nova_range_node *head = map->first_inode_range;
+	struct nova_range_node *succ = NULL;
+	struct rb_node *succ_rb;
+	/* Upper bound used when no range follows the first one. */
+	unsigned long ceiling = 1UL << 31;
+	unsigned long candidate;
+
+	NOVA_ASSERT(head);
+	if (!nova_range_node_checksum_ok(head)) {
+		nova_dbg("%s: first node failed\n", __func__);
+		return -EIO;
+	}
+
+	succ_rb = rb_next(&head->node);
+	if (succ_rb) {
+		succ = container_of(succ_rb, struct nova_range_node, node);
+		if (!nova_range_node_checksum_ok(succ)) {
+			nova_dbg("%s: second node failed\n", __func__);
+			return -EIO;
+		}
+		ceiling = succ->range_low;
+	}
+
+	candidate = head->range_high + 1;
+
+	if (succ && candidate == (ceiling - 1)) {
+		/* Fill the gap completely */
+		head->range_high = succ->range_high;
+		nova_update_range_node_checksum(head);
+		rb_erase(&succ->node, &map->inode_inuse_tree);
+		nova_free_inode_node(sb, succ);
+		map->num_range_node_inode--;
+	} else if (candidate < (ceiling - 1)) {
+		/* Aligns to left */
+		head->range_high = candidate;
+		nova_update_range_node_checksum(head);
+	} else {
+		nova_dbg("%s: ERROR: new ino %lu, next low %lu\n", __func__,
+			 candidate, ceiling);
+		return -ENOSPC;
+	}
+
+	*ino = candidate * sbi->cpus + cpuid;
+	sbi->s_inodes_used_count++;
+	map->allocated++;
+
+	nova_dbg_verbose("Alloc ino %lu\n", *ino);
+	return 0;
+}
+
+/*
+ * Remove inode number ino from its CPU's in-use tree.
+ *
+ * The covering range node is deleted, shrunk from either end, or split in
+ * two, depending on where the number falls inside it.  Takes and releases
+ * the map's inode_table_mutex.
+ *
+ * Returns 0 on success; -EINVAL if ino is not recorded as in use; -ENOMEM
+ * if a node needed for a split cannot be allocated; or the error from
+ * nova_insert_inodetree().  On any failure the tree and the counters are
+ * left as they were.
+ */
+static int nova_free_inuse_inode(struct super_block *sb, unsigned long ino)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_map *inode_map;
+	struct nova_range_node *i = NULL;
+	struct nova_range_node *curr_node;
+	int found = 0;
+	int cpuid = ino % sbi->cpus;
+	unsigned long internal_ino = ino / sbi->cpus;
+	unsigned long old_high;
+	int ret = 0;
+
+	nova_dbg_verbose("Free inuse ino: %lu\n", ino);
+	inode_map = &sbi->inode_maps[cpuid];
+
+	mutex_lock(&inode_map->inode_table_mutex);
+	found = nova_search_inodetree(sbi, ino, &i);
+	if (!found) {
+		nova_dbg("%s ERROR: ino %lu not found\n", __func__, ino);
+		mutex_unlock(&inode_map->inode_table_mutex);
+		return -EINVAL;
+	}
+
+	/* A node that does not actually cover the number is an error. */
+	if (internal_ino < i->range_low || internal_ino > i->range_high) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (i->range_low == i->range_high) {
+		/* fits entire node */
+		rb_erase(&i->node, &inode_map->inode_inuse_tree);
+		nova_free_inode_node(sb, i);
+		inode_map->num_range_node_inode--;
+	} else if (internal_ino == i->range_low) {
+		/* Aligns left */
+		i->range_low = internal_ino + 1;
+		nova_update_range_node_checksum(i);
+	} else if (internal_ino == i->range_high) {
+		/* Aligns right */
+		i->range_high = internal_ino - 1;
+		nova_update_range_node_checksum(i);
+	} else {
+		/* Aligns somewhere in the middle: split the range in two */
+		curr_node = nova_alloc_inode_node(sb);
+		NOVA_ASSERT(curr_node);
+		if (curr_node == NULL) {
+			/* Nothing was freed, so do not report success. */
+			ret = -ENOMEM;
+			goto err;
+		}
+		old_high = i->range_high;
+		curr_node->range_low = internal_ino + 1;
+		curr_node->range_high = old_high;
+		nova_update_range_node_checksum(curr_node);
+
+		i->range_high = internal_ino - 1;
+		nova_update_range_node_checksum(i);
+
+		ret = nova_insert_inodetree(sbi, curr_node, cpuid);
+		if (ret) {
+			/* Restore the range so the upper half is not lost. */
+			i->range_high = old_high;
+			nova_update_range_node_checksum(i);
+			nova_free_inode_node(sb, curr_node);
+			goto err;
+		}
+		inode_map->num_range_node_inode++;
+	}
+
+	sbi->s_inodes_used_count--;
+	inode_map->freed++;
+	mutex_unlock(&inode_map->inode_table_mutex);
+	return 0;
+
+err:
+	nova_error_mng(sb, "Unable to free inode %lu\n", ino);
+	nova_error_mng(sb, "Found inuse block %lu - %lu\n",
+			i->range_low, i->range_high);
+	mutex_unlock(&inode_map->inode_table_mutex);
+	return ret;
+}
+
+/*
+ * Free an inode's log, reset the cached fields of its in-DRAM header and
+ * release its inode number.  Returns the result of nova_free_inuse_inode().
+ */
+static int nova_free_inode(struct super_block *sb, struct nova_inode *pi,
+	struct nova_inode_info_header *sih)
+{
+	timing_t free_time;
+	int ret;
+
+	NOVA_START_TIMING(free_inode_t, free_time);
+
+	nova_free_inode_log(sb, pi, sih);
+
+	/* The header no longer describes a live inode. */
+	sih->pi_addr = 0;
+	sih->alter_pi_addr = 0;
+	sih->i_mode = 0;
+	sih->i_size = 0;
+	sih->i_blocks = 0;
+	sih->log_pages = 0;
+
+	ret = nova_free_inuse_inode(sb, pi->nova_ino);
+
+	NOVA_END_TIMING(free_inode_t, free_time);
+	return ret;
+}
+
+/*
+ * Obtain the VFS inode for inode number ino.  A cached inode is returned
+ * as is; a new one is populated by locating the persistent inode,
+ * rebuilding its in-DRAM state and copying the persistent fields.
+ *
+ * Returns the inode, or an ERR_PTR: -ENOMEM if no inode can be obtained,
+ * -EACCES if the persistent address resolves to 0, otherwise the error of
+ * the failing step.
+ */
+struct inode *nova_iget(struct super_block *sb, unsigned long ino)
+{
+	struct inode *vfs_inode;
+	u64 addr;
+	int ret;
+
+	vfs_inode = iget_locked(sb, ino);
+	if (unlikely(!vfs_inode))
+		return ERR_PTR(-ENOMEM);
+
+	/* Already initialised by an earlier lookup. */
+	if (!(vfs_inode->i_state & I_NEW))
+		return vfs_inode;
+
+	nova_dbgv("%s: inode %lu\n", __func__, ino);
+
+	ret = nova_get_inode_address(sb, ino, 0, &addr, 0, 0);
+	if (ret) {
+		nova_dbg("%s: get inode %lu address failed %d\n",
+			 __func__, ino, ret);
+		goto bail;
+	}
+
+	if (addr == 0) {
+		nova_dbg("%s: failed to get pi_addr for inode %lu\n",
+			 __func__, ino);
+		ret = -EACCES;
+		goto bail;
+	}
+
+	ret = nova_rebuild_inode(sb, NOVA_I(vfs_inode), ino, addr, 1);
+	if (ret) {
+		nova_dbg("%s: failed to rebuild inode %lu\n", __func__, ino);
+		goto bail;
+	}
+
+	ret = nova_read_inode(sb, vfs_inode, addr);
+	if (unlikely(ret)) {
+		nova_dbg("%s: failed to read inode %lu\n", __func__, ino);
+		goto bail;
+	}
+
+	vfs_inode->i_ino = ino;
+
+	unlock_new_inode(vfs_inode);
+	return vfs_inode;
+
+bail:
+	iget_failed(vfs_inode);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Return the index of the last block of a file: (i_size - 1) shifted by
+ * the inode's block-size shift, or 0 for an empty file.  If the persistent
+ * inode cannot be read, block type 0 is assumed.
+ */
+unsigned long nova_get_last_blocknr(struct super_block *sb,
+	struct nova_inode_info_header *sih)
+{
+	struct nova_inode *pi, fake_pi;
+	unsigned int btype = 0;
+	unsigned int shift;
+
+	if (nova_get_reference(sb, sih->pi_addr, &fake_pi,
+			(void **)&pi, sizeof(struct nova_inode)))
+		nova_dbg("%s: read pi @ 0x%lx failed\n",
+			 __func__, sih->pi_addr);
+	else
+		btype = sih->i_blk_type;
+
+	shift = blk_type_to_shift[btype];
+
+	if (sih->i_size == 0)
+		return 0;
+
+	return (sih->i_size - 1) >> shift;
+}
+
+/*
+ * Mark a persistent inode as deleted, release its per-file-type resources
+ * and finally free the inode itself.
+ *
+ * The inode is flagged deleted (and invalid) and its checksum updated first;
+ * with metadata_csum and a known alternate address the whole inode is also
+ * copied to the alternate location.  Returns the result of
+ * nova_free_inode().
+ */
+static int nova_free_inode_resource(struct super_block *sb,
+ struct nova_inode *pi, struct nova_inode_info_header *sih)
+{
+ unsigned long last_blocknr;
+ int ret = 0;
+ int freed = 0;
+ struct nova_inode *alter_pi;
+
+ nova_memunlock_inode(sb, pi);
+ pi->deleted = 1;
+
+ /* An inode being freed is not expected to still be marked valid. */
+ if (pi->valid) {
+ nova_dbg("%s: inode %lu still valid\n",
+ __func__, sih->ino);
+ pi->valid = 0;
+ }
+ nova_update_inode_checksum(pi);
+ /* Keep the alternate copy identical to the primary. */
+ if (metadata_csum && sih->alter_pi_addr) {
+ alter_pi = (struct nova_inode *)nova_get_block(sb,
+ sih->alter_pi_addr);
+ memcpy_to_pmem_nocache(alter_pi, pi, sizeof(struct nova_inode));
+ }
+ nova_memlock_inode(sb, pi);
+
+ /* We need the log to free the blocks from the b-tree */
+ switch (sih->i_mode & S_IFMT) {
+ case S_IFREG:
+ last_blocknr = nova_get_last_blocknr(sb, sih);
+ nova_dbgv("%s: file ino %lu\n", __func__, sih->ino);
+ freed = nova_delete_file_tree(sb, sih, 0,
+ last_blocknr, true, true, 0);
+ break;
+ case S_IFDIR:
+ nova_dbgv("%s: dir ino %lu\n", __func__, sih->ino);
+ nova_delete_dir_tree(sb, sih);
+ break;
+ case S_IFLNK:
+ /* Log will be freed later */
+ nova_dbgv("%s: symlink ino %lu\n",
+ __func__, sih->ino);
+ freed = nova_delete_file_tree(sb, sih, 0, 0,
+ true, true, 0);
+ break;
+ default:
+ nova_dbgv("%s: special ino %lu\n",
+ __func__, sih->ino);
+ break;
+ }
+
+ /* freed is only used for this debug message. */
+ nova_dbg_verbose("%s: Freed %d\n", __func__, freed);
+ /* Then we can free the inode */
+ ret = nova_free_inode(sb, pi, sih);
+ if (ret)
+ nova_err(sb, "%s: free inode %lu failed\n",
+ __func__, sih->ino);
+
+ return ret;
+}
+
+/*
+ * Evict a VFS inode.
+ *
+ * An unlinked, non-bad inode that is neither append-only nor immutable has
+ * its persistent resources freed via nova_free_inode_resource().  In every
+ * other case (including a failure of that call) only the in-DRAM state is
+ * dropped via nova_free_dram_resource().  The page cache is truncated and
+ * the inode cleared on all paths.
+ */
+void nova_evict_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct nova_inode *pi = nova_get_inode(sb, inode);
+ struct nova_inode_info *si = NOVA_I(inode);
+ struct nova_inode_info_header *sih = &si->header;
+ timing_t evict_time;
+ int destroy = 0;
+ int ret;
+
+ NOVA_START_TIMING(evict_inode_t, evict_time);
+ /*
+ * NOTE(review): sih is the address of a member embedded in si, so this
+ * check looks unreachable; if it ever fired, the "out" path would pass
+ * the NULL sih to nova_free_dram_resource() - confirm and clean up.
+ */
+ if (!sih) {
+ nova_err(sb, "%s: ino %lu sih is NULL!\n",
+ __func__, inode->i_ino);
+ NOVA_ASSERT(0);
+ goto out;
+ }
+
+ /*
+ * pi can be NULL if the file has already been deleted, but a handle
+ * remains.  A mismatch between the persistent and the VFS inode number
+ * is only reported; eviction continues.
+ */
+ if (pi && pi->nova_ino != inode->i_ino) {
+ nova_err(sb, "%s: inode %lu ino does not match: %llu\n",
+ __func__, inode->i_ino, pi->nova_ino);
+ nova_dbg("inode size %llu, pi addr 0x%lx, pi head 0x%llx, tail 0x%llx, mode %u\n",
+ inode->i_size, sih->pi_addr, sih->log_head,
+ sih->log_tail, pi->i_mode);
+ nova_dbg("sih: ino %lu, inode size %lu, mode %u, inode mode %u\n",
+ sih->ino, sih->i_size,
+ sih->i_mode, inode->i_mode);
+ nova_print_inode_log(sb, inode);
+ }
+
+ /* Check if this inode exists in at least one snapshot. */
+ if (pi && pi->valid == 0) {
+ ret = nova_append_inode_to_snapshot(sb, pi);
+ /* Return value 0 skips freeing the persistent inode below. */
+ if (ret == 0)
+ goto out;
+ }
+
+ nova_dbg_verbose("%s: %lu\n", __func__, inode->i_ino);
+ if (!inode->i_nlink && !is_bad_inode(inode)) {
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ goto out;
+
+ if (pi) {
+ ret = nova_free_inode_resource(sb, pi, sih);
+ if (ret)
+ goto out;
+ }
+
+ destroy = 1;
+ pi = NULL; /* we no longer own the nova_inode */
+
+ inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_size = 0;
+ }
+out:
+ /*
+ * The persistent inode was not freed, so only the DRAM state is released.
+ * NOTE(review): the "destroying" message is printed on the destroy == 0
+ * path, and nothing releases DRAM state when destroy == 1 with a NULL
+ * pi - confirm both are intended.
+ */
+ if (destroy == 0) {
+ nova_dbgv("%s: destroying %lu\n", __func__, inode->i_ino);
+ nova_free_dram_resource(sb, sih);
+ }
+ /* TODO: Since we don't use page-cache, do we really need the following
+ * call?
+ */
+ truncate_inode_pages(&inode->i_data, 0);
+
+ clear_inode(inode);
+ NOVA_END_TIMING(evict_inode_t, evict_time);
+}
+
+/*
+ * First rebuild the inode tree, then free the blocks.
+ *
+ * Works on a temporary, zeroed nova_inode_info instead of a VFS inode.
+ * Returns -EINVAL for reserved inode numbers or when the inode address
+ * cannot be resolved, -EACCES when the address resolves to 0, the error
+ * from nova_rebuild_inode(), or the result of nova_free_inode_resource().
+ */
+int nova_delete_dead_inode(struct super_block *sb, u64 ino)
+{
+	struct nova_inode_info scratch;
+	struct nova_inode_info_header *hdr = &scratch.header;
+	u64 addr = 0;
+	int ret;
+
+	if (ino < NOVA_NORMAL_INODE_START) {
+		nova_dbg("%s: invalid inode %llu\n", __func__, ino);
+		return -EINVAL;
+	}
+
+	ret = nova_get_inode_address(sb, ino, 0, &addr, 0, 0);
+	if (ret) {
+		nova_dbg("%s: get inode %llu address failed %d\n",
+			 __func__, ino, ret);
+		return -EINVAL;
+	}
+
+	if (addr == 0)
+		return -EACCES;
+
+	memset(&scratch, 0, sizeof(scratch));
+	ret = nova_rebuild_inode(sb, &scratch, ino, addr, 0);
+	if (ret)
+		return ret;
+
+	nova_dbgv("Delete dead inode %lu, log head 0x%llx, tail 0x%llx\n",
+		  hdr->ino, hdr->log_head, hdr->log_tail);
+
+	return nova_free_inode_resource(sb,
+			(struct nova_inode *)nova_get_block(sb, addr), hdr);
+}
+
+/*
+ * Allocate a new inode number and locate (creating if necessary) its slot
+ * in the persistent inode table.
+ *
+ * Inode maps are used round-robin via sbi->map_id.  The chosen map's
+ * inode_table_mutex is held while the number is allocated and the table is
+ * extended.
+ *
+ * @pi_addr: out: PMEM offset of the new inode.
+ *
+ * Returns the new inode number, or 0 on failure.  If the table slot cannot
+ * be obtained, the freshly allocated inode number is released again.
+ */
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct inode_map *inode_map;
+	unsigned long free_ino = 0;
+	int map_id;
+	u64 ino = 0;
+	int ret;
+	timing_t new_inode_time;
+
+	NOVA_START_TIMING(new_nova_inode_t, new_inode_time);
+	map_id = sbi->map_id;
+	sbi->map_id = (sbi->map_id + 1) % sbi->cpus;
+
+	inode_map = &sbi->inode_maps[map_id];
+
+	mutex_lock(&inode_map->inode_table_mutex);
+	ret = nova_alloc_unused_inode(sb, map_id, &free_ino);
+	if (ret) {
+		nova_dbg("%s: alloc inode number failed %d\n", __func__, ret);
+		mutex_unlock(&inode_map->inode_table_mutex);
+		goto out;
+	}
+
+	ret = nova_get_inode_address(sb, free_ino, 0, pi_addr, 1, 1);
+	mutex_unlock(&inode_map->inode_table_mutex);
+	if (ret) {
+		nova_dbg("%s: get inode address failed %d\n", __func__, ret);
+		/*
+		 * Give the number back so it is not leaked.  This must run
+		 * after the unlock: nova_free_inuse_inode() takes the same
+		 * mutex itself.
+		 */
+		nova_free_inuse_inode(sb, free_ino);
+		goto out;
+	}
+
+	ino = free_ino;
+
+out:
+	/* Single exit so the timing section is always closed. */
+	NOVA_END_TIMING(new_nova_inode_t, new_inode_time);
+	return ino;
+}
+
+/*
+ * Create the VFS inode for a newly allocated NOVA inode and initialise the
+ * persistent inode at pi_addr.
+ *
+ * @type:     kind of object being created; selects the inode/file operations.
+ * @dir:      parent directory; supplies the super block, ownership defaults
+ *            and the flags that are masked into the new inode.
+ * @pi_addr:  PMEM offset of the persistent inode.
+ * @ino:      inode number to assign.
+ * @mode:     file mode.
+ * @size:     initial i_size.
+ * @rdev:     device number, used for TYPE_MKNOD only.
+ * @qstr:     not referenced in this function.
+ * @epoch_id: stored as the inode's create_epoch_id.
+ *
+ * Returns the new inode, or an ERR_PTR: -ENOMEM if no VFS inode can be
+ * allocated, -EACCES if the parent's persistent inode is unavailable, the
+ * error from nova_get_alter_inode_address(), or -EINVAL if the inode cannot
+ * be inserted into the inode hash.
+ */
+struct inode *nova_new_vfs_inode(enum nova_new_inode_type type,
+ struct inode *dir, u64 pi_addr, u64 ino, umode_t mode,
+ size_t size, dev_t rdev, const struct qstr *qstr, u64 epoch_id)
+{
+ struct super_block *sb;
+ struct nova_sb_info *sbi;
+ struct inode *inode;
+ struct nova_inode *diri = NULL;
+ struct nova_inode_info *si;
+ struct nova_inode_info_header *sih = NULL;
+ struct nova_inode *pi;
+ struct nova_inode *alter_pi;
+ int errval;
+ u64 alter_pi_addr = 0;
+ timing_t new_inode_time;
+
+ NOVA_START_TIMING(new_vfs_inode_t, new_inode_time);
+ sb = dir->i_sb;
+ sbi = (struct nova_sb_info *)sb->s_fs_info;
+ inode = new_inode(sb);
+ if (!inode) {
+ errval = -ENOMEM;
+ goto fail2;
+ }
+
+ inode_init_owner(inode, dir, mode);
+ inode->i_blocks = inode->i_size = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+
+ inode->i_generation = atomic_add_return(1, &sbi->next_generation);
+ inode->i_size = size;
+
+ diri = nova_get_inode(sb, dir);
+ if (!diri) {
+ errval = -EACCES;
+ goto fail1;
+ }
+
+ if (metadata_csum) {
+ /* Get alternate inode address */
+ errval = nova_get_alter_inode_address(sb, ino, &alter_pi_addr);
+ if (errval)
+ goto fail1;
+ }
+
+ pi = (struct nova_inode *)nova_get_block(sb, pi_addr);
+ nova_dbg_verbose("%s: allocating inode %llu @ 0x%llx\n",
+ __func__, ino, pi_addr);
+
+ /* chosen inode is in ino */
+ inode->i_ino = ino;
+
+ switch (type) {
+ case TYPE_CREATE:
+ inode->i_op = &nova_file_inode_operations;
+ inode->i_mapping->a_ops = &nova_aops_dax;
+ if (inplace_data_updates && wprotect == 0)
+ inode->i_fop = &nova_dax_file_operations;
+ else
+ inode->i_fop = &nova_wrap_file_operations;
+ break;
+ case TYPE_MKNOD:
+ init_special_inode(inode, mode, rdev);
+ inode->i_op = &nova_special_inode_operations;
+ break;
+ case TYPE_SYMLINK:
+ inode->i_op = &nova_symlink_inode_operations;
+ inode->i_mapping->a_ops = &nova_aops_dax;
+ break;
+ case TYPE_MKDIR:
+ inode->i_op = &nova_dir_inode_operations;
+ inode->i_fop = &nova_dir_operations;
+ inode->i_mapping->a_ops = &nova_aops_dax;
+ set_nlink(inode, 2);
+ break;
+ default:
+ /* NOTE(review): an unknown type is only logged; creation continues. */
+ nova_dbg("Unknown new inode type %d\n", type);
+ break;
+ }
+
+ /*
+ * Pi is part of the dir log so no transaction is needed,
+ * but we need to flush to NVMM.
+ */
+ nova_memunlock_inode(sb, pi);
+ pi->i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+ pi->i_flags = nova_mask_flags(mode, diri->i_flags);
+ pi->nova_ino = ino;
+ pi->i_create_time = current_time(inode).tv_sec;
+ pi->create_epoch_id = epoch_id;
+ nova_init_inode(inode, pi);
+
+ /* Mirror the freshly initialised inode into the alternate copy. */
+ if (metadata_csum) {
+ alter_pi = (struct nova_inode *)nova_get_block(sb,
+ alter_pi_addr);
+ memcpy_to_pmem_nocache(alter_pi, pi, sizeof(struct nova_inode));
+ }
+
+ nova_memlock_inode(sb, pi);
+
+ /* Set up the in-DRAM header that shadows the persistent inode. */
+ si = NOVA_I(inode);
+ sih = &si->header;
+ nova_init_header(sb, sih, inode->i_mode);
+ sih->pi_addr = pi_addr;
+ sih->alter_pi_addr = alter_pi_addr;
+ sih->ino = ino;
+ sih->i_blk_type = NOVA_DEFAULT_BLOCK_TYPE;
+
+ nova_set_inode_flags(inode, pi, le32_to_cpu(pi->i_flags));
+
+ if (insert_inode_locked(inode) < 0) {
+ nova_err(sb, "nova_new_inode failed ino %lx\n", inode->i_ino);
+ errval = -EINVAL;
+ goto fail1;
+ }
+
+ /* The persistent inode is flushed only once the insert has succeeded. */
+ nova_flush_buffer(pi, NOVA_INODE_SIZE, 0);
+ NOVA_END_TIMING(new_vfs_inode_t, new_inode_time);
+ return inode;
+fail1:
+ make_bad_inode(inode);
+ iput(inode);
+fail2:
+ NOVA_END_TIMING(new_vfs_inode_t, new_inode_time);
+ return ERR_PTR(errval);
+}
+
+int nova_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	/*
+	 * NOVA always keeps its inodes clean, so ->write_inode is not
+	 * expected to be called.  There is nothing to write back here;
+	 * simply report success.
+	 */
+	return 0;
+}
+
+/*
+ * nova_dirty_inode - write the in-memory atime into the PMEM inode
+ * @inode:	VFS inode whose i_atime is copied
+ * @flags:	unused
+ *
+ * dirty_inode() is reached from mark_inode_dirty_sync().  NOVA normally
+ * keeps its inodes clean, so this is usually not called; the exception is
+ * touch_atime(), which uses it to update i_atime.
+ *
+ * Nothing is written when the filesystem is mounted as a snapshot or when
+ * the inode fails its integrity check.
+ */
+void nova_dirty_inode(struct inode *inode, int flags)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	struct nova_inode scratch;
+	struct nova_inode *pi;
+	int rc;
+
+	if (NOVA_SB(sb)->mount_snapshot)
+		return;
+
+	pi = nova_get_block(sb, sih->pi_addr);
+
+	/* Validate the inode before touching it so all fields are good. */
+	rc = nova_check_inode_integrity(sb, sih->ino, sih->pi_addr,
+					sih->alter_pi_addr, &scratch, 0);
+	if (rc < 0)
+		return;
+
+	/*
+	 * Only i_atime can have changed, so an in-place update is enough:
+	 * store the new value, refresh the checksum and bring the
+	 * alternate copy in line while the inode is unlocked.
+	 */
+	nova_memunlock_inode(sb, pi);
+	pi->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	nova_update_inode_checksum(pi);
+	nova_update_alter_inode(sb, inode, pi);
+	nova_memlock_inode(sb, pi);
+
+	/* Relax atime persistency */
+	nova_flush_buffer(&pi->i_atime, sizeof(pi->i_atime), 0);
+}
+
+/*
+ * nova_setsize - apply a size change to a regular file
+ * @inode:	inode being resized
+ * @oldsize:	size before the change
+ * @newsize:	requested size
+ * @epoch_id:	epoch handed on to nova_truncate_file_blocks()
+ *
+ * Anything other than a regular file is rejected with an error message.
+ * Otherwise outstanding direct I/O is waited for, the tail of the last page
+ * is cleared and the VFS and DRAM-header sizes are updated (only when the
+ * size really changes), and finally the page cache and the file blocks are
+ * truncated.
+ */
+static void nova_setsize(struct inode *inode, loff_t oldsize, loff_t newsize,
+	u64 epoch_id)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	timing_t setsize_time;
+
+	/* We only support truncate regular file */
+	if (!(S_ISREG(inode->i_mode))) {
+		/*
+		 * The format has two conversions: __func__ feeds the %s,
+		 * the mode feeds the %x.
+		 */
+		nova_err(sb, "%s:wrong file mode %x\n",
+			 __func__, inode->i_mode);
+		return;
+	}
+
+	NOVA_START_TIMING(setsize_t, setsize_time);
+
+	inode_dio_wait(inode);
+
+	nova_dbgv("%s: inode %lu, old size %llu, new size %llu\n",
+			__func__, inode->i_ino, oldsize, newsize);
+
+	if (newsize != oldsize) {
+		nova_clear_last_page_tail(sb, inode, newsize);
+		i_size_write(inode, newsize);
+		sih->i_size = newsize;
+	}
+
+	/* FIXME: we should make sure that there is nobody reading the inode
+	 * before truncating it. Also we need to munmap the truncated range
+	 * from application address space, if mmapped.
+	 */
+	/* synchronize_rcu(); */
+
+	/* FIXME: Do we need to clear truncated DAX pages? */
+//	dax_truncate_page(inode, newsize, nova_dax_get_block);
+
+	truncate_pagecache(inode, newsize);
+	nova_truncate_file_blocks(inode, newsize, oldsize, epoch_id);
+	NOVA_END_TIMING(setsize_t, setsize_time);
+}
+
+/*
+ * nova_getattr - fill @stat for the inode behind @path
+ *
+ * Uses generic_fillattr() and then overrides stat->blocks.  @request_mask
+ * and @flags are not consulted.  Always returns 0.
+ */
+int nova_getattr(const struct path *path, struct kstat *stat,
+		 u32 request_mask, unsigned int flags)
+{
+	struct inode *inode = path->dentry->d_inode;
+	unsigned int fs_blkbits = inode->i_sb->s_blocksize_bits;
+
+	generic_fillattr(inode, stat);
+
+	/*
+	 * stat->blocks counts 512-byte units: scale i_blocks up by the
+	 * filesystem block size, then down by 512 (1 << 9).
+	 */
+	stat->blocks = (inode->i_blocks << fs_blkbits) >> 9;
+
+	return 0;
+}
+
+/*
+ * nova_notify_change - setattr entry point
+ * @dentry:	dentry whose inode is being changed
+ * @attr:	requested attribute changes
+ *
+ * Validates the request with setattr_prepare(), copies the non-size
+ * attributes into the VFS inode, and hands mode/uid/gid/size/time changes
+ * to nova_handle_setattr_operation().  A size change (or a set
+ * NOVA_EOFBLOCKS_FL) is applied with nova_setsize() only after that call
+ * succeeded.
+ *
+ * Returns 0 on success, -EACCES when the PMEM inode cannot be obtained, or
+ * the error reported by setattr_prepare()/nova_handle_setattr_operation().
+ */
+int nova_notify_change(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	struct nova_inode *pi = nova_get_inode(sb, inode);
+	const unsigned int handled = ATTR_MODE | ATTR_UID | ATTR_GID |
+				     ATTR_SIZE | ATTR_ATIME | ATTR_MTIME |
+				     ATTR_CTIME;
+	unsigned int ia_valid = attr->ia_valid;
+	loff_t oldsize = inode->i_size;
+	timing_t setattr_time;
+	u64 epoch_id;
+	int ret;
+
+	NOVA_START_TIMING(setattr_t, setattr_time);
+
+	if (!pi) {
+		ret = -EACCES;
+		goto out;
+	}
+
+	ret = setattr_prepare(dentry, attr);
+	if (ret)
+		goto out;
+
+	/* Update inode with attr except for size */
+	setattr_copy(inode, attr);
+
+	epoch_id = nova_get_epoch_id(sb);
+
+	/* Drop every request bit outside the handled set. */
+	ia_valid &= handled;
+	if (!ia_valid)
+		goto out;
+
+	ret = nova_handle_setattr_operation(sb, inode, pi, ia_valid,
+					    attr, epoch_id);
+	if (ret)
+		goto out;
+
+	/* Only after log entry is committed, we can truncate size */
+	if (ia_valid & ATTR_SIZE) {
+		if (attr->ia_size != oldsize ||
+		    (pi->i_flags & cpu_to_le32(NOVA_EOFBLOCKS_FL))) {
+			/* now we can freely truncate the inode */
+			nova_setsize(inode, oldsize, attr->ia_size, epoch_id);
+		}
+	}
+
+	sih->trans_id++;
+out:
+	NOVA_END_TIMING(setattr_t, setattr_time);
+	return ret;
+}
+
+/*
+ * nova_set_inode_flags - translate FS_*_FL bits into VFS S_* inode flags
+ * @inode:	VFS inode to update
+ * @pi:		PMEM inode, consulted only for its i_xattr field
+ * @flags:	FS_*_FL bits to translate
+ *
+ * The five translated S_* bits are cleared first and then set again from
+ * @flags.  If the inode has no xattr block, inode_has_no_xattr() is called.
+ * S_DAX is always set.
+ */
+void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+	unsigned int flags)
+{
+	const unsigned int managed = S_SYNC | S_APPEND | S_IMMUTABLE |
+				     S_NOATIME | S_DIRSYNC;
+	unsigned int wanted = 0;
+
+	if (flags & FS_SYNC_FL)
+		wanted |= S_SYNC;
+	if (flags & FS_APPEND_FL)
+		wanted |= S_APPEND;
+	if (flags & FS_IMMUTABLE_FL)
+		wanted |= S_IMMUTABLE;
+	if (flags & FS_NOATIME_FL)
+		wanted |= S_NOATIME;
+	if (flags & FS_DIRSYNC_FL)
+		wanted |= S_DIRSYNC;
+
+	/* Replace the managed bits before the xattr helper sees i_flags. */
+	inode->i_flags &= ~managed;
+	inode->i_flags |= wanted;
+
+	if (!pi->i_xattr)
+		inode_has_no_xattr(inode);
+
+	inode->i_flags |= S_DAX;
+}
+
+/*
+ * nova_legacy_get_blocks - get_block callback built on nova_dax_get_blocks()
+ *
+ * Asks nova_dax_get_blocks() for up to bh->b_size worth of blocks starting
+ * at @iblock.  When at least one block comes back, @bh is mapped to the
+ * returned block number, b_size is set to the mapped length and 0 is
+ * returned.  A zero or negative result is passed through unchanged.
+ */
+static int nova_legacy_get_blocks(struct inode *inode, sector_t iblock,
+	struct buffer_head *bh, int create)
+{
+	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
+	bool new = false, boundary = false;
+	u32 bno;
+	int nr_mapped;
+
+	nr_mapped = nova_dax_get_blocks(inode, iblock, max_blocks, &bno, &new,
+					&boundary, create, false);
+	if (nr_mapped > 0) {
+		map_bh(bh, inode->i_sb, bno);
+		bh->b_size = nr_mapped << inode->i_blkbits;
+		return 0;
+	}
+
+	return nr_mapped;
+}
+
+/*
+ * nova_direct_IO - ->direct_IO for the NOVA address space
+ *
+ * Warns once and returns -EIO for a DAX inode; otherwise the request is
+ * passed to blockdev_direct_IO() with nova_legacy_get_blocks() as the
+ * block-mapping callback.
+ */
+static ssize_t nova_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	timing_t dio_time;
+	ssize_t done;
+
+	if (WARN_ON_ONCE(IS_DAX(inode)))
+		return -EIO;
+
+	NOVA_START_TIMING(direct_IO_t, dio_time);
+	done = blockdev_direct_IO(iocb, inode, iter, nova_legacy_get_blocks);
+	NOVA_END_TIMING(direct_IO_t, dio_time);
+
+	return done;
+}
+
+/*
+ * nova_find_region - find the file offset for SEEK_DATA/SEEK_HOLE
+ * @inode:	file being searched
+ * @offset:	in: where to start; out: where the wanted region begins
+ * @hole:	non-zero to look for a hole, zero to look for data
+ *
+ * Returns 0 with *offset updated (or left alone when it already points into
+ * the wanted region).  A negative errno (-ENXIO) is returned through the
+ * unsigned return type when @offset is at or past EOF or no data exists
+ * past it.  For an empty file a hole search returns inode->i_size directly.
+ */
+unsigned long nova_find_region(struct inode *inode, loff_t *offset, int hole)
+{
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	unsigned int data_bits = blk_type_to_shift[sih->i_blk_type];
+	unsigned long first_blocknr, last_blocknr;
+	unsigned long blocks = 0, offset_in_block;
+	int data_found = 0, hole_found = 0;
+
+	if (*offset >= inode->i_size)
+		return -ENXIO;
+
+	/* Nothing allocated: the whole file is one hole. */
+	if (!inode->i_blocks || !sih->i_size) {
+		if (!hole)
+			return -ENXIO;
+		return inode->i_size;
+	}
+
+	offset_in_block = *offset & ((1UL << data_bits) - 1);
+
+	first_blocknr = *offset >> data_bits;
+	last_blocknr = inode->i_size >> data_bits;
+
+	nova_dbg_verbose("find_region offset %llx, first_blocknr %lx, last_blocknr %lx hole %d\n",
+				*offset, first_blocknr, last_blocknr, hole);
+
+	blocks = nova_lookup_hole_in_range(inode->i_sb, sih,
+		first_blocknr, last_blocknr, &data_found, &hole_found, hole);
+
+	/* Searching data but only hole found till the end */
+	if (hole_found && !(hole || data_found))
+		return -ENXIO;
+
+	/*
+	 * Only data in the range.  A data search is already inside data and
+	 * keeps its offset; a hole search goes to the end of the file.
+	 */
+	if (data_found && !hole_found) {
+		if (hole)
+			*offset = inode->i_size;
+		return 0;
+	}
+
+	/*
+	 * Hole search that starts inside a hole.  The offset stays put when
+	 * data follows; if no data was found this is the last hole and the
+	 * offset moves to the end of the file.
+	 */
+	if (hole && hole_found && !blocks) {
+		if (!data_found)
+			*offset = inode->i_size;
+		return 0;
+	}
+
+	/* Block-aligned start: skip whole blocks only. */
+	if (!offset_in_block) {
+		*offset += blocks << data_bits;
+		return 0;
+	}
+
+	/*
+	 * Unaligned start: one of the counted blocks is the partial first
+	 * block, so advance by its remainder plus the other whole blocks.
+	 */
+	blocks--;
+	*offset += (blocks << data_bits) +
+		   ((1 << data_bits) - offset_in_block);
+
+	return 0;
+}
+
+/*
+ * nova_writepages - ->writepages for the NOVA address space
+ *
+ * Thin timed wrapper around dax_writeback_mapping_range() using the block
+ * device of the superblock that owns @mapping.  Returns that call's result.
+ */
+static int nova_writepages(struct address_space *mapping,
+	struct writeback_control *wbc)
+{
+	struct block_device *bdev = mapping->host->i_sb->s_bdev;
+	timing_t wp_time;
+	int err;
+
+	NOVA_START_TIMING(write_pages_t, wp_time);
+	err = dax_writeback_mapping_range(mapping, bdev, wbc);
+	NOVA_END_TIMING(write_pages_t, wp_time);
+
+	return err;
+}
+
+/*
+ * Address space operations installed on NOVA inodes.  Only direct I/O and
+ * writepages are provided.
+ */
+const struct address_space_operations nova_aops_dax = {
+	.direct_IO		= nova_direct_IO,
+	.writepages		= nova_writepages,
+};
diff --git a/fs/nova/inode.h b/fs/nova/inode.h
new file mode 100644
index 000000000000..5ad69335799c
--- /dev/null
+++ b/fs/nova/inode.h
@@ -0,0 +1,389 @@
+#ifndef __INODE_H
+#define __INODE_H
+
+struct nova_inode_info_header;
+struct nova_inode;
+
+#include "super.h"
+#include "log.h"
+
+/* Kind of object nova_new_vfs_inode() is asked to set up. */
+enum nova_new_inode_type {
+	TYPE_CREATE	= 0,	/* regular file */
+	TYPE_MKNOD	= 1,	/* special inode (init_special_inode) */
+	TYPE_SYMLINK	= 2,	/* symbolic link */
+	TYPE_MKDIR	= 3,	/* directory */
+};
+
+
+/*
+ * Structure of an inode in PMEM
+ * Keep the inode size to within 120 bytes: We use the last eight bytes
+ * as inode table tail pointer.
+ *
+ * The structure is packed and its multi-byte fields are declared
+ * little-endian.  The fields below add up to exactly 120 bytes, in three
+ * groups of 40.  Do not reorder or resize fields: this is the on-media
+ * layout.
+ */
+struct nova_inode {
+
+ /* first 40 bytes */
+ u8 i_rsvd; /* reserved. used to be checksum */
+ u8 valid; /* Is this inode valid? */
+ u8 deleted; /* Is this inode deleted? */
+ u8 i_blk_type; /* data block size this inode uses */
+ __le32 i_flags; /* Inode flags */
+ __le64 i_size; /* Size of data in bytes */
+ __le32 i_ctime; /* Inode change time (ctime) */
+ __le32 i_mtime; /* Modification time (mtime) */
+ __le32 i_atime; /* Access time, in seconds */
+ __le16 i_mode; /* File mode */
+ __le16 i_links_count; /* Links count */
+
+ __le64 i_xattr; /* Extended attribute block */
+
+ /* second 40 bytes */
+ __le32 i_uid; /* Owner Uid */
+ __le32 i_gid; /* Group Id */
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_create_time; /* Create time, in seconds */
+ __le64 nova_ino; /* nova inode number */
+
+ __le64 log_head; /* Log head pointer */
+ __le64 log_tail; /* Log tail pointer */
+
+ /* last 40 bytes */
+ __le64 alter_log_head; /* Alternate log head pointer */
+ __le64 alter_log_tail; /* Alternate log tail pointer */
+
+ __le64 create_epoch_id; /* Epoch ID when created */
+ __le64 delete_epoch_id; /* Epoch ID when deleted */
+
+ struct {
+ __le32 rdev; /* major/minor # */
+ } dev; /* device inode */
+
+ /* Checksum of every byte of this structure that precedes csum */
+ __le32 csum; /* CRC32 checksum */
+
+ /* Leave 8 bytes for inode table tail pointer */
+} __attribute((__packed__));
+
+/*
+ * Inode table. It's a linked list of pages.
+ *
+ * nova_get_inode_table() returns one of these per CPU; consecutive CPUs'
+ * entries sit CACHELINE_SIZE bytes apart inside the table block.
+ */
+struct inode_table {
+ /* NOTE(review): presumably the PMEM offset of the list's first block */
+ __le64 log_head;
+};
+
+/*
+ * NOVA-specific inode state kept in DRAM
+ *
+ * Embedded as the first member of struct nova_inode_info, next to the VFS
+ * inode.
+ */
+struct nova_inode_info_header {
+ /* For files, tree holds a map from file offsets to
+ * write log entries.
+ *
+ * For directories, tree holds a map from a hash of the file name to
+ * dentry log entry.
+ */
+ struct radix_tree_root tree;
+ struct rb_root vma_tree; /* Write vmas */
+ struct list_head list; /* SB list of mmap sih */
+ int num_vmas;
+ unsigned short i_mode; /* Dir or file? */
+ unsigned long log_pages; /* Num of log pages */
+ unsigned long i_size; /* DRAM copy of the file size in bytes */
+ unsigned long i_blocks;
+ unsigned long ino; /* Inode number */
+ unsigned long pi_addr; /* Passed to nova_get_block() to reach the PMEM inode */
+ unsigned long alter_pi_addr; /* Same, for the alternate (replica) inode */
+ unsigned long valid_entries; /* For thorough GC */
+ unsigned long num_entries; /* For thorough GC */
+ u64 last_setattr; /* Last setattr entry */
+ u64 last_link_change; /* Last link change entry */
+ u64 last_dentry; /* Last updated dentry */
+ u64 trans_id; /* Transaction ID */
+ u64 log_head; /* Log head pointer */
+ u64 log_tail; /* Log tail pointer */
+ u64 alter_log_head; /* Alternate log head pointer */
+ u64 alter_log_tail; /* Alternate log tail pointer */
+ u8 i_blk_type; /* Index into blk_type_to_shift[] / blk_type_to_size[] */
+};
+
+/*
+ * For rebuild purposes: temporarily stores PMEM inode (pi) information.
+ * The fields are plain CPU-typed integers (u16/u32/u64) rather than the
+ * __le types used by struct nova_inode.
+ */
+struct nova_inode_rebuild {
+ u64 i_size; /* Size of data in bytes */
+ u32 i_flags; /* Inode flags */
+ u32 i_ctime; /* Inode change time (ctime) */
+ u32 i_mtime; /* Modification time (mtime) */
+ u32 i_atime; /* Access time */
+ u32 i_uid; /* Owner Uid */
+ u32 i_gid; /* Group Id */
+ u32 i_generation; /* File version (for NFS) */
+ u16 i_links_count; /* Links count */
+ u16 i_mode; /* File mode */
+ u64 trans_id; /* Transaction ID */
+};
+
+/*
+ * DRAM state for inodes
+ *
+ * Wraps the VFS inode together with the NOVA header.  NOVA_I() recovers
+ * this structure from a pointer to its vfs_inode member.
+ */
+struct nova_inode_info {
+ struct nova_inode_info_header header; /* NOVA-specific DRAM state */
+ struct inode vfs_inode; /* embedded VFS inode */
+};
+
+
+/* Map a VFS inode to the struct nova_inode_info that embeds it. */
+static inline struct nova_inode_info *NOVA_I(struct inode *inode)
+{
+	struct nova_inode_info *si;
+
+	si = container_of(inode, struct nova_inode_info, vfs_inode);
+	return si;
+}
+
+/*
+ * nova_get_alter_inode - pointer to the alternate (replica) PMEM inode
+ *
+ * Returns NULL when metadata_csum is off.  Otherwise the whole replica is
+ * first read into a scratch copy with memcpy_mcsafe(); if that read reports
+ * an error NULL is returned, else the PMEM address itself.
+ */
+static inline struct nova_inode *nova_get_alter_inode(struct super_block *sb,
+	struct inode *inode)
+{
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	struct nova_inode probe;
+	void *addr;
+
+	if (!metadata_csum)
+		return NULL;
+
+	addr = nova_get_block(sb, sih->alter_pi_addr);
+	if (memcpy_mcsafe(&probe, addr, sizeof(struct nova_inode)))
+		return NULL;
+
+	return (struct nova_inode *)addr;
+}
+
+/*
+ * nova_update_alter_inode - copy @pi over the alternate PMEM inode
+ *
+ * Does nothing and returns 0 when metadata_csum is off.  Returns -EINVAL
+ * when the alternate inode cannot be obtained, 0 after a successful copy.
+ */
+static inline int nova_update_alter_inode(struct super_block *sb,
+	struct inode *inode, struct nova_inode *pi)
+{
+	struct nova_inode *replica;
+
+	if (!metadata_csum)
+		return 0;
+
+	replica = nova_get_alter_inode(sb, inode);
+	if (replica) {
+		memcpy_to_pmem_nocache(replica, pi, sizeof(struct nova_inode));
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+
+/*
+ * nova_update_inode_checksum - recompute and store the inode checksum
+ * @pi:	PMEM inode to checksum
+ *
+ * Does nothing when metadata_csum is off.  Otherwise the CRC is computed
+ * over every byte of the inode that precedes the trailing csum field,
+ * stored in little-endian form (matching the comparison done by
+ * nova_check_inode_checksum()), and the whole inode is flushed.
+ *
+ * Always returns 0.
+ */
+static inline int nova_update_inode_checksum(struct nova_inode *pi)
+{
+	u32 crc = 0;
+
+	if (metadata_csum == 0)
+		return 0;
+
+	crc = nova_crc32c(~0, (__u8 *)pi,
+			(sizeof(struct nova_inode) - sizeof(__le32)));
+
+	/* csum is declared __le32: convert so the store is endian-correct */
+	pi->csum = cpu_to_le32(crc);
+	nova_flush_buffer(pi, sizeof(struct nova_inode), 1);
+	return 0;
+}
+
+/*
+ * nova_check_inode_checksum - verify the checksum stored in @pi
+ *
+ * Returns 0 when metadata_csum is off or the stored checksum matches the
+ * CRC of the bytes preceding the csum field, 1 on a mismatch.
+ */
+static inline int nova_check_inode_checksum(struct nova_inode *pi)
+{
+	u32 expected;
+
+	if (!metadata_csum)
+		return 0;
+
+	expected = nova_crc32c(~0, (__u8 *)pi,
+			       sizeof(struct nova_inode) - sizeof(__le32));
+
+	return pi->csum != cpu_to_le32(expected);
+}
+
+
+
+/*
+ * nova_update_tail - store a new primary log tail in the PMEM inode
+ *
+ * Issues PERSISTENT_BARRIER() before the store and flushes
+ * CACHELINE_SIZE bytes starting at the tail field afterwards.
+ */
+static inline void nova_update_tail(struct nova_inode *pi, u64 new_tail)
+{
+	timing_t tail_timer;
+
+	NOVA_START_TIMING(update_tail_t, tail_timer);
+
+	/* Order matters: barrier, then the store, then its flush. */
+	PERSISTENT_BARRIER();
+	pi->log_tail = new_tail;
+	nova_flush_buffer(&pi->log_tail, CACHELINE_SIZE, 1);
+
+	NOVA_END_TIMING(update_tail_t, tail_timer);
+}
+
+/*
+ * nova_update_alter_tail - store a new alternate log tail in the PMEM inode
+ *
+ * No-op when metadata_csum is off.  Otherwise issues PERSISTENT_BARRIER()
+ * before the store and flushes CACHELINE_SIZE bytes starting at the
+ * alternate tail field afterwards.
+ */
+static inline void nova_update_alter_tail(struct nova_inode *pi, u64 new_tail)
+{
+	timing_t tail_timer;
+
+	if (!metadata_csum)
+		return;
+
+	NOVA_START_TIMING(update_tail_t, tail_timer);
+
+	/* Order matters: barrier, then the store, then its flush. */
+	PERSISTENT_BARRIER();
+	pi->alter_log_tail = new_tail;
+	nova_flush_buffer(&pi->alter_log_tail, CACHELINE_SIZE, 1);
+
+	NOVA_END_TIMING(update_tail_t, tail_timer);
+}
+
+
+
+/*
+ * nova_update_inode - update inode tails and checksums
+ * @sb:		superblock
+ * @inode:	VFS inode; may be NULL, in which case only the PMEM inode
+ *		is touched
+ * @pi:		PMEM inode receiving the new tails
+ * @update:	new primary and alternate tail values
+ * @update_alter: non-zero to also copy @pi over the alternate inode
+ *
+ * The DRAM header tails are updated first (when @inode is given), then the
+ * PMEM tails, then the checksum, and finally the alternate inode copy.
+ */
+static inline void nova_update_inode(struct super_block *sb,
+	struct inode *inode, struct nova_inode *pi,
+	struct nova_inode_update *update, int update_alter)
+{
+	/*
+	 * The header lives inside the structure that embeds @inode, so it
+	 * may only be written when @inode is non-NULL -- the same condition
+	 * the alternate-inode update below already tests.
+	 */
+	if (inode) {
+		struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+
+		sih->log_tail = update->tail;
+		sih->alter_log_tail = update->alter_tail;
+	}
+
+	nova_update_tail(pi, update->tail);
+	if (metadata_csum)
+		nova_update_alter_tail(pi, update->alter_tail);
+
+	nova_update_inode_checksum(pi);
+	if (inode && update_alter)
+		nova_update_alter_inode(sb, inode, pi);
+}
+
+
+/*
+ * nova_get_inode_table - locate the inode table entry of one CPU
+ * @sb:		superblock
+ * @version:	even selects the table at INODE_TABLE0_START, odd the one at
+ *		INODE_TABLE1_START
+ * @cpu:	CPU index; entries are CACHELINE_SIZE bytes apart
+ *
+ * Returns NULL when @cpu is not below sbi->cpus.
+ */
+static inline
+struct inode_table *nova_get_inode_table(struct super_block *sb,
+	int version, int cpu)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	int table_start;
+	char *table_base;
+
+	if (cpu >= sbi->cpus)
+		return NULL;
+
+	table_start = (version & 0x1) ? INODE_TABLE1_START :
+					INODE_TABLE0_START;
+
+	table_base = (char *)nova_get_block(sb,
+				NOVA_DEF_BLOCK_SIZE_4K * table_start);
+
+	return (struct inode_table *)(table_base + cpu * CACHELINE_SIZE);
+}
+
+/* Look up the block shift for this inode's block type. */
+static inline unsigned int
+nova_inode_blk_shift(struct nova_inode_info_header *sih)
+{
+	u8 blk_type = sih->i_blk_type;
+
+	return blk_type_to_shift[blk_type];
+}
+
+/* Look up the block size for this inode's block type. */
+static inline uint32_t nova_inode_blk_size(struct nova_inode_info_header *sih)
+{
+	u8 blk_type = sih->i_blk_type;
+
+	return blk_type_to_size[blk_type];
+}
+
+/*
+ * Offset of reserved inode @inode_number: the reserved inodes start at
+ * 4K block RESERVE_INODE_START and are NOVA_INODE_SIZE bytes each.
+ * @sb is unused.
+ */
+static inline u64 nova_get_reserved_inode_addr(struct super_block *sb,
+	u64 inode_number)
+{
+	u64 region_start = NOVA_DEF_BLOCK_SIZE_4K * RESERVE_INODE_START;
+
+	return region_start + inode_number * NOVA_INODE_SIZE;
+}
+
+/*
+ * Offset of the replica of reserved inode @inode_number: the replica
+ * region is located through sbi->replica_reserved_inodes_addr and holds
+ * NOVA_INODE_SIZE bytes per inode.
+ */
+static inline u64 nova_get_alter_reserved_inode_addr(struct super_block *sb,
+	u64 inode_number)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	u64 region_start;
+
+	region_start = nova_get_addr_off(sbi,
+					 sbi->replica_reserved_inodes_addr);
+
+	return region_start + inode_number * NOVA_INODE_SIZE;
+}
+
+/* Pointer to reserved inode @inode_number inside the mapped region. */
+static inline struct nova_inode *nova_get_reserved_inode(struct super_block *sb,
+	u64 inode_number)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	u64 off = nova_get_reserved_inode_addr(sb, inode_number);
+
+	return (struct nova_inode *)(sbi->virt_addr + off);
+}
+
+/* Pointer to the replica of reserved inode @inode_number. */
+static inline struct nova_inode *
+nova_get_alter_reserved_inode(struct super_block *sb,
+	u64 inode_number)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	u64 off = nova_get_alter_reserved_inode_addr(sb, inode_number);
+
+	return (struct nova_inode *)(sbi->virt_addr + off);
+}
+
+/*
+ * nova_get_inode_by_ino - look up a reserved inode by number
+ *
+ * Only inode numbers in 1 .. NOVA_NORMAL_INODE_START - 1 are served; any
+ * other number yields NULL.
+ *
+ * If this is part of a read-modify-write of the inode metadata,
+ * nova_memunlock_inode() before calling!
+ */
+static inline struct nova_inode *nova_get_inode_by_ino(struct super_block *sb,
+	u64 ino)
+{
+	if (ino != 0 && ino < NOVA_NORMAL_INODE_START)
+		return nova_get_reserved_inode(sb, ino);
+
+	return NULL;
+}
+
+/*
+ * nova_get_inode - pointer to the PMEM inode behind a VFS inode
+ *
+ * The whole PMEM inode is first read into a scratch copy with
+ * memcpy_mcsafe(); if that read reports an error NULL is returned,
+ * otherwise the PMEM address itself.
+ */
+static inline struct nova_inode *nova_get_inode(struct super_block *sb,
+	struct inode *inode)
+{
+	struct nova_inode_info_header *sih = &NOVA_I(inode)->header;
+	struct nova_inode probe;
+	void *addr;
+
+	addr = nova_get_block(sb, sih->pi_addr);
+	if (memcpy_mcsafe(&probe, addr, sizeof(struct nova_inode)))
+		return NULL;
+
+	return (struct nova_inode *)addr;
+}
+
+
+
+/* Address space operations shared by NOVA inodes (defined in inode.c). */
+extern const struct address_space_operations nova_aops_dax;
+
+/* Inode table and inode address management. */
+int nova_init_inode_inuse_list(struct super_block *sb);
+extern int nova_init_inode_table(struct super_block *sb);
+int nova_get_alter_inode_address(struct super_block *sb, u64 ino,
+ u64 *alter_pi_addr);
+unsigned long nova_get_last_blocknr(struct super_block *sb,
+ struct nova_inode_info_header *sih);
+int nova_get_inode_address(struct super_block *sb, u64 ino, int version,
+ u64 *pi_addr, int extendable, int extend_alternate);
+int nova_set_blocksize_hint(struct super_block *sb, struct inode *inode,
+ struct nova_inode *pi, loff_t new_size);
+
+/* Inode entry points (read, evict, write-back, attributes, seek). */
+extern struct inode *nova_iget(struct super_block *sb, unsigned long ino);
+extern void nova_evict_inode(struct inode *inode);
+extern int nova_write_inode(struct inode *inode, struct writeback_control *wbc);
+extern void nova_dirty_inode(struct inode *inode, int flags);
+extern int nova_notify_change(struct dentry *dentry, struct iattr *attr);
+extern int nova_getattr(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags);
+extern void nova_set_inode_flags(struct inode *inode, struct nova_inode *pi,
+ unsigned int flags);
+/* Returns 0 or a negative errno carried in the unsigned return type. */
+extern unsigned long nova_find_region(struct inode *inode, loff_t *offset,
+ int hole);
+
+/* File tree teardown and inode allocation. */
+int nova_delete_file_tree(struct super_block *sb,
+ struct nova_inode_info_header *sih, unsigned long start_blocknr,
+ unsigned long last_blocknr, bool delete_nvmm,
+ bool delete_dead, u64 trasn_id);
+u64 nova_new_nova_inode(struct super_block *sb, u64 *pi_addr);
+extern struct inode *nova_new_vfs_inode(enum nova_new_inode_type,
+ struct inode *dir, u64 pi_addr, u64 ino, umode_t mode,
+ size_t size, dev_t rdev, const struct qstr *qstr, u64 epoch_id);
+
+#endif