Re: [RFC v2][PATCH 8/9] File descriprtors - dump state

From: Louis Rilling
Date: Thu Aug 21 2008 - 07:06:26 EST


On Wed, Aug 20, 2008 at 11:07:16PM -0400, Oren Laadan wrote:
>
> Dump the files_struct of a task with 'struct cr_hdr_files', followed by
> all open file descriptors. Since FDs can be shared, they are assigned a
> tag and registered in the object hash.
>
> For each open FD there is a 'struct cr_hdr_fd_ent' with the FD, its tag
> and its close-on-exec property. If the FD is to be saved (first time)
> then this is followed by a 'struct cr_hdr_fd_data' with the FD state.
> Then will come the next FD and so on.
>
> This patch only handles basic FDs - regular files, directories and also
> symbolic links.

[...]

> diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c
> new file mode 100644
> index 0000000..18faaf1
> --- /dev/null
> +++ b/checkpoint/ckpt_file.c
> @@ -0,0 +1,234 @@
> +/*
> + * Checkpoint file descriptors
> + *
> + * Copyright (C) 2008 Oren Laadan
> + *
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License. See the file COPYING in the main directory of the Linux
> + * distribution for more details.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/sched.h>
> +#include <linux/file.h>
> +#include <linux/fdtable.h>
> +
> +#include "ckpt.h"
> +#include "ckpt_hdr.h"
> +#include "ckpt_file.h"
> +
> +#define CR_DEFAULT_FDTABLE 128
> +
> +/**
> + * cr_scan_fds - scan file table and construct array of open fds
> + * @files: files_struct pointer
> + * @fdtable: (output) array of open fds
> + * @return: the number of open fds found
> + *
> + * Allocates the file descriptors array (*fdtable), caller should free
> + */
> +int cr_scan_fds(struct files_struct *files, int **fdtable)
> +{
> + int i, j, n, max;
> + struct fdtable *fdt;
> + int *fdlist;
> +
> + max = CR_DEFAULT_FDTABLE;
> +
> + repeat:
> + fdlist = kmalloc(max * sizeof(*fdlist), GFP_KERNEL);
> + if (!fdlist)
> + return -ENOMEM;
> +
> + j = 0;
> + n = 0;
> +
> + spin_lock(&files->file_lock);
> + fdt = files_fdtable(files);
> + for (;;) {
> + unsigned long set;
> + i = j * __NFDBITS;
> + if (i >= fdt->max_fds)
> + break;
> + set = fdt->open_fds->fds_bits[j++];

Why not simply use fcheck_files(files, n) and check if the result is not NULL?

> + while (set) {
> + if (set & 1) {
> + if (unlikely(n == max)) {
> + spin_unlock(&files->file_lock);
> + kfree(fdlist);
> + max *= 2;
> + if (max < 0) /* overflow ? */
> + return -EMFILE;
> + goto repeat;
> + }
> + fdlist[n++] = i;
> + }
> + i++;
> + set >>= 1;
> + }
> + }
> + spin_unlock(&files->file_lock);
> +
> + *fdtable = fdlist;
> + return n;
> +}
> +
> +/* cr_write_fd_data - dump the state of a given file pointer */
> +static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int ptag)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_fd_data *hh = ctx->hbuf;
> + struct dentry *dent = file->f_dentry;
> + struct inode *inode = dent->d_inode;
> + char *fname;
> + int flen, how, ret;
> +
> + h.type = CR_HDR_FD_DATA;
> + h.len = sizeof(*hh);
> + h.ptag = ptag;
> +
> + BUG_ON(!inode);
> +
> + flen = PAGE_SIZE;
> + fname = cr_fill_fname(&file->f_path, ctx->vfsroot, ctx->tbuf, &flen);
> + if (IS_ERR(fname))
> + return PTR_ERR(fname);
> +
> + hh->f_flags = file->f_flags;
> + hh->f_mode = file->f_mode;
> + hh->f_pos = file->f_pos;
> + hh->f_uid = file->f_uid;
> + hh->f_gid = file->f_gid;
> + hh->f_version = file->f_version;
> + /* FIX: need also file->f_owner */
> +
> + switch(inode->i_mode & S_IFMT) {
> + case S_IFREG:
> + how = CR_FD_FILE;
> + break;
> + case S_IFDIR:
> + how = CR_FD_DIR;
> + break;
> + case S_IFLNK:
> + how = CR_FD_LINK;
> + break;
> + default:
> + return -EBADF;
> + }
> +
> + /* FIX: check if the file/dir/link is unlinked */
> +
> + BUG_ON(!flen);
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + if (!ret && flen)
> + ret = cr_write_str(ctx, fname, flen);
> +
> + return ret;
> +}
> +
> +/**
> + * cr_write_fd_ent - dump the state of a given file descriptor
> + * @ctx: checkpoint context
> + * @files: files_struct pointer
> + * @fd: file descriptor
> + * + * Save the state of the file descriptor; look up the actual file
> pointer
> + * in the hash table, and if found save the matching tag, otherwise call
> + * cr_write_fd_data to dump the file pointer too.
> + */
> +static int
> +cr_write_fd_ent(struct cr_ctx *ctx, struct files_struct *files, int fd)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_fd_ent *hh = ctx->hbuf;
> + struct file *file = NULL;
> + struct fdtable *fdt;
> + int coe, tag, ret;
> +
> + /* make sure hh->fd (that is of type __u16) doesn't overflow */
> + if (fd > USHORT_MAX) {
> + pr_warning("CR: open files table too big (%d)\n", USHORT_MAX);
> + return -EMFILE;
> + }
> +
> + rcu_read_lock();
> + fdt = files_fdtable(files);
> + if (fd < fdt->max_fds)
> + file = rcu_dereference(fdt->fd[fd]);

You should probably use fcheck_files() instead of copying its code here. I agree
that this means dereferencing files->fdt a second time below, but is it so
performance critical?

> + if (file) {
> + coe = FD_ISSET(fd, fdt->close_on_exec);
> + get_file(file);
> + }
> + rcu_read_unlock();
> +
> + /* sanity check (although this shouldn't happen) */
> + if (unlikely(!file))
> + return -EBADF;
> +
> + ret = cr_obj_add_ptr(ctx, (void *) file, &tag, CR_OBJ_FILE, 0);
> + cr_debug("fd %d tag %d file %p c-o-e %d)\n", fd, tag, file, coe);
> +
> + if (ret >= 0) {
> + int new = ret;
> +
> + h.type = CR_HDR_FD_ENT;
> + h.len = sizeof(*hh);
> + h.ptag = 0;
> + + hh->tag = tag;
^
|

> + hh->fd = fd;
> + hh->close_on_exec = coe;
> +
> + ret = cr_write_obj(ctx, &h, hh);
> +
> + /* new==1 if-and-only-if file was new and added to hash */
> + if (!ret && new)
> + ret = cr_write_fd_data(ctx, file, tag);
> + }
> +
> + fput(file);
> + return ret;
> +}
> +
> +int cr_write_files(struct cr_ctx *ctx, struct task_struct *t)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_files *hh = ctx->hbuf;
> + struct files_struct *files;
> + int *fdtable;
> + int nfds, n, ret;
> +
> + h.type = CR_HDR_FILES;
> + h.len = sizeof(*hh);
> + h.ptag = task_pid_vnr(t);
> +
> + files = get_files_struct(t);
> +
> + nfds = cr_scan_fds(files, &fdtable);
> + if (nfds < 0) {
> + ret = nfds;
> + goto out;
> + }
> + + hh->tag = 0;
^
|

> + hh->nfds = nfds;
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + if (ret < 0)
> + goto clean;
> +
> + cr_debug("nfds %d\n", nfds);
> + for (n = 0; n < nfds; n++) {
> + ret = cr_write_fd_ent(ctx, files, n);
> + if (ret < 0)
> + break;
> + }
> +
> + clean:
> + kfree(fdtable);
> + out:
> + put_files_struct(files);
> +
> + return ret;
> +}

[...]

> diff --git a/checkpoint/ckpt_hdr.h b/checkpoint/ckpt_hdr.h
> index a3919cf..a8a37db 100644
> --- a/checkpoint/ckpt_hdr.h
> +++ b/checkpoint/ckpt_hdr.h

[...]

> @@ -114,4 +125,24 @@ struct cr_hdr_vma {
>
> } __attribute__ ((aligned (8)));
>
> +struct cr_hdr_files {
> + __u32 tag; /* sharing identifier */
> + __u32 nfds;
> +} __attribute__ ((aligned (8)));
> +
> +struct cr_hdr_fd_ent {
> + __u32 tag;
> + __u16 fd;
> + __u16 close_on_exec;
> +} __attribute__ ((aligned (8)));
> +
> +struct cr_hdr_fd_data {
> + __u16 how;
> + __u16 f_mode;
> + __u32 f_flags;
> + __u32 f_uid, f_gid;

We are not at a 64bits boundary here. Should add one __32 padding or reorder the
fields.

> + __u64 f_pos;
> + __u64 f_version;
> +} __attribute__ ((aligned (8)));
> +
> #endif /* _CHECKPOINT_CKPT_HDR_H_ */
> --
> 1.5.4.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes

Attachment: signature.asc
Description: Digital signature