Re: [PATCHv11 2.6.36-rc2-tip 5/15] 5: uprobes: Uprobes(un)registration and exception handling.

From: Peter Zijlstra
Date: Tue Sep 07 2010 - 05:34:04 EST


On Tue, 2010-09-07 at 12:18 +0530, Srikar Dronamraju wrote:
> > You're really not getting it, are you? No, it would result in the exact
> > same amount of actual breakpoints hit.
>
> If there is just one instance of the traced process for the inode, then yes, the
> number of breakpoints when tracing by pid or by inode would be the
> same. However, if there are multiple instances of the traced process [example
> bash/zsh] (or the inode corresponds to a library that gets mapped into
> multiple processes, example libc), and the user is interested in tracing
> just one instance of the process, then doesn't the inode-based tracing
> amount to far more breakpoint hits?

Not if your filter function works.

So let me try this again (assumes boosted probes):

struct uprobe {
	struct inode	*inode;			/* we hold a ref */
	unsigned long	offset;

	int (*handler)(void);			/* arguments.. ? */
	int (*filter)(struct task_struct *);

	int	insn_size;			/* size of the original insn */
	char	insn[MAX_INSN_SIZE];		/* the original insn */

	int	ret_addr_offset;		/* return addr offset in the slot */
	char	replacement[SLOT_SIZE];		/* replacement instructions */

	atomic_t	ref;			/* lifetime muck */
	struct rcu_head	rcu;
};

static struct {
	raw_spinlock_t	tree_lock;
	struct rb_root	tree;
} uprobes;

static void uprobes_add(struct uprobe *uprobe)
{
	/* add to uprobes.tree, sorted on inode:offset */
}

static void uprobes_del(struct uprobe *uprobe)
{
	/* delete from uprobes.tree */
}

static struct uprobe *
uprobes_find_get(struct address_space *mapping, unsigned long offset)
{
	unsigned long flags;
	struct uprobe *uprobe;

	raw_spin_lock_irqsave(&uprobes.tree_lock, flags);
	uprobe = find_in_tree(&uprobes.tree, mapping->host, offset);
	if (uprobe && !atomic_inc_not_zero(&uprobe->ref))
		uprobe = NULL;
	raw_spin_unlock_irqrestore(&uprobes.tree_lock, flags);

	return uprobe;
}

static void __uprobe_free(struct rcu_head *head)
{
	struct uprobe *uprobe = container_of(head, struct uprobe, rcu);

	kfree(uprobe);
}

static void put_uprobe(struct uprobe *uprobe)
{
	if (atomic_dec_and_test(&uprobe->ref))
		call_rcu(&uprobe->rcu, __uprobe_free);
}

static inline int valid_vma(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return 0;

	if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) ==
			     (VM_READ|VM_EXEC))
		return 1;

	return 0;
}

int register_uprobe(struct uprobe *uprobe)
{
	struct vm_area_struct *vma;

	inode_get(uprobe->inode);
	atomic_set(&uprobe->ref, 1);

	uprobes_add(uprobe);	/* add before the rmap walk, so that
				   new mmap()s will find it too */

	for_each_rmap_vma(vma, uprobe->inode->i_mapping) {
		struct mm_struct *mm = vma->vm_mm;
		struct task_struct *p;
		int install_probe = 0;

		if (!valid_vma(vma))
			continue;

		for_each_task_in_process(p, mm->owner) {
			if (uprobe->filter(p)) {
				p->has_uprobe = 1;
				install_probe = 1;
			}
		}

		if (install_probe) {
			mm->has_uprobes = 1;
			frob_text(uprobe, mm);
		}
	}

	return 0;
}

void unregister_uprobe(struct uprobe *uprobe)
{
	/* pretty much the same rmap walk, except restore the original text */

	uprobes_del(uprobe);
	put_uprobe(uprobe);
}

void uprobe_fork(struct task_struct *child)
{
	struct vm_area_struct *vma;

	if (!child->mm->has_uprobes)
		return;

	for_each_vma(vma, child->mm) {
		struct uprobe *uprobe;

		if (!valid_vma(vma))
			continue;

		for_each_probe_in_mapping(uprobe, vma->vm_file->f_mapping) {
			if (uprobe->filter(child)) {
				child->has_uprobe = 1;
				return;
			}
		}
	}
}

void uprobe_mmap(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	struct task_struct *p;
	struct uprobe *uprobe;

	if (!valid_vma(vma))
		return;

	for_each_probe_in_mapping(uprobe, vma->vm_file->f_mapping) {
		int install_probe = 0;

		for_each_task_in_process(p, mm->owner) {
			if (uprobe->filter(p)) {
				p->has_uprobe = 1;
				install_probe = 1;
			}
		}

		if (install_probe) {
			mm->has_uprobes = 1;
			frob_text(uprobe, mm);
		}
	}
}

void uprobe_hit(struct pt_regs *regs)
{
	unsigned long addr = instruction_pointer(regs);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct uprobe *uprobe;
	unsigned long offset, ret_addr;
	void *slot;
	int cpu;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);

	if (!vma || !valid_vma(vma)) {
		up_read(&mm->mmap_sem);
		goto fail;
	}

	/* translate the vaddr into a file offset, to look up the probe */
	offset = addr - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT);

	uprobe = uprobes_find_get(vma->vm_file->f_mapping, offset);
	up_read(&mm->mmap_sem);

	if (!uprobe)
		goto fail;

	if (current->has_uprobe && uprobe->filter(current))
		uprobe->handler();

	/* set up the boosted slot: original insn plus jump back */
	ret_addr = addr + uprobe->insn_size;

	cpu = get_cpu();
	slot = get_slot(cpu);
	memcpy(slot, uprobe->replacement, SLOT_SIZE);
	memcpy(slot + uprobe->ret_addr_offset, &ret_addr, sizeof(unsigned long));
	set_instruction_pointer(regs, uaddr_addr_of(slot));
	put_cpu(); /* preemption notifiers would take it from here */

	put_uprobe(uprobe);
	return;

fail:
	send_sig(SIGTRAP, current, 0);
}

See, no extra traps, no funny intermediate data structures to manage,
and you get the power of ->filter() to implement whatever policy you
want, including simple process wide things.
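
For instance, the "trace just one instance" case from the question above is
nothing more than a filter matching the mm of that instance. A rough sketch
on top of the structures above (traced_mm, single_instance_filter, my_handler
and example_register are made-up names, and comparing mm pointers is just one
possible way to express "this one process"):

/* hypothetical example: fire only for the one process instance we care about */
static struct mm_struct *traced_mm;

static int single_instance_filter(struct task_struct *t)
{
	/* true for every thread of that one process, false for everybody else */
	return t->mm == traced_mm;
}

static int my_handler(void)
{
	/* whatever the tracer wants to do on a hit */
	return 0;
}

static struct uprobe my_uprobe = {
	.handler = my_handler,
	.filter  = single_instance_filter,
};

static void example_register(struct inode *inode, unsigned long offset,
			     struct mm_struct *mm)
{
	traced_mm	 = mm;
	my_uprobe.inode	 = inode;	/* register_uprobe() grabs its own ref */
	my_uprobe.offset = offset;
	register_uprobe(&my_uprobe);
}

A pid-, uid- or system-wide filter would look just the same to the core; only
the body of ->filter() changes, the register/mmap/fork paths above stay as
they are.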