[PATCH] Unified lguest launcher

From: Glauber de Oliveira Costa
Date: Wed Apr 04 2007 - 12:03:51 EST


This is a new version of the unified lguest launcher that applies to
the current tree. Following Rusty's suggestion, I'm no longer going out
of my way to support loading 32-bit kernels on 64-bit machines: changing
the launcher for that case would be the easy part! In the absence of
further objections, I'll commit it.

Signed-off-by: Glauber de Oliveira Costa <gcosta@xxxxxxxxxx>

--
Glauber de Oliveira Costa.
"Free as in Freedom"

"The less confident you are, the more serious you have to act."
Index: linux-2.6.20/Documentation/lguest/Makefile
===================================================================
--- linux-2.6.20.orig/Documentation/lguest/Makefile
+++ linux-2.6.20/Documentation/lguest/Makefile
@@ -1,12 +1,13 @@
# This creates the demonstration utility "lguest" which runs a Linux guest.

-# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
-# Some shells (dash - ubunu) can't handle numbers that big so we cheat.
+# We could use uname -i, but it seems to return "unknown" in a lot of places
+ARCH:=$(shell uname -m | sed s/i[3456]86/i386/)
+
include ../../.config
-LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
+include $(ARCH)/defines

CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \
- -static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds
+ -static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds -I$(ARCH)
LDLIBS:=-lz

all: lguest.lds lguest
Index: linux-2.6.20/Documentation/lguest/i386/defines
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/i386/defines
@@ -0,0 +1,4 @@
+# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary.
+# Some shells (dash - Ubuntu) can't handle numbers that big, so we cheat.
+include ../../.config
+LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
Index: linux-2.6.20/Documentation/lguest/i386/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/i386/lguest_defs.h
@@ -0,0 +1,9 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+/* LGUEST_GUEST_TOP comes from the Makefile */
+#define RESERVE_TOP_ADDRESS LGUEST_GUEST_TOP - 1024*1024
+
+#include "../../../include/linux/lguest_launcher.h"
+
+#endif
Index: linux-2.6.20/Documentation/lguest/lguest.c
===================================================================
--- linux-2.6.20.orig/Documentation/lguest/lguest.c
+++ linux-2.6.20/Documentation/lguest/lguest.c
@@ -33,7 +33,7 @@
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
-#include "../../include/linux/lguest_launcher.h"
+#include <lguest_defs.h>

#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
#define NET_PEERNUM 1
@@ -64,7 +64,7 @@ struct device

/* Watch DMA to this key if handle_input non-NULL. */
unsigned long watch_key;
- u32 (*handle_output)(int fd, const struct iovec *iov,
+ unsigned long (*handle_output)(int fd, const struct iovec *iov,
unsigned int num, struct device *me);

/* Device-specific data. */
@@ -94,20 +94,29 @@ static void *map_zeroed_pages(unsigned l
}

/* Returns the entry point */
-static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
+
+static unsigned long map_elf(int elf_fd, const void *hdr,
unsigned long *page_offset)
{
- void *addr;
+#ifndef __x86_64__
+ const Elf32_Ehdr *ehdr = hdr;
Elf32_Phdr phdr[ehdr->e_phnum];
+#else
+ const Elf64_Ehdr *ehdr = hdr;
+ Elf64_Phdr phdr[ehdr->e_phnum];
+#endif
+ void *addr;
unsigned int i;

/* Sanity checks. */
if (ehdr->e_type != ET_EXEC
- || ehdr->e_machine != EM_386
- || ehdr->e_phentsize != sizeof(Elf32_Phdr)
- || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
+ || ((ehdr->e_machine != EM_386) &&
+ (ehdr->e_machine != EM_X86_64))
+ || ehdr->e_phentsize != sizeof(phdr[0])
+ || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(phdr[0]))
errx(1, "Malformed elf header");

+
if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
err(1, "Seeking to program headers");
if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
@@ -120,13 +129,17 @@ static unsigned long map_elf(int elf_fd,
if (phdr[i].p_type != PT_LOAD)
continue;

- verbose("Section %i: size %i addr %p\n",
- i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+ verbose("Section %i: size %lu addr %p\n",
+ i, (unsigned long)phdr[i].p_memsz, (void *)phdr[i].p_paddr);

/* We expect linear address space. */
if (!*page_offset)
*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
- else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+ else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
+#ifdef __x86_64__
+ && (phdr[i].p_vaddr != VSYSCALL_START)
+#endif
+ )
errx(1, "Page offset of section %i different", i);

/* We map everything private, writable. */
@@ -210,15 +223,18 @@ static unsigned long load_bzimage(int fd
errx(1, "Could not find kernel in bzImage");
}

-static unsigned long load_kernel(int fd, unsigned long *page_offset)
+/* On x86_64 systems, the page table hierarchy does not start at a fixed
+ * location; instead it is found via a symbol called boot_level4_pgt. Because
+ * of this, the ELF header must outlive this function so the pgdir can be looked up */
+static unsigned long load_kernel(int fd, unsigned long *page_offset, void *ehdr)
{
- Elf32_Ehdr hdr;
+ Elf64_Ehdr *hdr = ehdr;

- if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+ if (read(fd, hdr, sizeof(*hdr)) != sizeof(*hdr))
err(1, "Reading kernel");

- if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
- return map_elf(fd, &hdr, page_offset);
+ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) == 0)
+ return map_elf(fd, ehdr, page_offset);

return load_bzimage(fd, page_offset);
}
@@ -250,6 +266,7 @@ static inline unsigned long page_align(u
return ((addr + getpagesize()-1) & ~(getpagesize()-1));
}

+#ifndef __x86_64__
static unsigned long setup_pagetables(unsigned long mem,
unsigned long initrd_size,
unsigned long page_offset)
@@ -285,6 +302,74 @@ static unsigned long setup_pagetables(un

return (unsigned long)pgdir;
}
+#else
+static unsigned long find_pgt_symbol(int elf_fd,
+ Elf64_Ehdr *ehdr,
+ unsigned long page_offset)
+{
+ unsigned int i;
+ Elf64_Shdr sec[ehdr->e_shnum];
+ Elf64_Sym *syms;
+ char *strtab = NULL;
+ unsigned long pgdir_addr = 0;
+ unsigned long nsyms = 0;
+
+ /* Now process sections searching for boot page tables
+ * Start by finding the symtab section */
+ if (lseek(elf_fd, ehdr->e_shoff, SEEK_SET) < 0)
+ err(1, "Seeking to section headers");
+ if (read(elf_fd, sec, sizeof(sec)) != sizeof(sec))
+ err(1, "Reading section headers");
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sec[i].sh_type == SHT_SYMTAB) {
+ int ret = 0;
+ syms = malloc(sec[i].sh_size);
+ if (!syms)
+ err(1,"Not enough memory for symbol table");
+ ret = lseek(elf_fd, sec[i].sh_offset, SEEK_SET);
+ if (ret < 0)
+ err(1, "Seeking to symbol table");
+ ret = read(elf_fd, syms, sec[i].sh_size);
+ if (ret != sec[i].sh_size)
+ err(1, "Reading symbol table");
+ nsyms = sec[i].sh_size / sizeof(Elf64_Sym);
+
+
+ /* symtab links to strtab. We use it to find symbol
+ * names */
+ strtab = malloc(sec[sec[i].sh_link].sh_size);
+ if (!strtab)
+ err(1,"Not enough memory for string table");
+ ret = lseek(elf_fd, sec[sec[i].sh_link].sh_offset , SEEK_SET);
+ if (ret < 0)
+ err(1, "Seeking to string table");
+ ret = read(elf_fd, strtab, sec[sec[i].sh_link].sh_size);
+ if (ret != sec[sec[i].sh_link].sh_size)
+ err(1, "Reading string table");
+ break;
+ }
+ }
+
+ /* We now have a pointer to the symtab, start searching for the symbol */
+ for (i = 0; i < nsyms; i++) {
+ if ((syms[i].st_shndx == SHN_UNDEF) || !syms[i].st_name)
+ continue;
+ if (!strcmp(BOOT_PGTABLE,
+ (char *)((u64)syms[i].st_name + strtab))) {
+ pgdir_addr = syms[i].st_value - page_offset;
+ break;
+ }
+ }
+
+ if (!pgdir_addr) {
+ errno = ESRCH;
+ err(1,"Unable to find boot pgdir");
+ }
+
+ return pgdir_addr;
+}
+#endif

static void concat(char *dst, char *args[])
{
@@ -299,9 +384,10 @@ static void concat(char *dst, char *args
dst[len] = '\0';
}

-static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
+static int tell_kernel(unsigned long pgdir, unsigned long start,
+ unsigned long page_offset)
{
- u32 args[] = { LHREQ_INITIALIZE,
+ unsigned long args[] = { LHREQ_INITIALIZE,
LGUEST_GUEST_TOP/getpagesize(), /* Just below us */
pgdir, start, page_offset };
int fd;
@@ -379,7 +465,8 @@ static void *_check_pointer(unsigned lon
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)

/* Returns pointer to dma->used_len */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+static unsigned long *dma2iov(unsigned long dma, struct iovec iov[],
+ unsigned *num)
{
unsigned int i;
struct lguest_dma *udma;
@@ -396,12 +483,12 @@ static u32 *dma2iov(unsigned long dma, s
return &udma->used_len;
}

-static u32 *get_dma_buffer(int fd, void *key,
+static unsigned long *get_dma_buffer(int fd, void *key,
struct iovec iov[], unsigned int *num, u32 *irq)
{
- u32 buf[] = { LHREQ_GETDMA, (u32)key };
+ unsigned long buf[] = { LHREQ_GETDMA, (unsigned long)key };
unsigned long udma;
- u32 *res;
+ unsigned long *res;

udma = write(fd, buf, sizeof(buf));
if (udma == (unsigned long)-1)
@@ -415,7 +502,7 @@ static u32 *get_dma_buffer(int fd, void

static void trigger_irq(int fd, u32 irq)
{
- u32 buf[] = { LHREQ_IRQ, irq };
+ unsigned long buf[] = { LHREQ_IRQ, irq };
if (write(fd, buf, sizeof(buf)) != 0)
err(1, "Triggering irq %i", irq);
}
@@ -443,7 +530,8 @@ struct console_abort
/* We DMA input to buffer bound at start of console page. */
static bool handle_console_input(int fd, struct device *dev)
{
- u32 irq = 0, *lenp;
+ u32 irq = 0;
+ unsigned long *lenp;
int len;
unsigned int num;
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -487,14 +575,14 @@ static bool handle_console_input(int fd,
return true;
}

-static u32 handle_console_output(int fd, const struct iovec *iov,
- unsigned num, struct device*dev)
+static unsigned long handle_console_output(int fd, const struct iovec *iov,
+ unsigned num, struct device*dev)
{
return writev(STDOUT_FILENO, iov, num);
}

-static u32 handle_tun_output(int fd, const struct iovec *iov,
- unsigned num, struct device *dev)
+static unsigned long handle_tun_output(int fd, const struct iovec *iov,
+ unsigned num, struct device *dev)
{
/* Now we've seen output, we should warn if we can't get buffers. */
*(bool *)dev->priv = true;
@@ -508,7 +596,8 @@ static unsigned long peer_offset(unsigne

static bool handle_tun_input(int fd, struct device *dev)
{
- u32 irq = 0, *lenp;
+ u32 irq = 0;
+ unsigned long *lenp;
int len;
unsigned num;
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
@@ -534,11 +623,12 @@ static bool handle_tun_input(int fd, str
return true;
}

-static u32 handle_block_output(int fd, const struct iovec *iov,
- unsigned num, struct device *dev)
+static unsigned long handle_block_output(int fd, const struct iovec *iov,
+ unsigned num, struct device *dev)
{
struct lguest_block_page *p = dev->mem;
- u32 irq, *lenp;
+ u32 irq;
+ unsigned long *lenp;
unsigned int len, reply_num;
struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
off64_t device_len, off = (off64_t)p->sector * 512;
@@ -546,11 +636,14 @@ static u32 handle_block_output(int fd, c
device_len = *(off64_t *)dev->priv;

if (off >= device_len)
- err(1, "Bad offset %llu vs %llu", off, device_len);
+ err(1, "Bad offset %llu vs %llu",
+ (unsigned long long)off,
+ (unsigned long long) device_len);
if (lseek64(dev->fd, off, SEEK_SET) != off)
err(1, "Bad seek to sector %i", p->sector);

- verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+ verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ",
+ (unsigned long long)off);

lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
if (!lenp)
@@ -560,7 +653,8 @@ static u32 handle_block_output(int fd, c
len = writev(dev->fd, iov, num);
if (off + len > device_len) {
ftruncate(dev->fd, device_len);
- errx(1, "Write past end %llu+%u", off, len);
+ errx(1, "Write past end %llu+%u",
+ (unsigned long long)off, len);
}
*lenp = 0;
} else {
@@ -577,7 +671,7 @@ static void handle_output(int fd, unsign
struct device_list *devices)
{
struct device *i;
- u32 *lenp;
+ unsigned long *lenp;
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
unsigned num = 0;

@@ -639,7 +733,7 @@ static struct device *new_device(struct
int fd,
bool (*handle_input)(int, struct device *),
unsigned long watch_off,
- u32 (*handle_output)(int,
+ unsigned long (*handle_output)(int,
const struct iovec *,
unsigned,
struct device *))
@@ -916,9 +1010,12 @@ int main(int argc, char *argv[])
{
unsigned long mem, pgdir, start, page_offset;
int c, lguest_fd, waker_fd;
+ int elf_fd;
struct device_list device_list;
struct lguest_boot_info *boot = (void *)0;
const char *initrd_name = NULL;
+ /* 64-bit header is bigger, so this is a worst case for both */
+ Elf64_Ehdr ehdr;

device_list.max_infd = -1;
device_list.dev = NULL;
@@ -958,8 +1055,8 @@ int main(int argc, char *argv[])
map_zeroed_pages(0, mem / getpagesize());

/* Now we load the kernel */
- start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
- &page_offset);
+ elf_fd = open_or_die(argv[optind+1], O_RDONLY);
+ start = load_kernel(elf_fd, &page_offset, &ehdr);

/* Write the device descriptors into memory. */
map_device_descriptors(&device_list, mem);
@@ -969,7 +1066,11 @@ int main(int argc, char *argv[])
boot->initrd_size = load_initrd(initrd_name, mem);

/* Set up the initial linar pagetables. */
+#ifndef __x86_64__
pgdir = setup_pagetables(mem, boot->initrd_size, page_offset);
+#else
+ pgdir = find_pgt_symbol(elf_fd, &ehdr, page_offset);
+#endif

/* Give the guest the boot information it needs. */
concat(boot->cmdline, argv+optind+2);
Index: linux-2.6.20/Documentation/lguest/x86_64/defines
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/x86_64/defines
@@ -0,0 +1,4 @@
+# For now on x86_64 we'll hard code the location of the lguest binary loader.
+# But when we can get a relocatable kernel, we'll have to work to make this
+# dynamic.
+LGUEST_GUEST_TOP := 0x7f000000
Index: linux-2.6.20/Documentation/lguest/x86_64/lguest_defs.h
===================================================================
--- /dev/null
+++ linux-2.6.20/Documentation/lguest/x86_64/lguest_defs.h
@@ -0,0 +1,15 @@
+#ifndef _LGUEST_DEFS_H_
+#define _LGUEST_DEFS_H_
+
+#include <asm/vsyscall.h>
+
+/* LGUEST_GUEST_TOP comes from the Makefile */
+typedef uint64_t u64;
+#include "../../../include/asm/lguest_user.h"
+
+#define RESERVE_TOP_ADDRESS LGUEST_GUEST_TOP
+
+
+#define BOOT_PGTABLE "boot_level4_pgt"
+
+#endif