[PATCH 0/2] Use a single loader for i386 and x86_64

From: Glauber de Oliveira Costa
Date: Mon Apr 02 2007 - 17:19:24 EST


This patch moves lguest.c one level below and enhances it with the
ability to kick off 64-bit binaries. It would be much easier to just
#ifdef the functions, but having x86_64 machines load 32-bit kernels
is a longer-term goal of mine, and that's why the patch introduces
the load_elf_header() function.
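
In short, main() now reads a worst-case Elf64_Ehdr and hands its
e_ident block to load_elf_header(), which picks the loader from the
EI_CLASS byte. Below is a simplified, standalone sketch of that
selection logic; map_elf32/map_elf64, load_bzimage and
finish32/finish64 are the functions added by the patch and are only
named in the output here:

#include <elf.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	Elf64_Ehdr hdr;	/* worst case: large enough for both classes */
	int fd;

	if (argc != 2)
		errx(1, "usage: %s <kernel image>", argv[0]);

	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading %s elf header", argv[1]);

	/* Same dispatch as load_elf_header(): class first, then magic. */
	switch (hdr.e_ident[EI_CLASS]) {
	case ELFCLASS32:
		if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
			printf("ELF32 image: map_elf32() + finish32()\n");
		else
			printf("bzImage: load_bzimage() + finish32()\n");
		break;
	case ELFCLASS64:
		if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
			printf("ELF64 image: map_elf64() + finish64()\n");
		else
			printf("bzImage: load_bzimage() + finish64()\n");
		break;
	default:
		errx(1, "Could not identify file class");
	}
	return 0;
}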

Signed-off-by: Glauber de Oliveira Costa <gcosta@xxxxxxxxxx>

--
Glauber de Oliveira Costa
Red Hat Inc.
"Free as in Freedom"
--- i386/lguest.c 2007-04-02 16:19:27.000000000 -0300
+++ lguest.c 2007-04-02 16:19:28.000000000 -0300
@@ -29,11 +29,22 @@
#include <sys/uio.h>
#include <termios.h>
#include <zlib.h>
+
+typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;

-#include "../../../include/asm/lguest_user.h"
+#include "../../include/asm/lguest_user.h"
+#include <lguest_defs.h>
+
+unsigned long (*finish)(unsigned long mem, unsigned long *page_offset,
+ const char *initrd, unsigned long *ird_size);
+
+typedef unsigned long (*load_function)(int, void *, unsigned long,
+ unsigned long *, const char *, unsigned long *,
+ unsigned long *);
+

#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
#define NET_PEERNUM 1
@@ -63,8 +74,8 @@ struct device

/* Watch DMA to this address if handle_input non-NULL. */
unsigned long watch_address;
- u32 (*handle_output)(int fd, const struct iovec *iov,
- unsigned int num, struct device *me);
+ unsigned long (*handle_output)(int fd, const struct iovec *iov,
+ unsigned int num, struct device *me);

/* Device-specific data. */
void *priv;
@@ -78,7 +89,7 @@ static int zero_fd;
FIXME: vdso gets mapped just under it, and we need to protect that. */
#define RESERVE_TOP LGUEST_GUEST_TOP - 1024*1024

-static u32 memparse(const char *ptr)
+static unsigned long memparse(const char *ptr)
{
char *end;
unsigned long ret = strtoul(ptr, &end, 0);
@@ -142,8 +153,8 @@ static void map_memory(unsigned long mem
err(1, "Mmaping /dev/zero for %li bytes", mem);
}

-static u32 finish(unsigned long mem, unsigned long *page_offset,
- const char *initrd, unsigned long *ird_size)
+static unsigned long finish32(unsigned long mem, unsigned long *page_offset,
+ const char *initrd, unsigned long *ird_size)
{
u32 *pgdir = NULL, *linear = NULL;
int i, pte_pages;
@@ -169,7 +180,7 @@ static u32 finish(unsigned long mem, uns
/* Now set up pgd so that this memory is at page_offset */
for (i = 0; i < mem / getpagesize(); i += getpagesize()/sizeof(u32)) {
pgdir[(i + *page_offset/getpagesize())/1024]
- = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
+ = (((u32)(long)linear + i*sizeof(u32)) | PAGE_PRESENT);
verbose("Top level %lu = %#08x\n",
(i + *page_offset/getpagesize())/1024,
pgdir[(i + *page_offset/getpagesize())/1024]);
@@ -178,8 +189,14 @@ static u32 finish(unsigned long mem, uns
return (unsigned long)pgdir;
}

+static unsigned long finish64(unsigned long mem, unsigned long *page_offset,
+ const char *initrd, unsigned long *ird_size)
+{
+ return 0;
+}
+
/* Returns the entry point */
-static u32 map_elf(int elf_fd, const Elf32_Ehdr *ehdr, unsigned long mem,
+static unsigned long map_elf32(int elf_fd, const Elf32_Ehdr *ehdr, unsigned long mem,
unsigned long *pgdir_addr,
const char *initrd, unsigned long *ird_size,
unsigned long *page_offset)
@@ -210,7 +227,7 @@ static u32 map_elf(int elf_fd, const Elf
continue;

verbose("Section %i: size %i addr %p\n",
- i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+ i, phdr[i].p_memsz, (void *)(long)phdr[i].p_paddr);
/* We map everything private, writable. */
if (phdr[i].p_paddr + phdr[i].p_memsz > mem)
errx(1, "Segment %i overlaps end of memory", i);
@@ -227,6 +244,77 @@ static u32 map_elf(int elf_fd, const Elf
phdr[i].p_offset -= (phdr[i].p_paddr % getpagesize());
phdr[i].p_paddr -= (phdr[i].p_paddr % getpagesize());
}
+ addr = mmap((void *)(long)phdr[i].p_paddr,
+ phdr[i].p_filesz,
+ PROT_READ|PROT_WRITE|PROT_EXEC,
+ MAP_FIXED|MAP_PRIVATE,
+ elf_fd, phdr[i].p_offset);
+ if (addr != (void *)(long)phdr[i].p_paddr)
+ err(1, "Mmaping vmlinux segment %i returned %p not %p (%p)",
+ i, addr, (void *)(long)phdr[i].p_paddr, &phdr[i].p_paddr);
+ }
+
+ *pgdir_addr = finish(mem, page_offset, initrd, ird_size);
+ /* Entry is physical address: convert to virtual */
+ return ehdr->e_entry + *page_offset;
+}
+
+/* Returns the entry point */
+static unsigned long map_elf64(int elf_fd, const Elf64_Ehdr *ehdr, unsigned long mem,
+ unsigned long *pgdir_addr,
+ const char *initrd, unsigned long *ird_size,
+ unsigned long *page_offset)
+{
+#ifdef CONFIG_X86_64
+ void *addr;
+ Elf64_Phdr phdr[ehdr->e_phnum];
+ unsigned int i;
+ Elf64_Shdr sec[ehdr->e_shnum];
+ Elf64_Sym *syms;
+ char *strtab = NULL;
+ unsigned long nsyms = 0;
+
+ /* Sanity checks. */
+ if (ehdr->e_type != ET_EXEC
+ || ehdr->e_machine != EM_X86_64
+ || ehdr->e_phentsize != sizeof(Elf64_Phdr)
+ || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf64_Phdr))
+ errx(1, "Malformed elf64 header");
+
+ if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
+ err(1, "Seeking to program headers");
+ if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
+ err(1, "Reading program headers");
+
+ map_memory(mem);
+
+ *page_offset = 0;
+ /* We map the loadable segments at virtual addresses corresponding
+ * to their physical addresses (our virtual == guest physical). */
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ if (phdr[i].p_type != PT_LOAD)
+ continue;
+
+ verbose("Section %i: size %li addr %p\n",
+ i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+ /* We map everything private, writable. */
+ if (phdr[i].p_paddr + phdr[i].p_memsz > mem)
+ errx(1, "Segment %i overlaps end of memory", i);
+
+ /* We expect linear address space. */
+ if (!*page_offset)
+ *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
+ else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) &&
+ phdr[i].p_vaddr != VSYSCALL_START)
+ errx(1, "Page offset of section %i different (got %lx, expected %lx)",
+ i, (phdr[i].p_vaddr - phdr[i].p_paddr), *page_offset);
+
+ /* Recent ld versions don't page align any more. */
+ if (phdr[i].p_paddr % getpagesize()) {
+ phdr[i].p_filesz += (phdr[i].p_paddr % getpagesize());
+ phdr[i].p_offset -= (phdr[i].p_paddr % getpagesize());
+ phdr[i].p_paddr -= (phdr[i].p_paddr % getpagesize());
+ }
addr = mmap((void *)phdr[i].p_paddr,
phdr[i].p_filesz,
PROT_READ|PROT_WRITE|PROT_EXEC,
@@ -237,9 +325,67 @@ static u32 map_elf(int elf_fd, const Elf
i, addr, (void *)phdr[i].p_paddr, &phdr[i].p_paddr);
}

- *pgdir_addr = finish(mem, page_offset, initrd, ird_size);
+ /* Now process sections searching for boot page tables
+ * Start by finding the symtab section */
+ if (lseek(elf_fd, ehdr->e_shoff, SEEK_SET) < 0)
+ err(1, "Seeking to section headers");
+ if (read(elf_fd, sec, sizeof(sec)) != sizeof(sec))
+ err(1, "Reading section headers");
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sec[i].sh_type == SHT_SYMTAB) {
+ int ret = 0;
+ syms = malloc(sec[i].sh_size);
+ if (!syms)
+ err(1,"Not enough memory for symbol table");
+ ret = lseek(elf_fd, sec[i].sh_offset, SEEK_SET);
+ if (ret < 0)
+ err(1, "Seeking to symbol table");
+ ret = read(elf_fd, syms, sec[i].sh_size);
+ if (ret != sec[i].sh_size)
+ err(1, "Reading symbol table");
+ nsyms = sec[i].sh_size / sizeof(Elf64_Sym);
+
+
+ /* symtab links to strtab. We use it to find symbol
+ * names */
+ strtab = malloc(sec[sec[i].sh_link].sh_size);
+ if (!strtab)
+ err(1,"Not enough memory for string table");
+ ret = lseek(elf_fd, sec[sec[i].sh_link].sh_offset , SEEK_SET);
+ if (ret < 0)
+ err(1, "Seeking to string table");
+ ret = read(elf_fd, strtab, sec[sec[i].sh_link].sh_size);
+ if (ret != sec[sec[i].sh_link].sh_size)
+ err(1, "Reading string table");
+ break;
+ }
+ }
+
+ /* We now have a pointer to the symtab, start searching for the symbol */
+ for (i = 0; i < nsyms; i++) {
+ if ((syms[i].st_shndx == SHN_UNDEF) || !syms[i].st_name)
+ continue;
+ if (!strcmp("boot_level4_pgt",
+ (char *)((u64)syms[i].st_name + strtab))) {
+ *pgdir_addr = syms[i].st_value - *page_offset;
+ break;
+ }
+ }
+
+ if (!*pgdir_addr)
+ err(1,"Unable to find boot pgdir");
+
+ *ird_size = load_initrd(initrd, mem);
+
/* Entry is physical address: convert to virtual */
+ printf("entry=%lx page_offset=%lx entry+page_offset=%lx\n",
+ ehdr->e_entry, *page_offset, ehdr->e_entry + *page_offset);
return ehdr->e_entry + *page_offset;
+#else
+ errno = EINVAL;
+ err(1, "Too many bits! i386 architecture cannot load 64 bit kernels");
+#endif
}

static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
@@ -254,9 +400,9 @@ static unsigned long intuit_page_offset(
errx(1, "could not determine page offset");
}

-static u32 bzimage(int fd, unsigned long mem, unsigned long *pgdir_addr,
- const char *initrd, unsigned long *ird_size,
- unsigned long *page_offset)
+static unsigned long bzimage(int fd, unsigned long mem, unsigned long *pgdir_addr,
+ const char *initrd, unsigned long *ird_size,
+ unsigned long *page_offset)
{
gzFile f;
int ret, len = 0;
@@ -277,13 +423,13 @@ static u32 bzimage(int fd, unsigned long
*pgdir_addr = finish(mem, page_offset, initrd, ird_size);

/* Entry is physical address: convert to virtual */
- return (u32)img + *page_offset;
+ return (long)img + *page_offset;
}

-static u32 load_bzimage(int bzimage_fd, const Elf32_Ehdr *ehdr,
- unsigned long mem, unsigned long *pgdir_addr,
- const char *initrd, unsigned long *ird_size,
- unsigned long *page_offset)
+static unsigned long load_bzimage(int bzimage_fd, const Elf32_Ehdr *ehdr,
+ unsigned long mem, unsigned long *pgdir_addr,
+ const char *initrd, unsigned long *ird_size,
+ unsigned long *page_offset)
{
unsigned char c;
int state = 0;
@@ -363,7 +509,7 @@ static struct device *new_device(struct
int fd,
int (*handle_input)(int, struct device *),
unsigned long watch_off,
- u32 (*handle_output)(int,
+ unsigned long (*handle_output)(int,
const struct iovec *,
unsigned,
struct device *))
@@ -384,16 +530,16 @@ static struct device *new_device(struct
return dev;
}

-static int tell_kernel(u32 pagelimit, u32 pgdir, u32 start, u32 page_offset)
+static int tell_kernel(long pagelimit, long pgdir, long start, long page_offset)
{
- u32 args[] = { LHREQ_INITIALIZE,
+ unsigned long args[] = { LHREQ_INITIALIZE,
pagelimit, pgdir, start, page_offset };
int fd = open("/dev/lguest", O_RDWR);

if (fd < 0)
err(1, "Opening /dev/lguest");

- verbose("Telling kernel limit %u, pgdir %i, e=%#08x page_off=0x%08x\n",
+ verbose("Telling kernel limit %lu, pgdir %li, e=%#08lx page_off=0x%08lx\n",
pagelimit, pgdir, start, page_offset);
if (write(fd, args, sizeof(args)) < 0)
err(1, "Writing to /dev/lguest");
@@ -423,7 +569,7 @@ static void *_check_pointer(unsigned lon
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)

/* Returns pointer to dma->used_len */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+static unsigned long *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
{
unsigned int i;
struct lguest_dma *udma;
@@ -446,12 +592,12 @@ static u32 *dma2iov(unsigned long dma, s
return &udma->used_len;
}

-static u32 *get_dma_buffer(int fd, void *addr,
+static unsigned long *get_dma_buffer(int fd, void *addr,
struct iovec iov[], unsigned *num, u32 *irq)
{
- u32 buf[] = { LHREQ_GETDMA, (u32)addr };
+ unsigned long buf[] = { LHREQ_GETDMA, (unsigned long)addr };
unsigned long udma;
- u32 *res;
+ unsigned long *res;

udma = write(fd, buf, sizeof(buf));
if (udma == (unsigned long)-1)
@@ -466,7 +612,7 @@ static u32 *get_dma_buffer(int fd, void

static void trigger_irq(int fd, u32 irq)
{
- u32 buf[] = { LHREQ_IRQ, irq };
+ unsigned long buf[] = { LHREQ_IRQ, irq };
if (write(fd, buf, sizeof(buf)) != 0)
err(1, "Triggering irq %i", irq);
}
@@ -486,7 +632,8 @@ struct console_abort
/* We DMA input to buffer bound at start of console page. */
static int handle_console_input(int fd, struct device *dev)
{
- u32 num, irq = 0, *lenp;
+ u32 num, irq = 0;
+ unsigned long *lenp;
int len;
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
struct console_abort *abort = dev->priv;
@@ -535,19 +682,20 @@ static unsigned long peer_offset(unsigne
return 4 * peernum;
}

-static u32 handle_tun_output(int fd, const struct iovec *iov,
- unsigned num, struct device *dev)
+static unsigned long handle_tun_output(int fd, const struct iovec *iov,
+ unsigned num, struct device *dev)
{
/* Now we've seen output, we should warn if we can't get buffers. */
*(bool *)dev->priv = true;
return writev(dev->fd, iov, num);
}

-static u32 handle_block_output(int fd, const struct iovec *iov,
- unsigned num, struct device *dev)
+static unsigned long handle_block_output(int fd, const struct iovec *iov,
+ unsigned num, struct device *dev)
{
struct lguest_block_page *p = dev->mem;
- u32 irq, reply_num, *lenp;
+ u32 irq, reply_num;
+ unsigned long *lenp;
int len;
struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
off64_t device_len, off = (off64_t)p->sector * 512;
@@ -555,11 +703,13 @@ static u32 handle_block_output(int fd, c
device_len = *(off64_t *)dev->priv;

if (off >= device_len)
- err(1, "Bad offset %llu vs %llu", off, device_len);
+ err(1, "Bad offset %llu vs %llu", (unsigned long long)off,
+ (unsigned long long)device_len);
if (lseek64(dev->fd, off, SEEK_SET) != off)
err(1, "Bad seek to sector %i", p->sector);

- verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+ verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ",
+ (unsigned long long)off);

lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
if (!lenp)
@@ -569,7 +719,8 @@ static u32 handle_block_output(int fd, c
len = writev(dev->fd, iov, num);
if (off + len > device_len) {
ftruncate(dev->fd, device_len);
- errx(1, "Write past end %llu+%u", off, len);
+ errx(1, "Write past end %llu+%u",
+ (unsigned long long)off, len);
}
*lenp = 0;
} else {
@@ -639,7 +790,8 @@ static void wakeup(int signo)

static int handle_tun_input(int fd, struct device *dev)
{
- u32 irq = 0, num, *lenp;
+ u32 irq = 0, num;
+ unsigned long *lenp;
int len;
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];

@@ -836,8 +988,8 @@ static void setup_block_file(const char
(void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
}

-static u32 handle_console_output(int fd, const struct iovec *iov,
- unsigned num, struct device*dev)
+static unsigned long handle_console_output(int fd, const struct iovec *iov,
+ unsigned num, struct device*dev)
{
return writev(STDOUT_FILENO, iov, num);
}
@@ -871,11 +1023,11 @@ static const char *get_arg(const char *a
return NULL;
}

-static u32 handle_device(int fd, unsigned long dma, unsigned long addr,
+static long handle_device(int fd, unsigned long dma, unsigned long addr,
struct devices *devices)
{
struct device *i;
- u32 *lenp;
+ unsigned long *lenp;
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
unsigned num = 0;

@@ -916,20 +1068,45 @@ static void handle_input(int fd, int chi
}
}

+static unsigned long load_elf_header(unsigned char *elf_nident)
+{
+ errno = 0;
+ switch (*(elf_nident+EI_CLASS)) {
+ case ELFCLASS32:
+ finish = finish32;
+ if (memcmp(elf_nident, ELFMAG, SELFMAG) == 0)
+ return (unsigned long)map_elf32;
+ else
+ return (unsigned long)load_bzimage;
+ break;
+ case ELFCLASS64:
+ finish = finish64;
+ if (memcmp(elf_nident, ELFMAG, SELFMAG) == 0)
+ return (unsigned long)map_elf64;
+ else
+ return (unsigned long)load_bzimage;
+ break;
+ default:
+ /* unrecognized class */
+ errno = EINVAL;
+ return 0;
+ }
+
+}
+
int main(int argc, char *argv[])
{
unsigned long mem, pgdir, entry, initrd_size, page_offset;
int arg, kern_fd, fd, child, pipefd[2];
- Elf32_Ehdr hdr;
+ /* Worst case */
+ Elf64_Ehdr hdr;
struct sigaction act;
sigset_t sigset;
struct lguest_device_desc *devdescs;
struct devices devices;
struct lguest_boot_info *boot = (void *)0;
const char *initrd_name = NULL;
- u32 (*load)(int, const Elf32_Ehdr *ehdr, unsigned long,
- unsigned long *, const char *, unsigned long *,
- unsigned long *);
+ load_function load;

if (argv[1] && strcmp(argv[1], "--verbose") == 0) {
verbose = true;
@@ -954,10 +1131,10 @@ int main(int argc, char *argv[])
if (read(kern_fd, &hdr, sizeof(hdr)) != sizeof(hdr))
err(1, "Reading %s elf header", argv[2]);

- if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
- load = map_elf;
- else
- load = load_bzimage;
+ load = (load_function)load_elf_header(hdr.e_ident);
+
+ if (!load)
+ err(1, "Could not identify file class");

devices.max_infd = -1;
devices.dev = NULL;