#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>

#define MEG (1024*1024)
#define PER_PROCESS_ALLOC (256*MEG)

#define NUM_SYNC_WORDS (256/32)

typedef struct shared_data_s
{
  int sync[NUM_SYNC_WORDS];
  pid_t pid[256];
} shared_data_t;
typedef shared_data_t *shared_data_p;

shared_data_p shared_data;
int num_cpus;
void *map;
int n;

/* This test also works on x86, but only fails
   on x86_64.  */
#if __x86_64__

#define LOCK_PREFIX "lock ; "

int
atomic_cas (int *ptr, int old, int new)
{
  int prev;
  __asm__ __volatile__ (LOCK_PREFIX "cmpxchgl %1,%2":"=a" (prev):"q" (new),
			"m" (*ptr), "0" (old):"memory");
  return prev == old;
}
#else
# error this test must be run on an x86_64 cpu
#endif

int
get_bit (int *bits, int bitnum)
{
  int *word = bits + (bitnum / 32);
  int bit = (1 << (bitnum % 32));
  return (*word & bit) != 0;
}

void
atomic_set_bit (int *bits, int bitnum)
{
  int *word = bits + (bitnum / 32);
  int bit = (1 << (bitnum % 32));
  int old_val, new_val;
  do
    {
      old_val = *word;
      new_val = old_val | bit;
    }
  while (!atomic_cas (word, old_val, new_val));
}

void
barrier ()
{
  int *sync = shared_data->sync;
  atomic_set_bit (sync, n);	/* set my bit */
  if (n == 0)
    {
      int i;
      /* spin until all bits are set */
      for (i = 0; i < num_cpus; ++i)
	while (!get_bit (sync, i)) /* loop */ ;
      /* all set, open the barrier. */
      for (i = 0; i < (num_cpus + 31) / 32; ++i)
	sync[i] = 0;
    }
  else
    while (get_bit (sync, n)) /* spin on my bit set */ ;
}

void
run_test ()
{
  int nxt = (n + 1) % num_cpus;
  char *s;
  char *cp;
  int i;
  int c;
  char *buf;
  buf = malloc (PER_PROCESS_ALLOC);
  if (!buf)
    {
      perror ("malloc");
      abort ();
    }
  c = 'A' + n % 26;
  memset (buf, c, PER_PROCESS_ALLOC);
  barrier ();
  /* write the data for the next process */
  s = map + nxt * PER_PROCESS_ALLOC;
  c = 'A' + nxt % 26;
  memset (s, c, PER_PROCESS_ALLOC);
  barrier ();
  /* read our data */
  s = map + n * PER_PROCESS_ALLOC;
  if (memcmp (s, buf, PER_PROCESS_ALLOC))
    {
      fprintf (stderr, "%d: data mismatch\n", n);
      abort ();
    }
  barrier ();
  exit (0);
}

int
main (int argc, char *argv[])
{
  char *pgm = argv[0];
  int mask;
  int fd;
  int pid;
  off_t alloc_size;
  int wait_status;
  num_cpus = (int) sysconf (_SC_NPROCESSORS_ONLN);
  if (num_cpus <= 0)
    {
      perror ("sysconf");
      abort ();
    }
  printf ("%d cpus\n", num_cpus);
  printf ("mapping %ld megs per process\n",
	  (long int) PER_PROCESS_ALLOC / MEG);
  shared_data =
    mmap ((void *) 0, 64 * 1024, PROT_READ | PROT_WRITE,
	  MAP_SHARED | MAP_ANONYMOUS, 0, (off_t) 0);
  if (shared_data == MAP_FAILED)
    {
      perror ("mmap");
      abort ();
    }
  memset (shared_data, '\0', sizeof (shared_data_t));
  alloc_size = num_cpus * PER_PROCESS_ALLOC;
  map = mmap ((void *) 0, alloc_size, PROT_READ | PROT_WRITE,
	      MAP_SHARED | MAP_ANONYMOUS, 0, (off_t) 0);
  if (map == MAP_FAILED)
    {
      perror ("mmap");
      abort ();
    }
  for (n = 0; n < num_cpus; ++n)
    {
      pid = fork ();
      if (pid == 0)
	run_test (n);		/* no return */
      else if (pid < 0)
	{
	  perror ("fork");
	  abort ();
	}
      shared_data->pid[n] = pid;
    }
  printf ("processes created, wait for completion\n");
  while ((pid = wait (&wait_status)) > 0)
    {
      int np;
      for (np = 0;
	   np < num_cpus && shared_data->pid[np] != pid; ++np) /* loop */ ;
      if (WIFEXITED (wait_status))
	{
	  int child_exit = WEXITSTATUS (wait_status);
	  if (child_exit)
	    {
	      fprintf (stderr, "%d: non-zero child exit status\n", np);
	      abort ();
	    }
	}
      else if (WIFSIGNALED (wait_status))
	{
	  int child_sig = WTERMSIG (wait_status);
	  fprintf (stderr, "%d child caught signal: %d\n", np, child_sig);
	  abort ();
	}
      printf ("process %d completed successfully\n", np);
    }
  exit (0);
}