/*
  commonly-used routines
  Copyright (c) 2006-2013 Cheng Zhang

  This material is provided "as is", with absolutely no warranty expressed
  or implied. Any use is at your own risk.
  Permission to use or copy this software for any purpose is hereby granted
  without fee. Permission to modify the code and to distribute modified
  code is also granted without any restrictions.

  Usage:

  1.  It is designed for quick programming.
      For simple use, include this file and all functions will be available.
      But there might be many compiler warnings for unused functions.

  2.  You can include this file multiple times in a single file.

  3.  Function are static by default. To export functions,
      e.g., to make it easier to debug, or to avoid warnings of unused functions,
      define ZCOM_XFUNCS before the first inclusion.

  4.  To hand-pick specific set of modules, e.g.,
        #define ZCOM_PICK
        #define ZCOM_RNG
        #define ZCOM_ARGOPT
      before including this file, so other modules are skipped.

  5.  If the compiler supports keywords inline and restrict, write
        #define INLINE inline
        #define RESTRICT restrict
      before including this file. Otherwise the two keywords are guessed
      according to the compiler.

  6.  Define HAVEVAM if the compiler supports variable-argument macros.

  7.  The def module defines `real' as a double, to override it, write
        typedef float real;
        #define HAVEREAL 1
      before including this file (or equivalently define HAVE_REAL)
*/

/* ZCOM_PICK or ZCOM_NONE is used include only subset of modules
 * 1. to reduce the number of warnings for unused functions
 * 2. to reduce the compiling time
 * 3. to avoid potential name conflicts
 * By default, ZCOM_PICK is undefined, so everything is used. */
#ifdef ZCOM_NONE  /* equivalent to ZCOM_PICK */
#define ZCOM_PICK
#endif

#ifndef ZCOM_PICK
  #ifndef ZCOM_UTIL
  #define ZCOM_UTIL
  #endif
  #ifndef ZCOM_SS
  #define ZCOM_SS
  #endif
  #ifndef ZCOM_OPT
  #define ZCOM_OPT
  #endif
  #ifndef ZCOM_ARGOPT
  #define ZCOM_ARGOPT
  #endif
  #ifndef ZCOM_AV
  #define ZCOM_AV
  #endif
  #ifndef ZCOM_HIST
  #define ZCOM_HIST
  #endif
  #ifndef ZCOM_RNG
  #define ZCOM_RNG
  #endif
  #ifndef ZCOM_RV2
  #define ZCOM_RV2
  #endif
  #ifndef ZCOM_RV3
  #define ZCOM_RV3
  #endif
  #ifndef ZCOM_MD
  #define ZCOM_MD
  #endif
  #ifndef ZCOM_ISING2
  #define ZCOM_ISING2
  #endif
  #ifndef ZCOM_LJ
  #define ZCOM_LJ
  #endif
#endif

/* build dependencies */


#ifdef ZCOM_LJ
  #define ZCOM_MD
  #define ZCOM_HIST
#endif


#ifdef ZCOM_ISING2
  #define ZCOM_RNG
#endif


#ifdef ZCOM_MD
  #define ZCOM_RV3
  #define ZCOM_RV2
#endif


#ifdef ZCOM_RV3
  #define ZCOM_RNG
#endif

#ifdef ZCOM_RV2
  #define ZCOM_RNG
#endif


#ifdef ZCOM_RNG
  #define ZCOM_UTIL
#endif

#ifdef ZCOM_HIST
  #define ZCOM_UTIL
#endif

#ifdef ZCOM_AV
  #define ZCOM_UTIL
#endif


#ifdef ZCOM_ARGOPT
  #define ZCOM_OPT
#endif

#ifdef ZCOM_OPT
  #define ZCOM_SS
  #define ZCOM_UTIL
#endif


/* manage storage class: static is the safer choice
   to avoid naming conflict.  Example:
   both m.c and n.c include this file,
   m.c --> m.o, n.c --> n.o, m.o+n.o --> a.out
   static is the only way to avoid naming conflict in this case.

   In case that this file is included multiple times,
   ZCOM_XFUNCS should be defined before the first inclusion,
   otherwise it won't be effective in deciding storage class. */
#ifndef STRCLS
  #ifndef ZCOM_XFUNCS
    #define STRCLS static
  #else
    #define STRCLS
  #endif
#endif

/* inline keyword */
#ifndef INLINE
  #if defined(__GNUC__) || defined(__xlC__)
    #define INLINE STRCLS __inline__
  #elif defined(__INTEL_COMPILER)
    #define INLINE STRCLS __inline
  #elif defined(_MSC_VER) || defined(__BORLANDC__)
    #define INLINE __inline STRCLS
  #elif defined(__STDC_VERSION__) && (STDC_VERSION__ >= 199901L)
    #define INLINE STRCLS inline
  #else
    #define INLINE STRCLS
  #endif
#endif

/* restrict keyword */
#ifndef RESTRICT
  #if (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__xlC__) \
      || (defined(_MSC_VER) && _MSC_VER >= 1400))
    #define RESTRICT __restrict
  #elif defined(__STDC_VERSION__) && (STDC_VERSION__ >= 199901L)
    #define RESTRICT restrict
  #else
    #define RESTRICT
  #endif
#endif

/* macros with variable-length arguments */
#ifndef HAVEVAM
  #if (  (defined(__GNUC__) && (__GNUC__ >= 3))   \
      || (defined(__xlC__)  && (__xlC__ >= 0x0700)) \
      || (defined(_MSC_VER) && (_MSC_VER >= 1400)) )
    #define HAVEVAM 1
  #endif
#endif

#ifdef __INTEL_COMPILER
  #pragma warning push
  #pragma warning(disable:981) /* unspecified order of operands */
  #pragma warning(disable:177) /* unreferenced function */
  #pragma warning(disable:161) /* unrecognized #pragma, for omp */
#elif defined(__GNUC__) && (__GNUC__ >= 4 && __GNUC_MINOR__ >= 2)
  /* Note: intel compiler also defines __GNUC__ */
  #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
    /* diagonistic push and pop are added in GCC 4.6 */
    #pragma GCC diagnostic push
  #endif
  #pragma GCC diagnostic ignored "-Wunknown-pragmas"
  #pragma GCC diagnostic ignored "-Wvariadic-macros"
#endif

#ifdef _MSC_VER
  #pragma warning(push)
  #pragma warning(disable:4127) /* conditional expression constant */
  #pragma warning(disable:4505) /* unreferenced function */
  #pragma warning(disable:4514) /* unreferenced inline */
  #pragma warning(disable:4710) /* not inlined */
  /* suppress CRT safety warnings:
   * no need for "_s" versions of CRT functions
   * The macros below will not work if standard C library files
   * like <stdio.h> and <time.h> are included before zcom.h */
  #ifndef _CRT_SECURE_NO_DEPRECATE
  #define _CRT_SECURE_NO_DEPRECATE
  #endif
  #ifndef _CRT_SECURE_NO_WARNINGS
  #define _CRT_SECURE_NO_WARNINGS
  #endif
  #include <stdio.h>
#endif

#ifdef __BORLANDC__
  #pragma warning(push)
  #pragma warn -8027 /* not expanded inlined */
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* In addition to ZCOM_ABC, we have to define another macro ZCOM_ABC__
 * to allow multiple inclusions. */


#ifdef  ZCOM_UTIL
#ifndef ZCOM_UTIL__
#define ZCOM_UTIL__
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>
#include <math.h>
#include <float.h>


/* define a real type */
#ifdef HAVE_REAL
  #ifndef HAVEREAL
  #define HAVEREAL HAVE_REAL
  #endif
#endif

#ifndef HAVEREAL
  #define HAVEREAL 1
  typedef double real;
#endif


/* define int16_t/int32_t/int64_t, etc. */
#if (  (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) \
     || defined(__GNUC__) || defined(__INTEL_COMPILER) )
  /* C99 compatible compilers support int64_t etc.
   * but GCC and other compilers has the header even in C89/C90 mode
   * So we need to include more compilers here, see the list on
   * http://sourceforge.net/p/predef/wiki/Compilers/ */
  #include <inttypes.h>
#elif (defined(_MSC_VER) \
      || (defined(__BORLANDC__) && (__BORLANDC__ >= 0x520)))
  /* tested for Visual C++ 6.0 and Borland C++ 5.5 */
  typedef __int8              int8_t;
  typedef __int16             int16_t;
  typedef __int32             int32_t;
  typedef __int64             int64_t;
  typedef unsigned __int8     uint8_t;
  typedef unsigned __int16    uint16_t;
  typedef unsigned __int32    uint32_t;
  typedef unsigned __int64    uint64_t;
#elif defined(__unix__)
  /* a unix compiler is likely to have inttypes.h  */
  #include <inttypes.h>
#else
  /* note the following is a guess, long long is not supported
   * until a later version of visual C++ */
  typedef char                int8_t;
  typedef short               int16_t;
  typedef int                 int32_t;
  typedef long long           int64_t;
  typedef unsigned char       uint8_t;
  typedef unsigned short      uint16_t;
  typedef unsigned            uint32_t;
  typedef unsigned long long  uint64_t;
#endif


/* constant 64-bit integer */
#if defined(_MSC_VER) || defined(__BORLANDC__)
  #define CI64(x) (x ## i64)
  #define CU64(x) (x ## ui64)
#else
  #define CI64(x) (x ## ll)
  #define CU64(x) (x ## ull)
#endif


/* printf() format strings for integers
 * the macros PRId32, PRIu64, etc are defined by C99
 * we write the macros below just in case they are not defined */
#if defined(_MSC_VER) || defined(__BORLANDC__)
  #ifndef PRId32
  #define PRId32 "I32d"
  #endif
  #ifndef SCNd32
  #define SCNd32 "I32d"
  #endif
  #ifndef PRIi32
  #define PRIi32 "I32i"
  #endif
  #ifndef SCNi32
  #define SCNi32 "I32i"
  #endif
  #ifndef PRIu32
  #define PRIu32 "I32u"
  #endif
  #ifndef SCNu32
  #define SCNu32 "I32u"
  #endif
  #ifndef PRIo32
  #define PRIo32 "I32o"
  #endif
  #ifndef SCNo32
  #define SCNo32 "I32o"
  #endif
  #ifndef PRIx32
  #define PRIx32 "I32x"
  #endif
  #ifndef SCNx32
  #define SCNx32 "I32x"
  #endif
  #ifndef PRId64
  #define PRId64 "I64d"
  #endif
  #ifndef SCNd64
  #define SCNd64 "I64d"
  #endif
  #ifndef PRIi64
  #define PRIi64 "I64i"
  #endif
  #ifndef SCNi64
  #define SCNi64 "I64i"
  #endif
  #ifndef PRIu64
  #define PRIu64 "I64u"
  #endif
  #ifndef SCNu64
  #define SCNu64 "I64u"
  #endif
  #ifndef PRIo64
  #define PRIo64 "I64o"
  #endif
  #ifndef SCNo64
  #define SCNo64 "I64o"
  #endif
  #ifndef PRIx64
  #define PRIx64 "I64x"
  #endif
  #ifndef SCNx64
  #define SCNx64 "I64x"
  #endif
#else
  #ifndef PRId32
  #define PRId32 "d"
  #endif
  #ifndef SCNd32
  #define SCNd32 "d"
  #endif
  #ifndef PRIi32
  #define PRIi32 "i"
  #endif
  #ifndef SCNi32
  #define SCNi32 "i"
  #endif
  #ifndef PRIu32
  #define PRIu32 "u"
  #endif
  #ifndef SCNu32
  #define SCNu32 "u"
  #endif
  #ifndef PRIo32
  #define PRIo32 "o"
  #endif
  #ifndef SCNo32
  #define SCNo32 "o"
  #endif
  #ifndef PRIx32
  #define PRIx32 "x"
  #endif
  #ifndef SCNx32
  #define SCNx32 "x"
  #endif
  #ifndef PRId64
  #define PRId64 "lld"
  #endif
  #ifndef SCNd64
  #define SCNd64 "lld"
  #endif
  #ifndef PRIi64
  #define PRIi64 "lli"
  #endif
  #ifndef SCNi64
  #define SCNi64 "lli"
  #endif
  #ifndef PRIu64
  #define PRIu64 "llu"
  #endif
  #ifndef SCNu64
  #define SCNu64 "llu"
  #endif
  #ifndef PRIo64
  #define PRIo64 "llo"
  #endif
  #ifndef SCNo64
  #define SCNo64 "llo"
  #endif
  #ifndef PRIx64
  #define PRIx64 "llx"
  #endif
  #ifndef SCNx64
  #define SCNx64 "llx"
  #endif
#endif


/* print an error message */
INLINE void perrmsg__(const char *file, int line, const char *why,
    const char *fmt, va_list args)
{
  fprintf(stderr, "error: ");
  vfprintf(stderr, fmt, args);
  if (fmt[strlen(fmt) - 1] != '\n')
    fprintf(stderr, "\n"); /* add a new line if needed */
  if (file != NULL) fprintf(stderr, "file: %s\n", file);
  if (line >= 0) fprintf(stderr, "line: %d\n", line);
  if (why != NULL && strcmp(why, "1") != 0)
    fprintf(stderr, "cond: %s\n", why);
}


#ifdef HAVEVAM

INLINE void perrmsg_(const char *file, int line, const char *why,
    int cond, const char *fmt, ...)
{
  if (cond) {
    va_list args;
    va_start(args, fmt);
    perrmsg__(file, line, why, fmt, args);
    va_end(args);
    exit(1);
  }
}

#define die_if(cond, fmt, ...) \
  perrmsg_(__FILE__, __LINE__, #cond, cond, fmt, ## __VA_ARGS__)
#define fatal(fmt, ...)  die_if(1, fmt, ## __VA_ARGS__)

#else /* !HAVEVAM */

#define PERRMSG__(c) {                        \
  if ((#c[0] == '1' && #c[1] == '\0') || c) { \
    va_list args;                             \
    va_start(args, fmt);                      \
    perrmsg__(NULL, -1, NULL, fmt, args);     \
    va_end(args);                             \
    exit(1);                                  \
  } }
INLINE void die_if(int cond, const char *fmt, ...) PERRMSG__(cond)
INLINE void fatal(const char *fmt, ...) PERRMSG__(1)
#undef PERRMSG__

#endif /* HAVEVAM */


#ifndef xnew
#define xnew(x, n) { \
  size_t num_ = (size_t) (n); \
  die_if (num_ <= 0, \
    "cannot allocate %d objects for %s\n", (int) num_, #x); \
  die_if ((x = calloc(num_, sizeof(*(x)))) == NULL, \
    "no memory for %s x %d\n", #x, (int) num_); }
#endif


#ifndef xrenew
#define xrenew(x, n) { \
  size_t num_ = (size_t) (n); \
  die_if (num_ <= 0, \
    "cannot allocate %d objects for %s\n", (int) num_, #x); \
  die_if ((x = realloc(x, (num_)*sizeof(*(x)))) == NULL, \
    "no memory for %s x %d\n", #x, (int) num_); }
#endif


#define xfopen(fp, fn, fmt, err) \
  if ((fp = fopen(fn, fmt)) == NULL) { \
    fprintf(stderr, "cannot open file %s\n", fn); err; }


/* check if file `fn' exists */
INLINE int fexists(const char *fn)
{
  FILE *fp;
  if ((fp = fopen(fn, "r")) == NULL) return 0;
  else { fclose(fp); return 1; }
}


/* copy file */
INLINE int copyfile(const char *fninp, const char *fnout)
{
  FILE *fpinp, *fpout;
#ifndef COPYFILE_BUFSZ
#define COPYFILE_BUFSZ (64*1024)
#endif
  unsigned char buf[COPYFILE_BUFSZ];
  size_t sz, tot = 0;

  if ((fpinp = fopen(fninp, "rb")) == NULL) {
    fprintf(stderr, "copyfile: cannot read file %s\n", fninp);
    return -1;
  }
  if ((fpout = fopen(fnout, "wb")) == NULL) {
    fprintf(stderr, "copyfile: cannot write file %s\n", fnout);
    fclose(fpout);
    return -2;
  }
  while ((sz = fread(buf, sizeof(buf[1]), COPYFILE_BUFSZ, fpinp)) != 0) {
    tot += sz;
    /* note: sz may differ from COPYFILE_BUFSZ */
    if (sz != fwrite(buf, sizeof(buf[1]), sz, fpout))
      fprintf(stderr, "copyfile: error writing %s, byte %.0f\n", fnout, 1.*tot);
    if ( feof(fpinp) ) break;
  }
  fclose(fpinp);
  fclose(fpout);
  return 0;
}


/* swap two variables */
#define xtpswap(tp, x, y) { tp dum_; dum_ = (x); (x) = (y); (y) = dum_; }

#define intswap(x, y) xtpswap(int, x, y)

#define dblswap(x, y) xtpswap(double, x, y)

#define realswap(x, y) xtpswap(real, x, y)


INLINE int intmax(int x, int y) { return x > y ? x : y; }
INLINE int intmin(int x, int y) { return x < y ? x : y; }


/* confine x within [xmin, xmax] */
INLINE int intconfine(int x, int xmin, int xmax)
  { return x < xmin ? xmin : x > xmax ? xmax : x; }


INLINE int intsqr(int x) { return x * x; }


/* get the pair index from 0 to n*(n - 1)/2 - 1 */
INLINE int getpairindex(int i, int j, int n)
{
  die_if (i < 0 || i >= n || j < 0 || j >= n || i == j,
      "bad index error i %d, j %d, n %d\n", i, j, n);
  if (i > j) { int i1 = i; i = j; j = i1; }
  return n*i - (i + 1)*(i + 2)/2 + j;
}

/* return individual indices for a given pair index */
INLINE int parsepairindex(int id, int n, int *i, int *j)
{
  int i1, n1;
  die_if (id < 0 || id >= n*(n - 1)/2, "index %d too large for n %d\n", id, n);
  for (i1 = n - 2; i1 >= 0; i1--) {
    if (id >= (n1 = i1*n - i1*(i1 + 1)/2)) {
      *i = i1;
      *j = id - n1 + i1 + 1;
      return 0;
    }
  }
  return 1;
}


INLINE double dblmax(double x, double y) { return x > y ? x : y; }
INLINE double dblmin(double x, double y) { return x < y ? x : y; }


/* confine x within [xmin, xmax] */
INLINE double dblconfine(double x, double xmin, double xmax)
{ return x < xmin ? xmin : x > xmax ? xmax : x; }


INLINE double dblsqr(double x) { return x * x; }


/* sqrt(x*x + y*y) */
INLINE double dblhypot(double x, double y)
{
  double t;
  x = fabs(x);
  y = fabs(y);
  if (x <= 0.) return y;
  else if (y <= 0.) return x;
  if (x < y) t = x, x = y, y = t;
  t = y/x;
  return x*sqrt(1 + t*t);
}


/* round x to a multiple dx  */
INLINE double dblround(double x, double dx)
{
  if (x*dx > 0) return dx * (int)(x/dx + (.5 - DBL_EPSILON));
  else return -dx * (int)(-x/dx + (.5 - DBL_EPSILON));
}


/* convert to double to integer */
INLINE int dbl2int(double x)
{
  return (int) ((x < 0) ? (x - .5) : (x + .5));
}


INLINE void dblcleararr(double *x, int n)
{
  int i; for (i = 0; i < n; i++) x[i] = 0.0;
}


#ifdef HAVEREAL
INLINE real realmax(real x, real y) { return x > y ? x : y; }
INLINE real realmin(real x, real y) { return x < y ? x : y; }
/* confine x within [xmin, xmax] */
INLINE real realconfine(real x, real xmin, real xmax)
{ return x < xmin ? xmin : x > xmax ? xmax : x; }
#endif


#ifndef LNADD_DEFINED
#define LNADD_DEFINED
#define LN_BIG 50.0

/* log(exp(a) + exp(b)) */
INLINE double lnadd(double a, double b)
{
  double c;
  if (a < b) { c = a; a = b; b = c; } /* ensure a >= b */
  return ((c = a - b) > LN_BIG) ? a : a + log(1 + exp(-c));
}

/* log(exp(a) - exp(b)), only works for a > b */
INLINE double lndif(double a, double b)
{
  double c;
  die_if (a < b, "lndif: %g < %g\n", a, b);
  return ((c = a - b) > LN_BIG) ? a : a + log(1 - exp(-c));
}

/* log(exp(a)+b) */
INLINE double lnaddn(double a, double b)
{
  return (a > LN_BIG) ? a : a + log(1 + b*exp(-a));
}

#undef LN_BIG
#endif  /* LNADD_DEFINED */


#define cisalnum(c)   isalnum((unsigned char)(c))
#define cisalpha(c)   isalpha((unsigned char)(c))
#define cisdigit(c)   isdigit((unsigned char)(c))
#define cisxdigit(c)  isxdigit((unsigned char)(c))
#define cisprint(c)   isprint((unsigned char)(c))
#define cisspace(c)   isspace((unsigned char)(c))
#define cislower(c)   islower((unsigned char)(c))
#define cisupper(c)   isupper((unsigned char)(c))
#define ctolower(c)   (char) tolower((unsigned char)(c))
#define ctoupper(c)   (char) toupper((unsigned char)(c))


/* string manipulation */
#define ZSTR_XSPACEL  0x0001
#define ZSTR_XSPACER  0x0002
#define ZSTR_XSPACE   (ZSTR_XSPACEL|ZSTR_XSPACER)
#define ZSTR_COPY     0x0004
#define ZSTR_CAT      0x0008
#define ZSTR_CASE     0x0100
#define ZSTR_UPPER_   0x0200
#define ZSTR_UPPER    (ZSTR_CASE|ZSTR_UPPER_)
#define ZSTR_LOWER    ZSTR_CASE


/* remove leading and trailing spaces */
#define strip(s)  stripx(s, ZSTR_XSPACE)
#define lstrip(s) stripx(s, ZSTR_XSPACEL)
#define rstrip(s) stripx(s, ZSTR_XSPACER)
INLINE char *stripx(char *s, unsigned flags)
{
  char *p;

  if (flags & ZSTR_XSPACEL) { /* remove leading spaces */
    for (p = s; cisspace(*p); p++) ;
    if (*p == '\0') *s = '\0';
    else memmove(s, p, strlen(p) + 1);
  }
  if (flags & ZSTR_XSPACER) /* remove trailing spaces */
    for (p = s + strlen(s) - 1; p >= s && cisspace(*p); p--)
      *p = '\0';
  return s;
}


/* in the follows, size_s means the buffer size of s, i.e., sizeof(s) for static strings */
/* copy the string and convert it to upper/lower case */
#define strcpy2u(s, t, size_s) strcnv(s, t, size_s - 1, ZSTR_COPY|ZSTR_UPPER)
#define strcpy2l(s, t, size_s) strcnv(s, t, size_s - 1, ZSTR_COPY|ZSTR_LOWER)
#define strcpy_sf(s, t, size_s) strcnv(s, t, size_s - 1, ZSTR_COPY)
#define substr(s, t, start, len) strcnv(s, t+start, len, ZSTR_COPY)
/* concatenate strings, the last parameter is the buffer size of s,
 * unlike strncat(), in which it's the number of characters from *t* to be copied.  */
#define strcat_sf(s, t, size_s) strcnv(s, t, size_s - 1, ZSTR_CAT)


/* safely copy/cat strings with case conversion
 * unlike strncpy(), s is always null-terminated on return: it copies at most
 * len non-blank characters, i.e., s[len] = '\0' for the longest output */
INLINE char *strcnv(char *s, const char *t, size_t len, unsigned flags)
{
  size_t i = 0, j;
  unsigned docase = flags & ZSTR_CASE, up = flags & ZSTR_UPPER_;

  if (len == 0 || s == NULL || t == NULL) return s;
  if (flags & ZSTR_CAT) while(s[i]) i++;
  for (j = 0; i < len; i++, j++) {
    if (docase && t[j]) {
      if (up) s[i] = (char) (unsigned char) toupper((unsigned char) t[j]);
      else    s[i] = (char) (unsigned char) tolower((unsigned char) t[j]);
    } else s[i] = t[j];
    if (t[j] == 0) break;
  }
  if (i == len) s[i] = '\0';
  if (flags & ZSTR_XSPACE) stripx(s, flags); /* call strip */
  return s;
}


/* compare strings without case */
#define strcmpnc(s, t) strncmpnc(s, t, -1)
INLINE int strncmpnc(const char *s, const char *t, int n)
{
  int i, cs, ct;

  if (s == NULL || t == NULL) return 0;
  for (i = 0; ; i++) {
    if (i >= n) return 0;
    cs = s[i];
    ct = t[i];
    if (cs == 0 || ct == 0) break;
    cs = toupper( (unsigned char) cs );
    ct = toupper( (unsigned char) ct );
    if (cs != ct) break;
  }
  return cs - ct;
}


#endif /* ZCOM_UTIL__ */
#endif /* ZCOM_UTIL */


#ifdef  ZCOM_SS
#ifndef ZCOM_SS__
#define ZCOM_SS__
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <stdlib.h>

/* operation code */
enum { SSCAT = 1, SSDELETE = 2, SSSHRINK = 3, SSSINGLE = 0x1000 };

#ifndef SSMINSIZ /* to override the block size, define it before inclusion */
#define SSMINSIZ 256 /* at least sizeof(int), but change this value to 1 for debugging */
#endif
#ifndef SSHASHBITS
#define SSHASHBITS 8
#endif
#define SSHASHSIZ   (1 << SSHASHBITS)
#define SSOVERALLOC 1
#define sscalcsize_(n) (((n)/SSMINSIZ + 1) * SSMINSIZ) /* size for n nonblank characters */

struct ssheader {
  size_t size;
  size_t hashval;
  struct ssheader *next;
} ssbase_[SSHASHSIZ] = {{ 0u, 0u, NULL }};


/* we use the string address instead of that of the pointer
 * to struct ssheader to compute the Hash value,
 * because the former is more frequently used in e.g. looking-up
 * */
INLINE size_t sshashval_(const char *p)
{
  size_t val = (size_t) p * 1664525u + 1013904223u;
  return (val >> (sizeof(size_t)*8 - SSHASHBITS)) & ((1<<SSHASHBITS) - 1);
}


/* return the *previous* header to the one that associates with s
 * first locate the list from the Hash value, then enumerate the linked list.
 * */
INLINE struct ssheader *sslistfind_(const char *s)
{
  struct ssheader *hp, *head;

  if (s == NULL) return NULL;
  head = ssbase_ + sshashval_(s);
  if (head->next == NULL) return NULL; /* uninitialized head node */
  for (hp = head; hp->next != head; hp = hp->next)
    if ((char *)(hp->next + 1) == s) /* match the string address */
      return hp;
  return NULL;
}


/* simply add the entry h at the begining of the list
 * we do not accept a pre-calculated hash value,
 * since realloc might have changed it
 * */
INLINE struct ssheader *sslistadd_(struct ssheader *h)
{
  struct ssheader *head;

  head = ssbase_ + sshashval_( (char *)(h + 1) );
  if (head->next == NULL) /* initialize the base */
    head->next = head;
  h->next = head->next;
  head->next = h;
  return head;
}


/* remove hp->next */
INLINE void sslistremove_(struct ssheader *hp, int f)
{
  struct ssheader *h = hp->next;

  hp->next = h->next;
  if (f) free(h);
}


/* (re)allocate memory for (*php)->next, update list, return the new string
 * n is the number of nonempty characters, obtained e.g. from strlen().
 * create a new header if *php is NULL, in this case, the first character
 * of the string is '\0'
 * */
INLINE char *ssresize_(struct ssheader **php, size_t n, unsigned flags)
{
  struct ssheader *h = NULL, *hn, *hp;
  size_t size;

  if (php == NULL) {
    fprintf(stderr, "ssresize_: php is NULL, n = %u", (unsigned) n);
    return NULL;
  }

  /* we use the following if to assign hp and h, so the order is crucial */
  if ((hp = *php) == NULL || (h = hp->next)->size < n + 1 || !(flags & SSOVERALLOC)) {
    size = sscalcsize_(n);
    if (h == NULL || size != h->size) {
      /* remove h from the list  */
      if (hp != NULL) sslistremove_(hp, 0);
      if ((hn = calloc(sizeof(*h) + size, 1)) == NULL) {
        fprintf(stderr, "ssresize_: no memory for %u\n", (unsigned) size);
        return NULL;
      }
      if (h != NULL) {
        memcpy(hn, h, sizeof(*hn) + (size > h->size ? h->size : size));
        free(h);
      }
      h = hn;

      *php = hp = sslistadd_(h);
      hp->next->size = size;
    }
  }
  return (char *)(hp->next + 1);
}


INLINE void ssmanage_low_(struct ssheader *hp, unsigned opt)
{
  if (opt == SSDELETE)
    sslistremove_(hp, 1);
  else if (opt == SSSHRINK)
    ssresize_(&hp, strlen((char *)(hp->next + 1)), 0);
}


#define ssdel(s)       ssmanage((s), SSDELETE|SSSINGLE)
#define ssdelete(s)    { ssdel(s); (s)=NULL; }
#define ssshrink(s)    ssmanage((s), SSSHRINK|SSSINGLE)
#define ssdelall()     ssmanage(NULL, SSDELETE)
#define ssshrinkall()  ssmanage(NULL, SSHRINK)

/* delete a string, shrink memory, etc ... */
INLINE int ssmanage(char *s, unsigned flags)
{
  struct ssheader *hp, *head;
  unsigned opt = flags & 0xFF;
  size_t i;

  if (flags & SSSINGLE) { /* working on a single string */
    if (s == NULL || (hp = sslistfind_(s)) == NULL) {
      if (s) fprintf(stderr, "ssmanage: unknown address %p (%s)\n",  s, s);
      return -1;
    }
    ssmanage_low_(hp, opt);
  } else {
    for (i = 0; i < SSHASHSIZ; i++)
      for (hp = head = ssbase_ + i; hp->next && hp->next != head; hp = hp->next)
        /* we must not operate on h itself, which renders the iterator h invalid */
        ssmanage_low_(hp, opt);
  }
  return 0;
}


#define ssnew(n)       sscpycatx(NULL, NULL, (n),    0)
#define ssdup(t)       sscpycatx(NULL, (t),   0,     0)
#define sscpy(s, t)    sscpycatx(&(s), (t),   0,     0)
#define sscat(s, t)    sscpycatx(&(s), (t),   0, SSCAT)

/* copy/cat t to *ps
 *
 * If (flags & SSCAT) == 0:
 * copy t to *ps, if ps is not NULL, and return the result
 * if ps or *ps is NULL, we return a string created from t
 *   *ps is set to the same value if ps is not NULL
 * otherwise, we update the record that corresponds to *ps
 *
 * minsize: to request a minimal size for the resulting buffer
 *
 * If flags & SSCAT:
 * append t after *ps. Equivalent to cpy if ps or *ps is NULL.
 * */
INLINE char *sscpycatx(char **ps, const char *t, size_t minsize, unsigned flags)
{
  struct ssheader *hp = NULL;
  size_t size = 0u, sizes = 0u;
  char *s = NULL, *p;

  /* both ps and *ps can be NULL, in which cases we leave hp as NULL */
  if (ps != NULL && (s = *ps) != NULL && (hp = sslistfind_(s)) == NULL) {
    fprintf(stderr, "sscpycatx: unknown address %p (%s)\n", s, s);
    return NULL;
  }
  if (t != NULL)
    while (t[size]) /* compute the length of t */
      size++;
  if (flags & SSCAT) {
    if (s != NULL)  /* s is also NULL, if ps itself is NULL */
      while (s[sizes]) /* compute the length of s */
        sizes++;
    size += sizes;
  }  /* sizes is always 0 in case of copying */
  if (size < minsize)
    size = minsize;
  if ((s = ssresize_(&hp, size, SSOVERALLOC)) == NULL) { /* change size */
    return NULL;
  }
  if (t != NULL)
    for (p = s + sizes; (*p++ = *t++) != '\0'; ) /* copy/cat the string */
      ;
  if (ps != NULL)
    *ps = s;
  return s;
}


#define ssfgets(s, pn, fp)    ssfgetx(&(s), (pn), '\n', (fp))
#define ssfgetall(s, pn, fp)  ssfgetx(&(s), (pn), EOF, (fp))

/* get a string *ps from file fp
 * *ps can be NULL, in which case memory is allocated
 * *pn is number of characters read (including '\n', but not the terminal null)
 * delim is the '\n' for reading a singe line
 * */
INLINE char *ssfgetx(char **ps, size_t *pn, int delim, FILE *fp)
{
  size_t n, max;
  int c;
  char *s;
  struct ssheader *hp;

  if (ps == NULL || fp == NULL)
    return NULL;
  if ((s = *ps) == NULL) /* allocate an initial buffer if *ps is NULL */
    if ((s = sscpycatx(ps, NULL, 0, 0u)) == NULL)
      return NULL;
  if ((hp = sslistfind_(s)) == NULL) {
    fprintf(stderr, "ssfgetx: unknown address %p (%s)\n", s, s);
    return NULL;
  }
  max = hp->next->size-1;
  for (n = 0; (c = fgetc(fp)) != EOF; ) {
    if (n + 1 > max) { /* request space for n+1 nonblank characters */
      if ((*ps = s = ssresize_(&hp, n + 1, SSOVERALLOC)) == NULL)
        return NULL;
      max = hp->next->size - 1;
    }
    s[n++] = (char)(unsigned char) c;
    if (c == delim)
      break;
  }
  s[n] = '\0';
  if (pn != NULL)
    *pn = n;
  return (n > 0) ? s : NULL;
}


/* parse `s' into a string array
 * delimiters are removed */
INLINE char **ssparse(char *s, int *pn, const char *delim)
{
  const int capsz = 16;
  int cap, n;
  char **sarr, *p, *q;
  char delim0[8] = "\n\r"; /* default deliminators: new lines */

  if (pn) *pn = 0;
  if (delim == NULL) delim = delim0;

  cap = capsz;
  if ((sarr = calloc(cap, sizeof(sarr[0]))) == NULL) {
    fprintf(stderr, "no memory for sarr\n");
    return NULL;
  }
  for (n = 0, p = s; ; ) { /* n is # of lines */
    for (q = p; *q != '\0'; q++)
      if (strchr(delim, *q))
        break;
    if (q != p) { /* skip an empty line */
      sarr[n++] = p;
      if (n >= cap) { /* expand the array */
        cap += capsz;
        if ((sarr = realloc(sarr, cap * sizeof(sarr[0]))) == NULL) {
          fprintf(stderr, "no memory for sarr, %d\n", cap);
          return NULL;
        }
      }
    }
    if (*q == '\0') break; /* we are done */
    *q = '\0';
    /* search for the next starting point */
    for (p = q + 1; *p && strchr(delim, *p); p++)
      ;
    if (*p == '\0') break;
  }

  if (pn) *pn = n;
  return sarr;
}

/* free the string array, sarr[0] created by ssnew() and sarr created by malloc() */
#define ssarrfree(sarr) { ssdel(sarr[0]); free(sarr); }

#endif /* ZCOM_SS__ */
#endif /* ZCOM_SS */


#ifdef  ZCOM_OPT
#ifndef ZCOM_OPT__
#define ZCOM_OPT__


/* option either from arguments or configuration */
typedef struct {
  int isopt; /* is option (1) or argument (0) or cfg file entry (2) */
  char ch; /* single letter option flag */
  const char *sflag; /* long string flag */
  const char *key; /* key, for cfg files as in `key = val' */

  const char *val; /* raw string from command line */
  const char *desc; /* description */
  const char *fmt; /* sscanf format */
  const char *pfmt; /* printf format, NULL: to guess */
  void *ptr; /* address to the target variable */
  int ival; /* initial value, for switches */
  unsigned flags;
} opt_t;


#define OPT_MUST     0x0001  /* a mandatory argument or option */
#define OPT_SWITCH   0x0002  /* an option is a switch */
#define OPT_SET      0x0004  /* an argument/option is set */

/* translate string value in `o->val' into
 * actual ones through sscanf(), etc */
INLINE int opt_getval(opt_t *o)
{
  const char *fmt = o->fmt;

  if (fmt == NULL || fmt[0] == '\0') { /* raw string assignment */
    *((const char **) o->ptr) = o->val;
  } else if (strcmp(fmt, "%s") == 0) { /* copy the string */
    sscpy( *((char **) o->ptr), o->val);
  } else if (strcmp(fmt, "%b") == 0) { /* switch */
    /* switch the default value */
    if (o->flags & OPT_SET) return !o->ival;
    else return o->ival;
  } else { /* call sscanf */
    if (strcmp(fmt, "%r") == 0) /* real */
      fmt = (sizeof(real) == sizeof(float)) ? "%f" : "%lf";
    if (1 != sscanf(o->val, fmt, o->ptr)) {
      fprintf(stderr, "Error: unable to convert a value for [%s] as fmt [%s], raw string: [%s]\n",
          o->desc, fmt, o->val);
      return 1;
    }
  }
  return 0;
}


/* register an option
 *
 * for a configure entry, set `key' and leave `sflag' = NULL
 * for a command-line option, set `sflag' and leave `key' = NULL
 * `fmt' is the sscanf() format string
 * `*ptr' is the target variable
 * `fmt' can "%b" for a switch (like an command-line option "-v")
 * `fmt' can have a prefix `!' to mean a mandatory option
 * both NULL and "%s" of `fmt' mean string values, the type of
 *  `ptr' should be `char **', the difference is that `*ptr'
 *  is directly assigned to `o->val' during opt_getval() in the
 *  former case, but extra memory is allocated to copy `o->val'
 *  in the latter case */
INLINE void opt_set(opt_t *o, const char *sflag, const char *key,
    const char *fmt, void *ptr, const char *desc)
{
  o->ch = '\0';
  if (key) { /* cfg file `key = val', not a command-line argument */
    o->isopt = 2;
  } else if (sflag) { /* option */
    o->isopt = 1;
    o->ch = (char) ( sflag[2] ? '\0' : sflag[1] ); /* no ch for a long flag */
  } else { /* argument */
    o->isopt = 0;
  }
  o->sflag = sflag;
  o->key = key;
  o->flags = 0;
  die_if (ptr == NULL, "null pass to opt with %s: %s\n", sflag, desc);
  o->ptr = ptr;
  if (fmt == NULL) fmt = "";
  if (fmt[0] == '!') {
    fmt++;
    o->flags |= OPT_MUST;
  }
  die_if (fmt[0] != '\0' && fmt[0] != '%',
      "unknown format (missing `%%') flag `%s\', fmt `%s', description: %s\n",
      sflag, fmt, desc);
  if (strcmp(fmt, "%b") == 0) {
    fmt = "%d";
    o->flags |= OPT_SWITCH;
    o->ival = *((int *) ptr); /* save the initial value */
  }
  o->fmt = fmt;
  o->pfmt = NULL;
  o->desc = desc;
}


/* print the value of o->ptr */
#define opt_printptr(o) opt_fprintptr(stderr, o)
INLINE void opt_fprintptr(FILE *fp, opt_t *o)
{
  const char *fmt;

  for (fmt = o->fmt; *fmt && *fmt != '%'; fmt++) ;
#define ELIF_PF_(fm, fmp, type) \
  else if (strcmp(fmt, fm) == 0) \
    fprintf(fp, (o->pfmt ? o->pfmt : fmp), *(type *)o->ptr)

  if (fmt == NULL || *fmt == '\0' || strcmp(fmt, "%s") == 0)
    fprintf(fp, "%s", (*(char **) o->ptr) ? (*(char **) o->ptr) : "NULL");
  ELIF_PF_("%b", "%d", int); /* switch */
  ELIF_PF_("%d", "%d", int);
  ELIF_PF_("%u", "%u", unsigned);
  ELIF_PF_("%x", "0x%x", unsigned);
  ELIF_PF_("%ld", "%ld", long);
  ELIF_PF_("%lu", "%lu", unsigned long);
  ELIF_PF_("%lx", "0x%lx", unsigned long);
#if 0  /* C99 only */
  ELIF_PF_("%lld", "%lld", long long);
  ELIF_PF_("%llu", "%llu", unsigned long long);
  ELIF_PF_("%llx", "0x%llx", unsigned long long);
#endif
  ELIF_PF_("%f", "%g", float);
  ELIF_PF_("%lf", "%g", double);
  ELIF_PF_("%r", "%g", real);
  else fprintf(fp, "unknown %s-->%%d: %d", fmt, *(int *) o->ptr);
#undef ELIF_PF_
}


/* search an option list, return an option whose variable address is p */
INLINE opt_t *opt_find(opt_t *ls, int n, const void *p)
{
  int i;

  for (i = 0; i < n; i++) if (ls[i].ptr == p) return ls + i;
  return NULL;
}


/* search an option list to see if an option is explicitly set */
INLINE int opt_isset(opt_t *ls, int n, const void *p, const char *var)
{
  opt_t *o = opt_find(ls, n, p);
  die_if (!o, "cannot find var %s, ptr %p\n", var, p);
  return o->flags & OPT_SET ? 1 : 0;
}


#endif /* ZCOM_OPT__ */
#endif /* ZCOM_OPT */


#ifdef  ZCOM_ARGOPT
#ifndef ZCOM_ARGOPT__
#define ZCOM_ARGOPT__
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>


typedef struct {
  int nopt;
  opt_t *opts;
  const char *prog;
  const char *desc;
  const char *author;
  const struct tm *tm; /* compilation time */
  int version;
  unsigned flags;
  int dum_[4]; /* space holder */
} argopt_t;

#define ARGOPT_MUST     OPT_MUST    /* mandatory argument or option, format starts with ! */
#define ARGOPT_SWITCH   OPT_SWITCH  /* format "%b" */
#define ARGOPT_SET      OPT_SET
#define ARGOPT_LONGOPT  0x0010  /* always assume long format, e.g., -maxh */


#define argopt_getopt(ao, p) opt_find(ao->opts, ao->nopt, p)
#define argopt_getarg argopt_getopt

/* test if argument/option is explicitly set */
#define argopt_isset(ao, var) opt_isset(ao->opts, ao->nopt, &var, #var)


/* initialize the argument structure */
INLINE argopt_t *argopt_open(unsigned flags)
{
  argopt_t *ao;
  time_t tmcmpl;

  xnew(ao, 1);
  ao->flags = flags;
  ao->nopt = 0;
  ao->opts = NULL;
  tmcmpl = time(NULL);
  ao->tm = localtime( &tmcmpl );
  memset(ao->dum_, '\0', sizeof(ao->dum_));
  return ao;
}


INLINE void argopt_close(argopt_t *ao)
{
  if (ao->opts) { free(ao->opts); ao->opts = NULL; }
  free(ao);
}


/* print version and die */
INLINE void argopt_version(argopt_t *ao)
{
  fprintf(stderr, "%s: %s, version %d\n",
      ao->prog, ao->desc ? ao->desc : "", ao->version);
  if (ao->author && ao->tm)
    fprintf(stderr, "Copyright (c) %s %d\n", ao->author, ao->tm->tm_year + 1900);
  argopt_close(ao);
  exit(1);
}


/* print help message and die */
INLINE void argopt_help(argopt_t *ao)
{
  int i, len, maxlen;
  opt_t *o;
  const char *sysopt[2] = {"print help message", "print version"}, *desc;

  fprintf(stderr, "%s, version %d",
      ao->desc ? ao->desc : ao->prog, ao->version);
  if (ao->author && ao->tm)
    fprintf(stderr, ", Copyright (c) %s %d", ao->author, ao->tm->tm_year + 1900);
  fprintf(stderr, "\nUSAGE\n  %s {OPTIONS}", ao->prog);
  for (i = 0; i < ao->nopt; i++) {
    const char *bra = "", *ket = "";
    o = ao->opts + i;
    if (o->isopt) continue;
    if (o->flags & OPT_MUST) {
      if (strchr(o->desc, ' '))
        bra = "[", ket = "]";
    } else
      bra = "{", ket = "}";
    fprintf(stderr, " %s%s%s", bra, o->desc, ket);
  }
  fprintf(stderr, "\n");

  fprintf(stderr, "OPTIONS:\n") ;
  for (maxlen = 0, i = 0; i < ao->nopt; i++) { /* compute the longest option */
    if (!ao->opts[i].isopt) continue;
    len = strlen(ao->opts[i].sflag);
    if (len > maxlen) maxlen = len;
  }
  for (i = 0; i < ao->nopt; i++) {
    o = ao->opts + i;
    if (!o->isopt) continue;
    desc = o->desc;
    if (strcmp(desc, "$HELP") == 0)
      desc = sysopt[0];
    else if (strcmp(desc, "$VERSION") == 0)
      desc = sysopt[1];
    fprintf(stderr, "  %-*s : %s%s%s", maxlen, o->sflag,
        ((o->flags & OPT_MUST) ? "[MUST] " : ""),
        (!(o->flags & OPT_SWITCH) ? "followed by " : ""), desc);
    if (o->ptr && o->ptr != ao->dum_) { /* print default values */
      fprintf(stderr, ", default: ");
      opt_fprintptr(stderr, o);
    }
    fprintf(stderr, "\n");
  }
  argopt_close(ao);
  exit(1);
}


#define argopt_regarg(ao, fmt, ptr, desc) argopt_add(ao, NULL, fmt, ptr, desc)
#define argopt_regopt argopt_add
#define argopt_reghelp argopt_addhelp
#define argopt_regversion argopt_addversion
#define argopt_addhelp(ao, sflag) argopt_add(ao, sflag, "%b", ao->dum_, "$HELP")
#define argopt_addversion(ao, sflag) argopt_add(ao, sflag, "%b", ao->dum_, "$VERSION")

/* register an argument or option
 * sflag: string flag, or NULL for an argument
 * fmt: sscanf() format string, "%b" for a switch, "%r" for real
 * return the index */
INLINE int argopt_add(argopt_t *ao, const char *sflag,
    const char *fmt, void *ptr, const char *desc)
{
  opt_t *o;
  int n;

  n = ao->nopt++;
  xrenew(ao->opts, ao->nopt);
  o = ao->opts + n;
  opt_set(o, sflag, NULL, fmt, ptr, desc);
  return n;
}


/* main parser of arguments */
INLINE void argopt_parse(argopt_t *ao, int argc, char **argv)
{
  int i, j, k, ch, acnt = 0;
  opt_t *ol = ao->opts;

  ao->prog = argv[0];
  for (i = 1; i < argc; i++) {
    if (argv[i][0] != '-') { /* it's an argument */
      while (ol[acnt].isopt && acnt < ao->nopt) acnt++;
      if (acnt >= ao->nopt) argopt_help(ao);
      ol[acnt].val = argv[i];
      ol[acnt].flags |= OPT_SET;
      if (0 != opt_getval(ol + acnt))
        argopt_help(ao);
      ++acnt;
      continue;
    }

    /* it's an option, loop for abbreviated form "-abc" == "-a -b -c" */
    for (j = 1; (ch = argv[i][j]) != '\0'; j++) {
      int islong = (j == 1 && argv[i][1] == '-') | (ao->flags & ARGOPT_LONGOPT);

      if (islong) { /* compare against long options */
        for (k = 0; k < ao->nopt; k++)
          if (ol[k].isopt &&
              strncmp(argv[i], ol[k].sflag, strlen(ol[k].sflag)) == 0)
            break;
      } else { /* compare against short options */
        for (k = 0; k < ao->nopt; k++)
          if (ol[k].isopt && ch == ol[k].ch)
            break;
      }
      if (k >= ao->nopt) {
        fprintf(stderr, "cannot handle option [%s]\n", argv[i]);
        argopt_help(ao);
      }

      if (ol[k].desc[0] == '$') { /* system commands */
        if (strcmp(ol[k].desc, "$HELP") == 0)
          argopt_help(ao);
        else if (strcmp(ol[k].desc, "$VERSION") == 0)
          argopt_version(ao);
      }

      if (ol[k].flags & OPT_SWITCH) { /* handle switches "%b" */
        ol[k].flags |= OPT_SET;
        /* switch the default value, note this flag may be passed
         * several times, so we don't want to flip around */
        *((int *) ol[k].ptr) = !ol[k].ival;
        if (islong) break; /* go to the next argument argv[i+1] */
      } else { /* look for the additional argument for this */
        int hasv = 0;
        if (islong) { /* e.g., --version=11 */
          j = strlen(ol[k].sflag);
          if (argv[i][ j ] == '=') {
            ol[k].val = argv[i] + j + 1;
            hasv = 1;
          }
        } else { /* e.g., -n8 */
          if (argv[i][++j]) {
            ol[k].val = argv[i] + j;
            hasv = 1;
          }
        }

        if (!hasv) { /* --version 11 or -n 8 */
          if (++i >= argc) {
            fprintf(stderr, "%s(%s) requires an argument!\n", ol[k].sflag, argv[i - 1]);
            argopt_help(ao);
          }
          ol[k].val = argv[i];
        }
        ol[k].flags |= OPT_SET;
        if (0 != opt_getval(ol + k)) argopt_help(ao);
        break; /* go to the next argument argv[i+1] */
      }
    } /* end of short option loop */
  }
  /* check if we have all mandatory arguments and options */
  for (i = 0; i < ao->nopt; i++) {
    if ((ol[i].flags & OPT_MUST) && !(ol[i].flags & OPT_SET)) {
      fprintf(stderr, "Error: missing %s %s: %s\n\n",
          ol[i].isopt ? "option" : "argument", ol[i].sflag, ol[i].desc);
      argopt_help(ao);
    }
  }
}


/* dump the current values */
INLINE void argopt_dump(const argopt_t *ao)
{
  int i, len = 2;
  opt_t *ol = ao->opts;

  /* get the width of the widest option */
  for (i = 0; i < ao->nopt; i++)
    if (ol[i].sflag)
      len = intmax(len, strlen(ol[i].sflag));

  /* print values of all options */
  for (i = 0; i < ao->nopt; i++) {
    const char *sflag = ol[i].sflag;
    if (sflag == NULL) sflag = "arg";
    fprintf(stderr, "%*s: ", len + 1, sflag);
    opt_fprintptr(stderr, ol + i);
    fprintf(stderr, ",  %s\n", ol[i].desc);
  }
}


#endif /* ZCOM_ARGOPT__ */
#endif /* ZCOM_ARGOPT */


#ifdef  ZCOM_AV
#ifndef ZCOM_AV__
#define ZCOM_AV__
#include <stdio.h>
#include <stdarg.h>
#include <math.h>


typedef struct {
  double s, sx;
} av0_t; /* simplest averager without variance */


INLINE void av0_clear(av0_t *av) { av->s = 1e-20; av->sx = 0; }
INLINE double av0_getave(const av0_t *av) { return av->sx / av->s; }
INLINE void av0_add(av0_t *av, double x) { av->s += 1; av->sx += x; }
INLINE void av0_addw(av0_t *av, double x, double w) { av->s += w; av->sx += x * w; }


typedef struct {
  double s, sx, sx2; /* sum, sum x, variance */
} av_t;

INLINE void av_clear(av_t *av) { av->s = av->sx = av->sx2 = 0; }
INLINE double av_getave(const av_t *av) { return (av && av->s > 0) ? av->sx/av->s : 0; }
INLINE double av_getvar(const av_t *av) { return (av && av->s > 0) ? av->sx2/av->s : 0; }
INLINE double av_getdev(const av_t *av) { return (av && av->s > 0) ? sqrt(av_getvar(av)) : 0; }

/* add a new value to av_t with a weight `w' */
INLINE void av_addw(av_t *av, double x, double w)
{
  double s, sx;

  av->s = (s = av->s) + w;
  av->sx = (sx = av->sx) + x*w;
  if (s <= 0.0) return;
  av->sx2 += (x - sx/s)*(x - av->sx/av->s)*w;
}
#define av_add(av, x) av_addw(av, x, 1)

/* adaptive averaging: sX = sX * gam + X */
INLINE void av_gaddw(av_t *av, double x, double w, double ngam)
{
  double s, sx, del, gam = 1.0 - ngam;

  av->s = (s = av->s)*gam + w;
  av->sx = (sx = av->sx)*gam + w*x;
  if (s <= 0.0) return;
  del = x*s - sx;
  av->sx2 = (av->sx2 + w*del*del/(s*av->s))*gam;
}

#define av_gadd(av, x, ngam) av_gaddw(av, x, 1, ngam)


/* average of n quantities */
typedef struct {
  int n;
  double s;
  double *x; /* buffer, current x value */
  double *sx, *sxb; /* sum */
  double *sx2; /* variance */
} avn_t;

/* open average for n quantities */
INLINE avn_t *avn_open(int n)
{
  avn_t *a;
  int i;

  xnew(a, 1);
  die_if (n <= 0, "avn needs at least n %d >= 1\n", n);
  a->n = n;
  xnew(a->x, n);
  xnew(a->sx, n);
  xnew(a->sxb, n);
  xnew(a->sx2, n*n);
  a->s = 0;
  for (i = 0; i < n; i++) a->x[i] = a->sx[i] = a->sxb[i] = 0;
  for (i = 0; i < n * n; i++) a->sx2[i] = 0;
  return a;
}

INLINE void avn_close(avn_t *a)
{
  free(a->x);
  free(a->sx);
  free(a->sxb);
  free(a->sx2);
  free(a);
}

/* add values to avn_t with a weight `w'
 * must add all values simultaneously, otherwise a->s is messed up */
INLINE void avn_addwv(avn_t *a, const double *x, double w)
{
  int k, l, n = a->n;
  double s;

  a->s = (s = a->s) + w;
  for (k = 0; k < n; k++) {
    a->sx[k] = (a->sxb[k] = a->sx[k]) + x[k] * w;
  }
  if (s > 0) { /* update variance */
    for (k = 0; k < n; k++)
      for (l = k; l < n; l++)
        a->sx2[k*n + l] += (x[k] - a->sxb[k]/s) * (x[l] - a->sx[l]/a->s) * w;
  }
}

#define avn_addv(a, x) avn_addwv(a, x, 1)

/* add values to avn_t with a weight `w'
 * use argument list */
INLINE void avn_addw(avn_t *a, double w, ...)
{
  int k;
  va_list va;

  va_start(va, w);
  for (k = 0; k < a->n; k++) {
    a->x[k] = va_arg(va, double);
  }
  va_end(va);
  avn_addwv(a, a->x, w);
}


/* weighted update: sX = sX*gam + X */
INLINE void avn_gaddwv(avn_t *a, const double *x, double w, double ngam)
{
  int k, l, n = a->n;
  double s, gam = 1.0 - ngam;

  a->s = (s = a->s)*gam + w;
  for (k = 0; k < n; k++) {
    a->sx[k] = (a->sxb[k] = a->sx[k]) * gam + w * x[k];
  }
  if (s > 0) { /* update variance */
    for (k = 0; k < n; k++)
      for (l = k; l < n; l++)
        a->sx2[k*n + l] = gam * (a->sx2[k*n + l] +
            w*(x[k]*s - a->sxb[k]) * (x[l]*s - a->sxb[l])/(s*a->s));
  }
}

#define avn_gaddv(a, x, ngam) avn_gaddwv(a, x, 1, ngam)

/* weighted update
 * use argument list */
INLINE void avn_gaddw(avn_t *a, double w, double ngam, ...)
{
  int k;
  va_list va;

  va_start(va, ngam);
  for (k = 0; k < a->n; k++)
    a->x[k] = va_arg(va, double);
  va_end(va);
  avn_gaddwv(a, a->x, w, ngam);
}

/* these macros are only available if we have variable arguments macros */
#ifdef HAVEVAM
#define anv_add(a, ...) avn_addw(a, 1.0, ## __VARARGS__)
#define anv_gadd(a, ngam, ...) avn_gaddw(a, 1.0, ngam, ## __VARARGS__)
#endif

INLINE void avn_clear(avn_t *a)
{
  int i;

  a->s = 0;
  for (i = 0; i < a->n; i++)
    a->x[i] = a->sx[i] = a->sxb[i] = 0;
  for (i = 0; i < a->n * a->n; i++)
    a->sx2[i] = 0;
}

/* get average of quantity k */
INLINE double avn_getave(const avn_t *a, int k)
{
  die_if (k < 0 || k >= a->n, "avn index %d out of range %d\n", k, a->n);
  return (a->s > 0) ? a->sx[k]/a->s : 0;
}

/* get averages of all quantities */
INLINE double *avn_getaven(const avn_t *a, double *val)
{
  int k;

  if (a->s <= 0.) {
    for (k = 0; k < a->n; k++) val[k] = 0;
  } else {
    for (k = 0; k < a->n; k++)
      val[k] = a->sx[k] / a->s;
  }
  return val;
}

/* get cross variance of quantities k and l */
INLINE double avn_getvar(const avn_t *a, int k, int l)
{
  die_if (k < 0 || k >= a->n || l < 0 || l >= a->n,
      "avn index %d, %d out of range %d\n", k, l, a->n);
  if (k > l) intswap(k, l);
  return (a->s > 0) ? a->sx2[k * a->n + l]/a->s : 0;
}

/* get variances of all quantities */
INLINE double *avn_getvarn(const avn_t *a, double *val)
{
  int k, l, n = a->n;

  if (a->s <= 0.) {
    for (k = 0; k < n * n; k++) val[k] = 0;
  } else {
    for (k = 0; k < n; k++) {
      for (l = k; l < n; l++) {
        val[k*n + l] = a->sx2[k*n + l] / a->s;
        if (l > k) val[l*n + k] = val[k*n + l];
      }
    }
  }
  return val;
}

/* get standard deviation of quantity k */
INLINE double avn_getdev(const avn_t *a, int k)
{
  die_if (k < 0 || k >= a->n, "avn index %d out of range %d\n", k, a->n);
  return (a->s > 0) ? sqrt(a->sx2[k * a->n + k]/a->s) : 0;
}

/* get standard deviations of all quantities */
INLINE double *avn_getdevn(const avn_t *a, double *val)
{
  int k, n = a->n;

  if (a->s <= 0.) {
    for (k = 0; k < n; k++) val[k] = 0;
  } else {
    for (k = 0; k < n; k++)
      val[k] = sqrt(a->sx2[k*n + k] / a->s);
  }
  return val;
}

/* get correlation coefficient between quantities k and l */
INLINE double avn_getcor(const avn_t *a, int k, int l)
{
  int n = a->n;
  die_if (k < 0 || k >= n || l < 0 || l >= n,
      "avn index %d, %d out of range %d\n", k, l, n);
  if (k > l) intswap(k, l);
  return (a->s > 0) ? a->sx2[k*n + l] / sqrt(a->sx2[k*n + k] * a->sx2[l*n + l]) : 0;
}

/* get correlation coefficients among all quantities */
INLINE double *avn_getcorn(const avn_t *a, double *val)
{
  int k, l, n = a->n;

  if (a->s <= 0.) {
    for (k = 0; k < n * n; k++) val[k] = 0;
  } else {
    for (k = 0; k < n; k++) {
      for (l = k; l < n; l++) {
        val[k*n + l] = a->sx2[k*n + l] / sqrt(a->sx2[k*n + k] * a->sx2[l*n + l]);
        if (l > k) val[l*n + k] = val[k*n + l];
      }
    }
  }
  return val;
}

#endif /* ZCOM_AV__ */
#endif /* ZCOM_AV */


#ifdef  ZCOM_HIST
#ifndef ZCOM_HIST__
#define ZCOM_HIST__

#define HIST_VERBOSE    0x0001
#define HIST_ADDAHALF   0x0010
#define HIST_NOZEROES   0x0020
#define HIST_KEEPLEFT   0x0040
#define HIST_KEEPRIGHT  0x0080
#define HIST_KEEPLEFT2  0x0040
#define HIST_KEEPRIGHT2 0x0080
#define HIST_KEEPEDGE   (HIST_KEEPLEFT | HIST_KEEPRIGHT | HIST_KEEPLEFT2 | HIST_KEEPRIGHT2)
#define HIST_KEEPHIST   0x0100
#define HIST_OVERALL    0x0200
#define HIST_ADDITION   0x1000


/* compute sum, average and variance */
INLINE double *histgetsums_(const double *h, int rows, int n,
    double xmin, double dx, double *sums)
{
  double *xav, *xxc, x, w;
  int i, r;

  xav = sums + rows;
  xxc = xav  + rows;
  for (r = 0; r < rows; r++) {
    sums[r] = xav[r] = xxc[r] = 0.;
    for (i = 0; i < n; i++) {
      x = xmin + (i+.5)*dx;
      w = h[r*n + i];
      sums[r] += w;
      xav[r]  += w*x;
      xxc[r]  += w*x*x;
    }
    if (sums[r] > 1e-5) {
      xav[r] /= sums[r];
      xxc[r] = xxc[r]/sums[r] - xav[r]*xav[r];
    }
  }
  return sums;
}


/* old names */
#define wdist(h, m, n, x0, dx, fn) \
  wdistex(h, m, n, x0, dx, HIST_ADDAHALF | HIST_VERBOSE, fn)
#define wdistex histsave

#define histsave(h, rows, n, xmin, dx, flags, fn) \
  histsavex((const double *) h, rows, n, xmin, dx, fn, \
            NULL, NULL, NULL, flags)

/* write histograms to file
 * histogram 'h' contains 'rows' histograms,
 * each contains 'n' entries, from 'xmin' to 'xmin+dx*n'
 * (*fwheader) is function to print additional information
 * (*fnorm) is advanced normalization function */
INLINE int histsavex(const double *h, int rows, int n, double xmin, double dx,
    const char *fn, int (*fwheader)(FILE *fp, void *data),
    double (*fnorm)(int r, int ix, double xmin, double dx, void *data),
    void *pdata, unsigned flags)
{
  const int version = 0;
  FILE *fp;
  int i, r, rp, rowp, imax, imin;
  const double *p;
  double sm, *sums, fac, delta;
  double smtot[3], *htot = NULL;

  if (fn == NULL) fn = "HIST";
  xfopen(fp, fn, "w", return -1);

  /* get statistics */
  xnew(sums, rows * 3);
  histgetsums_(h, rows, n, xmin, dx, sums);

  /* compute the overall histogram */
  if (flags & HIST_OVERALL) {
    xnew(htot, n); /* the overall histogram */
    for (i = 0; i < n; i++) htot[i] = 0.;

    for (r = 0; r < rows; r++)
      for (i = 0; i < n; i++)
        htot[i] += h[r*n + i];
    histgetsums_(htot, 1, n, xmin, dx, smtot);
    rowp = rows + 1;
  } else {
    rowp = rows;
  }

  /* print basic information */
  fprintf(fp, "# %d 0x%X | %d %d %g %g | ",
      version, flags, rows, n, xmin, dx);
  for (r = 0; r < rows; r++) /* number of visits */
    fprintf(fp, "%g ", sums[r]);
  fprintf(fp, "| ");
  for (r = 0; r < rows; r++) /* average, standard deviation */
    fprintf(fp, "%g %g ", sums[r+rows], sqrt(sums[r+rows*2]));
  fprintf(fp, "| ");
  if (fwheader != NULL) (*fwheader)(fp, pdata);
  fprintf(fp, "\n");

  delta = (flags & HIST_ADDAHALF) ? 0.5 : 0;

  for (r = 0; r < rowp; r++) {
    p = (r == rows) ? htot : (h+r*n);

    if (flags & HIST_KEEPRIGHT) {
      imax = n;
    } else { /* trim the right edge */
      for (i = n-1; i >= 0; i--)
        if (p[i] > 0)
          break;
      imax = i+1;
      if (imax == 0)
        continue;
    }

    if (flags & HIST_KEEPLEFT) {
      imin = 0;
    } else { /* trim the left edge */
      for (i = 0; i < imax; i++)
        if (p[i] > 0)
          break;
      imin = i;
    }

    sm = (r == rows) ? smtot[0] : sums[r];
    if (fabs(sm) < 1e-6) fac = 1.;
    else fac = 1.0/(sm*dx);

    for (i = imin; i < imax; i++) {
      if ((flags & HIST_NOZEROES) && p[i] < 1e-6)
        continue;
      fprintf(fp, "%g ", xmin+(i+delta)*dx);
      if (flags & HIST_KEEPHIST)
        fprintf(fp, "%20.14E ", p[i]);
      rp = (r == rows) ? (-1) : r;
      if (fnorm != NULL) /* advanced normalization, note the r = -1 case */
        fac = (*fnorm)(rp, i, xmin, dx, pdata);
      fprintf(fp, "%20.14E %d\n", p[i]*fac, rp);
    }
    fprintf(fp, "\n");
  }
  fclose(fp);
  if (flags & HIST_VERBOSE) {
    fprintf(stderr, "successfully wrote %s\n", fn);
    for (r = 0; r < rows; r++)
      fprintf(stderr, "%2d cnt: %20.4f av: %10.4f(%10.4f)\n",
          r, sums[r], sums[r+rows], sums[r+rows*2]);
  }
  free(sums);
  if (flags & HIST_OVERALL) {
    free(htot);
  }
  return 0;
}


/* fetch histogram size */
INLINE int histgetinfo(const char *fn, int *row, double *xmin, double *xmax, double *xdel,
    int *version, unsigned *fflags)
{
  FILE *fp;
  char s[1024];
  int n;

  xfopen(fp, fn, "r", return -1);
  if (fgets(s, sizeof s, fp) == NULL) {
    fprintf(stderr, "%s: missing the first line\n", fn);
    fclose(fp);
    return -1;
  }
  if (6 != sscanf(s, "# %d 0x %X | %d %d %lf %lf ",
        version, fflags, row, &n, xmin, xdel)) {
    fprintf(stderr, "%s: bad first line\n%s", fn, s);
    fclose(fp);
    return -1;
  }
  *xmax = *xmin + *xdel * n;
  fclose(fp);
  return 0;
}


/* skip a | */
INLINE char *skipabar_(char *p)
{
  int next = -1;
  sscanf(p, " | %n", &next);
  return (next < 0) ? NULL : (p + next);
}


#define histload(h, rows, n, xmin, dx, fn, flags) \
  histloadx((double *) h, rows, n, xmin, dx, fn, \
            NULL, NULL, NULL, flags)

/* load a previous histogram
 * (*frheader) function to read additional header info.
 * (*fnorm) normalization factor
 * flags can have HIST_ADDITION and/or HIST_VERBOSE */
INLINE int histloadx(double *hist, int rows, int n, double xmin, double dx,
    const char *fn,
    int (*frheader)(const char *s, void *data),
    double (*fnorm)(int r, int ix, double xmin, double dx, void *data),
    void *pdata, unsigned flags)
{
  static char s[40960] = "";
#ifdef _OPENMP
#pragma omp threadprivate(s)
#endif
  FILE *fp;
  char *p;
  int verbose = (flags & HIST_VERBOSE);
  int add = (flags & HIST_ADDITION);
  int ver, next, hashist;
  int i, i1, r, r1, nlin = 0;
  unsigned fflags;
  double x, y, y2, fac, delta, *arr, *sums = NULL;

  xfopen(fp, fn, "r", return -1);

  /* check the first line */
  if (fgets(s, sizeof s, fp) == NULL || s[0] != '#') {
    fprintf(stderr, "%s: missing the first line\n", fn);
    fclose(fp);
    return -1;
  }
  nlin++;
  if (6 != sscanf(s, " # %d 0x %X | %d%d%lf%lf | %n", &ver, &fflags, &r, &i, &y, &x, &next)
      || i < n || r != rows || fabs(x - dx) > 1e-5) {
    fprintf(stderr, "Error: bins = %d, %d, ng = %d, %d; dx = %g, %g\n",
        i, n, r, rows, x, dx);
    fclose(fp);
    return -1;
  }
  delta   = ((fflags & HIST_ADDAHALF) ? .5 : 0.);
  hashist =  (fflags & HIST_KEEPHIST);
  /* scan sums */
  xnew(sums, rows);
  for (p = s+next, r = 0; r < rows; r++) {
    if (1 != sscanf(p, "%lf%n", sums + r, &next)) {
      fprintf(stderr, "cannot read sums from at %d/%d, s:\n%s\np:\n%s\n", r, rows, s, p);
      goto EXIT;
    }
    p += next;
  }
  if ((p = skipabar_(p)) == NULL) goto EXIT;
  for (r = 0; r < rows; r++) {
    if (2 != sscanf(p, "%lf%lf%n", &y, &y2, &next)) {
      fprintf(stderr, "cannot read average/stddev from at %d/%d, s:\n%s\np:\n%s\n", r, rows, s, p);
      goto EXIT;
    }
    p += next;
  }
  if ((p = skipabar_(p)) == NULL) goto EXIT;
  if (frheader != NULL) {
    if (0 != frheader(p, pdata))
      goto EXIT;
  }


  if (!add) { /* clear histogram */
    for (i = 0; i < rows*n; i++) hist[i] = 0.;
  }

  /* loop over r = 0..rows-1 */
  for (r = 0; r < rows; r++) {
    arr = hist + r*n;
    fac = sums[r]*dx;
    while (fgets(s, sizeof s, fp)) {
      nlin++;
      for (p = s+strlen(s)-1; isspace((unsigned char)(*p)) && p >= s; p--)
        *p = '\0'; /* trim ending */
      if (s[0] == '#' || s[0] == '\0') break;
      if (hashist) {
        if (4 != sscanf(s, "%lf%lf%lf%d", &x, &y, &y2, &r1)) {
          fprintf(stderr, "error on line %d\n", nlin);
          goto EXIT;
        }
      } else { /* simple */
        if (3 != sscanf(s, "%lf%lf%d", &x, &y2, &r1)) {
          fprintf(stderr, "error on line %d\n", nlin);
          goto EXIT;
        }
      }
      if (r1 < 0) break; /* overall histogram */

      if (r1 < r) {
        fprintf(stderr, "wrong column index %d vs. %d on line %d, s=[%s]\n",
            r1, r, nlin, s);
        goto EXIT;
      } else if (r1 > r) {
        r = r1;
        arr = hist + r*n;
        fac = sums[r]*dx;
      }
      i1 = (int)((x - xmin)/dx - delta + .5);
      if (i1 < 0 || i1 >= n) {
        fprintf(stderr, "cannot find index for x = %g, delta = %g, i = %d/%d, on line %d\n",
            x, delta, i1, n, nlin);
        goto EXIT;
      }
      if (!hashist) {
        if (fnorm != NULL) {
          fac = (*fnorm)(r, i1, xmin, dx, pdata);
          fac = ((fabs(fac) < 1e-8) ? 1. : 1./fac);
        }
        y = y2*fac;
      }
      if (add) arr[i1] += y;
      else arr[i1] = y;
    }
  }
  if (verbose)
    fprintf(stderr, "histogram loaded successfully from %s\n", fn);

  if (sums) free(sums);
  fclose(fp);
  return 0;
EXIT:
  fprintf(stderr, "error occurs at file %s, line %d, s:%s\n", fn, nlin, s);
  if (sums) free(sums);
  /* we always clear histogram on error */
  for (i = 0; i < rows*n; i++) hist[i] = 0.;
  return -1;
}


/* add x of weight w, into histogram h
 * return number of success */
INLINE int histadd(const double *xarr, double w, double *h, int rows,
    int n, double xmin, double dx, unsigned flags)
{
  int r, ix, good = 0, verbose = flags & HIST_VERBOSE;
  double x;

  for (r = 0; r < rows; r++) {
    x = xarr[r];
    if (x < xmin) {
      if (verbose)
        fprintf(stderr, "histadd underflows %d: %g < %g\n", r, x, xmin);
      continue;
    }
    ix = (int)((x - xmin)/dx);
    if (ix >= n) {
      if (verbose)
        fprintf(stderr, "histadd overflows %d: %g > %g\n", r, x, xmin+dx*n);
      continue;
    }
    h[r*n + ix] += w;
    good++;
  }
  return good;
}


/* object wrappers */
typedef struct {
  int rows;
  int n;
  double xmin;
  double dx;
  double *arr;
} hist_t;

typedef hist_t hs_t;

#define hs_open1(x0, x1, dx) hs_open(1, x0, x1, dx)
#define hs_clear(hs) dblcleararr(hs->arr, hs->rows * hs->n)


INLINE hist_t *hs_open(int rows, double xmin, double xmax, double dx)
{
  hist_t *hs;

  xnew(hs, 1);
  hs->rows = rows;
  hs->xmin = xmin;
  hs->dx   = dx;
  hs->n = (int)((xmax - xmin)/dx + 0.99999999);
  xnew(hs->arr, hs->n * hs->rows);
  return hs;
}


INLINE void hs_close(hist_t *hs)
{
  if (!hs) return;
  if (hs->arr) free(hs->arr);
  memset(hs, 0, sizeof(*hs));
  free(hs);
}


INLINE void hs_check(const hist_t *hs)
{
  die_if (hs == NULL, "hs is %p", (const void *) hs);
  die_if (hs->arr == NULL || hs->rows == 0 || hs->n == 0,
    "hist: arr %p rows %d n %d\n", (const void *)(hs->arr), hs->rows, hs->n);
}


#define hs_save(hs, fn, flags) hs_savex(hs, fn, NULL, NULL, NULL, flags)

INLINE int hs_savex(const hist_t *hs, const char *fn,
    int (*fwheader)(FILE *, void *),
    double (*fnorm)(int, int, double, double, void *),
    void *pdata, unsigned flags)
{
  hs_check(hs);
  return histsavex(hs->arr, hs->rows, hs->n, hs->xmin, hs->dx, fn,
      fwheader, fnorm, pdata, flags);
}


#define hs_load(hs, fn, flags) hs_loadx(hs, fn, NULL, NULL, NULL, flags)

INLINE int hs_loadx(hist_t *hs, const char *fn,
    int (frheader)(const char *, void *),
    double (*fnorm)(int, int, double, double, void *),
    void *pdata, unsigned flags)
{
  hs_check(hs);
  return histloadx(hs->arr, hs->rows, hs->n, hs->xmin, hs->dx, fn,
      frheader, fnorm, pdata, flags);
}


INLINE int hs_add(hist_t *hs, const double *x, double w, unsigned flags)
{
  hs_check(hs);
  return histadd(x, w, hs->arr, hs->rows, hs->n, hs->xmin, hs->dx, flags);
}


#define hs_add1ez(hs, x, flags) hs_add1(hs, 0, x, 1, flags)

INLINE int hs_add1(hist_t *hs, int r, double x, double w, unsigned flags)
{
  hs_check(hs);
  die_if (r >= hs->rows || r < 0, "bad row index %d\n", r);
  return histadd(&x, w, hs->arr + r*hs->n, 1, hs->n, hs->xmin, hs->dx, flags);
}


/* get average of a certain `row' */
INLINE double hs_getave(const hist_t *hs, int row, double *sum, double *var)
{
  double arr[3];

  histgetsums_(hs->arr + row * hs->n, 1, hs->n, hs->xmin, hs->dx, arr);
  if (sum) *sum = arr[0];
  if (var) *var = arr[2];
  return arr[1];
}


/* two dimensional histograms */
INLINE int hist2save(const double *h, int rows, int n, double xmin, double dx,
    int m, double ymin, double dy, unsigned flags, const char *fname);
INLINE int hist2load(double *hist, int rows, int n, double xmin, double dx,
    int m, double ymin, double dy, unsigned flags, const char *fname);
INLINE int hist2add(const double *xarr, const double *yarr, int skip,
    double w, double *h, int rows,
    int n, double xmin, double dx,
    int m, double ymin, double dy, unsigned flags);

typedef struct {
  int rows;
  int n, m;
  double xmin, ymin;
  double dx, dy;
  double *arr, *dumptr;
} hist2_t;

typedef hist2_t hs2_t;

#define hs2_clear(hs2) dblcleararr(hs2->arr, hs2->rows * hs2->n * hs2->m)

#define hs2_open1(xmin, xmax, dx, ymin, ymax, dy) \
  hs2_open(1, xmin, xmax, dx, ymin, ymax, dy)
#define hs2_opensqr(rows, xmin, xmax, dx) \
  hs2_open(rows, xmin, xmax, dx, xmin, xmax, dx)
#define hs2_opensqr1(xmin, xmax, dx) \
  hs2_opensqr(1, xmin, xmax, dx)

INLINE hist2_t *hs2_open(int rows, double xmin, double xmax, double dx,
    double ymin, double ymax, double dy)
{
  hist2_t *hs2;

  xnew(hs2, 1);
  hs2->rows = rows;
  hs2->xmin = xmin;
  hs2->dx   = dx;
  hs2->n    = (int)((xmax - xmin)/dx + 0.99999999);
  hs2->ymin = ymin;
  hs2->dy   = dy;
  hs2->m    = (int)((ymax - ymin)/dy + 0.99999999);
  xnew(hs2->arr, hs2->n * hs2->m * hs2->rows);
  return hs2;
}

INLINE void hs2_close(hist2_t *hs2)
{
  if (hs2) {
    if (hs2->arr) free(hs2->arr);
    memset(hs2, 0, sizeof(*hs2));
    free(hs2);
  }
}

INLINE void hs2_check(const hist2_t *hs)
{
  die_if (hs == NULL, "hist2 is %p", (const void *) hs);
  die_if (hs->arr == NULL || hs->rows == 0 || hs->n == 0 || hs->m == 0,
    "hist2: arr %p rows %d n %d m %d\n",
    (const void *)(hs->arr), hs->rows, hs->n, hs->m);
}

INLINE int hs2_save(const hist2_t *hs, const char *fn, unsigned flags)
{
  hs2_check(hs);
  return hist2save(hs->arr, hs->rows, hs->n, hs->xmin, hs->dx,
      hs->m, hs->ymin, hs->dy, flags, fn);
}

INLINE int hs2_load(hist2_t *hs, const char *fn, unsigned flags)
{
  hs2_check(hs);
  return hist2load(hs->arr, hs->rows, hs->n, hs->xmin, hs->dx,
      hs->m, hs->ymin, hs->dy, flags, fn);
}

INLINE int hs2_add(hist2_t *hs, const double *x, const double *y, int skip,
    double w, unsigned flags)
{
  hs2_check(hs);
  return hist2add(x, y, skip, w, hs->arr, hs->rows, hs->n, hs->xmin, hs->dx,
      hs->m, hs->ymin, hs->dy, flags);
}

#define hs2_add1ez(hs, x, y, flags) hs2_add1(hs, 0, x, y, 1.0, flags)

INLINE int hs2_add1(hist2_t *hs, int r, double x, double y,
    double w, unsigned flags)
{
  hs2_check(hs);
  return hist2add(&x, &y, 1, w, hs->arr+r*hs->n*hs->m, 1,
      hs->n, hs->xmin, hs->dx, hs->m, hs->ymin, hs->dy, flags);
}


INLINE double *hist2getsums_(const double *h, int rows, int n,
    double xmin, double dx, int m, double ymin, double dy, double *sums)
{
  double *xav, *yav, *xxc, *yyc, *xyc, x, y, w;
  int i, j, r;

  xav = sums + rows;
  xxc = sums + rows*2;
  yav = sums + rows*3;
  yyc = sums + rows*4;
  xyc = sums + rows*5;
  for (r = 0; r < rows; r++) {
    sums[r] = xav[r] = xxc[r] = yav[r] = yyc[r] = 0.;
    for (i = 0; i < n; i++) {
      x = xmin + (i+.5)*dx;
      for (j = 0; j < m; j++) {
        y = ymin + (j+.5)*dy;
        w = h[r*n*m + i*m + j];
        sums[r] += w;
        xav[r]  += w*x;
        xxc[r]  += w*x*x;
        yav[r]  += w*y;
        yyc[r]  += w*y*y;
        xyc[r]  += w*x*y;
      }
    }
    if (sums[r] > 1e-5) {
      xav[r] /= sums[r];
      xxc[r] = sqrt(xxc[r]/sums[r] - xav[r]*xav[r]);
      yav[r] /= sums[r];
      yyc[r] = sqrt(yyc[r]/sums[r] - yav[r]*yav[r]);
      xyc[r] = xyc[r]/sums[r] - xav[r]*yav[r];
    }
  }
  return sums;
}


/* write 'rows' 2d n^2 histograms to file */
INLINE int hist2save(const double *h, int rows, int n, double xmin, double dx,
    int m, double ymin, double dy, unsigned flags, const char *fn)
{
  const int version = 1; /* v1 allows different dimension in x and y */
  FILE *fp;
  int i, j, r, imax, imin, jmax, jmin, nm;
  const double *p;
  double *sums, fac, delta;

  if (fn == NULL) fn = "HIST2";
  xfopen(fp, fn, "w", return 1);

  nm = n*m;
  xnew(sums, rows * 6);
  hist2getsums_(h, rows, n, xmin, dx, m, ymin, dy, sums);
  /* print basic information */
  fprintf(fp, "# %d 0x%X | %d %d %g %g %d %g %g | ",
      version, flags, rows, n, xmin, dx, m, ymin, dy);
  for (r = 0; r < rows; r++) /* number of visits */
    fprintf(fp, "%g ", sums[r]);
  fprintf(fp, " | ");
  for (r = 0; r < rows; r++) /* averages and standard deviations */
    fprintf(fp, "%g %g %g %g %g ", sums[r+rows], sums[r+rows*2],
        sums[r+rows*3], sums[r+rows*4], sums[r+rows*5]);
  fprintf(fp, "| \n");

  delta = (flags & HIST_ADDAHALF) ? 0.5 : 0;

  for (r = 0; r < rows; r++) { /* the rth data set */
    p = h + r*nm;

    if (flags & HIST_KEEPRIGHT) {
      imax = n;
    } else { /* trim the right edge of i */
      for (i = n-1; i >= 0; i--) {
        for (j = 0; j < m; j++)
          if (p[i*m + j] > 0) break;
        if (j < m) break; /* found a nonzero entry */
      }
      imax = i+1;
      if (imax == 0)
        continue;
    }

    if (flags & HIST_KEEPLEFT) {
      imin = 0;
    } else { /* trim the left edge of i */
      for (i = 0; i < imax; i++) {
        for (j = 0; j < m; j++)
          if (p[i*m + j] > 0) break;
        if (j < m) break; /* found a nonzero entry */
      }
      imin = i;
    }

    if (flags & HIST_KEEPRIGHT2) {
      jmax = m;
    } else { /* trim the right edge of j */
      for (j = m-1; j >= 0; j--) {
        for (i = imin; i < imax; i++)
          if (p[i*m + j] > 0) break;
        if (i < imax) break;
      }
      jmax = j+1;
    }

    if (flags & HIST_KEEPLEFT2) {
      jmin = 0;
    } else { /* trim the left edge of j */
      for (j = 0; j < jmax; j++) {
        for (i = imin; i < imax; i++)
          if (p[i*m + j] > 0) break;
        if (i < imax) break;
      }
      jmin = j;
    }

    if (fabs(sums[r]) < 1e-6) fac = 1.;
    else fac = 1.0/(sums[r]*dx*dy);

    for (i = imin; i < imax; i++) {
      for (j = jmin; j < jmax; j++) {
        double x, y;
        if ((flags & HIST_NOZEROES) && p[i*m + j] < 1e-16)
          continue;
        x = xmin + (i+delta)*dx;
        y = ymin + (j+delta)*dy;
        fprintf(fp, "%g %g ", x, y);
        if (flags & HIST_KEEPHIST)
          fprintf(fp, "%20.14E ", p[i*m+j]);
        fprintf(fp, "%20.14E %d\n", p[i*m+j]*fac, r);
      }
      fprintf(fp, "\n");
    }
    fprintf(fp, "\n#\n");
  }
  fclose(fp);
  if (flags & HIST_VERBOSE) {
    fprintf(stderr, "successfully wrote %s\n", fn);
    for (r = 0; r < rows; r++)
      fprintf(stderr, "%2d cnt: %20.4f xav: %10.4f(%10.4f) yav: %10.4f(%10.4f)\n",
          r, sums[r], sums[r+rows], sums[r+rows*2], sums[r+rows*3], sums[r+rows*4]);
  }
  free(sums);
  return 0;
}

INLINE int hist2load(double *hist, int rows, int n, double xmin, double dx,
    int m, double ymin, double dy, unsigned flags, const char *fn)
{
  static char s[40960] = "";
#ifdef _OPENMP
#pragma omp threaprivate(s)
#endif
  FILE *fp;
  char *p;
  int verbose = (flags & HIST_VERBOSE);
  int add = (flags & HIST_ADDITION);
  int ver, next, hashist;
  int i, j, r, r1, nm, nlin = 0;
  unsigned fflags;
  double x, y, g, g2, fac, delta, *arr, *sums = NULL;
  double xmin1, dx1, ymin1, dy1;

  xfopen(fp, fn, "r", return -1);

  nm = n*m;
  /* check the first line */
  if (fgets(s, sizeof s, fp) == NULL || s[0] != '#') {
    fprintf(stderr, "%s: missing the first line\n", fn);
    fclose(fp);
    return -1;
  }
  nlin++;
  if (9 != sscanf(s, " # %d 0x %X | %d%d%lf%lf%d%lf%lf | %n", &ver, &fflags, &r,
        &i, &xmin1, &dx1, &j, &ymin1, &dy1, &next)
      || i < n || j < m || r != rows
      || fabs(dx1 - dx) > 1e-5 || fabs(dy1 - dy) > 1e-5 ) {
    fprintf(stderr, "Error: bins %d, %d; %d, %d; ng %d, %d; dx %g, %g; dy %g, %g\n",
        i, n, j, m, r, rows, dx1, dx, dy1, dy);
    fclose(fp);
    return -1;
  }
  delta   = ((fflags & HIST_ADDAHALF) ? .5 : 0.);
  hashist =  (fflags & HIST_KEEPHIST);
  /* scan sums */
  xnew(sums, rows);
  for (p = s+next, r = 0; r < rows; r++) {
    if (1 != sscanf(p, "%lf%n", sums + r, &next)) {
      fprintf(stderr, "cannot read sums from at %d/%d, s:\n%s\np:\n%s\n", r, rows, s, p);
      goto EXIT;
    }
    p += next;
  }
  if ((p = skipabar_(p)) == NULL) goto EXIT;
  for (r = 0; r < rows; r++) {
    if (4 != sscanf(p, "%lf%lf%lf%lf%n", &x, &y, &g, &g2, &next)) {
      fprintf(stderr, "cannot read average/stddev from at %d/%d, s:\n%s\np:\n%s\n", r, rows, s, p);
      goto EXIT;
    }
    p += next;
  }
  if ((p = skipabar_(p)) == NULL) goto EXIT;

  if (!add) { /* clear histogram */
    for (i = 0; i < rows*nm; i++) hist[i] = 0.;
  }

  /* loop over r = 0..rows-1 */
  for (r = 0; r < rows; r++) {
    arr = hist + r*nm;
    fac = sums[r]*(dx*dx);
    while (fgets(s, sizeof s, fp)) {
      nlin++;
      for (p = s+strlen(s)-1; isspace((unsigned char)(*p)) && p >= s; p--)
        *p = '\0'; /* trim ending */
      if (s[0] == '#') break;
      if (s[0] == '\0') continue;

      if (hashist) {
        if (5 != sscanf(s, "%lf%lf%lf%lf%d", &x, &y, &g, &g2, &r1)) {
          fprintf(stderr, "error on line %d\n", nlin);
          goto EXIT;
        }
      } else {
        if (4 != sscanf(s, "%lf%lf%lf%d", &x, &y, &g2, &r1)) {
          fprintf(stderr, "error on line %d\n", nlin);
          goto EXIT;
        }
      }
      if (r1 != r) {
        fprintf(stderr, "wrong column index %d vs. %d on line %d\n",
          r1, r, nlin);
        goto EXIT;
      }
      i = (int)((x - xmin)/dx - delta + .5);
      if (i < 0 || i >= n) {
        fprintf(stderr, "cannot find index for x = %g\n", x);
        goto EXIT;
      }
      j = (int)((y - ymin)/dy - delta + .5);
      if (j < 0 || j >= m) {
        fprintf(stderr, "cannot find index for y = %g\n", y);
        return -1;
      }
      if (!hashist) {
        g = g2*fac;
      }
      if (add) arr[i*m+j] += g;
      else arr[i*m+j] = g;
    }
  }
  if (verbose) fprintf(stderr, "%s loaded successfully\n", fn);
  return 0;
EXIT:
  fprintf(stderr, "error occurs at file %s, line %d, s:%s\n", fn, nlin, s);
  if (sums) free(sums);
  for (i = 0; i < rows*nm; i++) hist[i] = 0.;
  return -1;
}

/* add (xarr[skip*r], yarr[skip*r]) of weight w, into histogram h
 * return number of success */
INLINE int hist2add(const double *xarr, const double *yarr, int skip,
    double w, double *h, int rows,
    int n, double xmin, double dx,
    int m, double ymin, double dy, unsigned flags)
{
  int r, ix, iy, good = 0, verbose = flags & HIST_VERBOSE;
  double x, y;

  for (r = 0; r < rows; r++) {
    x = xarr[skip*r];
    y = yarr[skip*r];
    if (x < xmin || y < ymin) {
      if (verbose)
        fprintf(stderr, "histadd underflows %d: %g < %g or %g < %g\n",
          r, x, xmin, y, ymin);
      continue;
    }
    ix = (int)((x - xmin)/dx);
    iy = (int)((y - ymin)/dy);
    if (ix >= n || iy >= m) {
      if (verbose)
        fprintf(stderr, "histadd overflows %d: %g > %g or %g > %g\n",
            r, x, xmin + dx*n, y, ymin + dy*m);
      continue;
    }
    h[r*n*m + ix*m + iy] += w;
    good++;
  }
  return good;
}


#endif /* ZCOM_HIST__ */
#endif /* ZCOM_HIST */


#ifdef  ZCOM_RNG
#ifndef ZCOM_RNG__
#define ZCOM_RNG__

#include <stdio.h>
#include <string.h>
#include <math.h>

#define MTFILE    "MTSEED"  /* default file */
#define MTSEED    5489UL    /* default seed */

/* Mersenne Twister was developed by Makoto Matsumoto and Takuji Nishimura */
#define MT_N 624
#define MT_M 397
#define MT_UMASK 0x80000000UL /* most significant w-r bits */
#define MT_LMASK 0x7fffffffUL /* least significant r bits */

typedef struct {
  int idx;
  uint32_t arr[MT_N];
} mtrng_t;


/* local copy for convenience */
STRCLS mtrng_t mrstock_ = {-1, {0}}; /* index in mr_, -1: uninitialized */

STRCLS mtrng_t *mr_ = &mrstock_;
/* by default `mr_' is pointed to `mtstock_', but we replace it
 * by a thread private version by calling mtmkprivate(seed)
 * this trick allows the different random numbers generated
 * from different threads even by calling the default versions
 * of the functions, e.g., rnd0() */
#ifdef _OPENMP
#pragma omp threadprivate(mr_)
#endif


/* in the default scrambling, we create a new RNG to be safe */
#define mtscramble(seed) { \
  if (mr_ == &mrstock_) mr_ = mtrng_open0(); \
  mtrng_scramble(mr_, seed); }

/* free the default RNG */
#define mtclosedef() { if (mr_ != &mrstock_) mtrng_close(mr_); }


/* default versions */
#define mtsave(fn)          mtrng_save(mr_, fn)
#define mtload(fn, seed)    mtrng_load(mr_, fn, seed)
#define rand32()            mtrng_rand32(mr_)
#define rnd0()              mtrng_rnd0(mr_) /* double, [0, 1) */
#define rnd(a, b)           mtrng_rnd(mr_, a, b) /* double, [a, b) */
#define mtrand()            mtrng_rand(mr_)
#define grand0()            mtrng_grand0(mr_)
#define randgam(k)          mtrng_randgam(mr_, k)
#define randgausssum(n)     mtrng_randgausssum(mr_, n)
#define randpair(n, j)      mtrng_randpair(mr_, n, j)
#define metroacc0(r)        mtrng_metroacc0(mr_, r)
#define metroacc1(de, bet)  mtrng_metroacc1(mr_, de, bet)


INLINE mtrng_t *mtrng_open0(void)
{
  mtrng_t *mr;

  xnew(mr, 1);
  mr->idx = -1;
  return mr;
}


INLINE void mtrng_close(mtrng_t *mr)
{
  free(mr);
}


/* save the current state to file */
INLINE int mtrng_save(mtrng_t *mr, const char *fn)
{
  FILE *fp;
  int k;

  if (mr->idx < 0) return 1; /* RNG was never used, so it cannot be saved */
  if (fn == NULL) fn = MTFILE;
  xfopen(fp, fn, "w", return -1);
  fprintf(fp, "MTSEED\n%d\n", mr->idx);
  for (k = 0; k < MT_N; k++)
    fprintf(fp, "%" PRIu32 "\n", mr->arr[k]);
  fclose(fp);
  return 0;
}


/* randomize the array `mr_' */
INLINE void mtrng_scramble(mtrng_t *mr, uint32_t seed)
{
  int k;

  mr->arr[0] = ((seed + MTSEED) * 314159265ul + 271828183ul) & 0xfffffffful;
  for (k = 1; k < MT_N; k++) { /* the final mask is for 64-bit machines */
    mr->arr[k] = 1812433253ul * (mr->arr[k - 1] ^ (mr->arr[k - 1] >> 30)) + k;
    /* mr->arr[k] = (mr->arr[k] + seed) * 22695477ul + 1ul; */
    mr->arr[k] = ((mr->arr[k] + seed) * 314159265ul + 1ul) & 0xfffffffful;
  }
  mr->idx = MT_N; /* request for an update */
}


INLINE mtrng_t *mtrng_open(uint32_t seed)
{
  mtrng_t *mr = mtrng_open0();
  mtrng_scramble(mr, seed);
  return mr;
}


/* load mr state from `fn', or if it fails, use `seed' to initialize mr  */
INLINE int mtrng_load(mtrng_t *mr, const char *fn, uint32_t seed)
{
  char s[64];
  int k, z, err = 1;
  FILE *fp;

  if (fn == NULL) fn = MTFILE;
  if ((fp = fopen(fn, "r")) != NULL) { /* try to load from file */
    if (fgets(s, sizeof s, fp) == NULL) {
      fprintf(stderr, "%s is empty\n", fn);
    } else if (strncmp(s, "MTSEED", 6) != 0) { /* to check the first line */
      fprintf(stderr, "%s corrupted\n", fn);
    } else if (fscanf(fp, "%d", &mr->idx) != 1) {
      fprintf(stderr, "no index in %s\n", fn);
    } else {
      if (mr->idx < 0) mr->idx = MT_N; /* request updating */
      for (z = 1, k = 0; k < MT_N; k++) {
        if (fscanf(fp, "%" SCNu32, &mr->arr[k]) != 1) break;
        if (mr->arr[k] != 0) z = 0; /* a non-zero number */
      }
      if (k != MT_N) fprintf(stderr, "%s incomplete %d/%d\n", fn, k, MT_N);
      else err = z; /* clear error, if array is nonzero */
    }
    fclose(fp);
  }

  if (err) mtrng_scramble(mr, seed);
  return (mr->idx < 0);
}


#define mtrng_rand32(mr) mtrng_rand(mr)
/* must be double to avoid the round-off error that gives >= 1 result */
#define mtrng_rnd0(mr) (mtrng_rand32(mr) * (1./4294967296.0)) /* double, [0, 1) */
#define mtrng_rnd(mr, a, b) ((a) + ((b) - (a)) * mtrng_rnd0(mr)) /* double, [a, b) */

/* return an unsigned random number */
INLINE uint32_t mtrng_rand(mtrng_t *mr)
{
  static const uint32_t mag01[2] = {0, 0x9908b0dfUL}; /* MATRIX_A */
#ifdef _OPENMP
#pragma omp threadprivate(mag01)
#endif
  uint32_t x;
  int k;

  if (mr->idx < 0) mtrng_load(mr, NULL, 0);
  if (mr->idx >= MT_N) { /* generate MT_N words at one time */
    for (k = 0; k < MT_N - MT_M; k++) {
      x = (mr->arr[k] & MT_UMASK) | (mr->arr[k+1] & MT_LMASK);
      mr->arr[k] = mr->arr[k+MT_M] ^ (x>>1) ^ mag01[x&1UL];
    }
    for (; k < MT_N-1; k++) {
      x = (mr->arr[k] & MT_UMASK) | (mr->arr[k+1] & MT_LMASK);
      mr->arr[k] = mr->arr[k+(MT_M-MT_N)] ^ (x>>1) ^ mag01[x&1UL];
    }
    x = (mr->arr[MT_N-1] & MT_UMASK) | (mr->arr[0] & MT_LMASK);
    mr->arr[MT_N-1] = mr->arr[MT_M-1] ^ (x>>1) ^ mag01[x&1UL];
    mr->idx = 0;
  }
  x = mr->arr[ mr->idx++ ];
  /* tempering */
  x ^= (x >> 11);
  x ^= (x <<  7) & 0x9d2c5680UL;
  x ^= (x << 15) & 0xefc60000UL;
  x ^= (x >> 18);
  return x;
}


#undef MT_N
#undef MT_M
#undef MT_UMASK
#undef MT_LMASK


/* Gaussian distribution with zero mean and unit variance
 * using ratio method */
INLINE double mtrng_grand0(mtrng_t *mr)
{
  double x, y, u, v, q;
  do {
    u = 1 - mtrng_rnd0(mr);
    v = 1.7156*(mtrng_rnd0(mr) - .5);  /* >= 2*sqrt(2/e) */
    x = u - 0.449871;
    y = fabs(v) + 0.386595;
    q = x*x  + y*(0.196*y - 0.25472*x);
    if (q < 0.27597) break;
  } while (q > 0.27846 || v*v > -4*u*u*log(u));
  return v/u;
}


/* return a random number that satisfies a gamma distribution
   p(x) = x^(k - 1) e^(-x) / (k - 1)! */
INLINE double mtrng_randgam(mtrng_t *mr, int k)
{
  int i;
  double x, k1 = k - 1, r, y, v1, v2, s;

  if (k < 0) { printf("mtrng_randgam: k %d must be positive\n", k); return 0.; }
  if (k == 0) return 0.; /* nothing */
  if (k <= 7) { /* adding numbers of exponential distribution */
    /* exp(- x1 - x2 - x3 - x4) dx1 dx2 dx3 dx4 */
    for (x = 1.0, i = 0; i < k; i++)
      x *= mtrng_rnd0(mr);
    return -log(x);
  }

  /* generate gamma distribution by the rejection method */
  for (;;) {
    /* generate Lorentz distribution, centered at k1, width is sqrt(2.0*k - 1)
     p(y) = 1/pi/(1 + y^2), x = y*w + k1, w = sqrt(2.0*k - 1) */
    for (;;) { /* get a unit circle */
      v1 = 2.0 * mtrng_rnd0(mr) - 1.0;
      v2 = 2.0 * mtrng_rnd0(mr) - 1.0;
      if (v1*v1 + v2*v2 <= 1.0) {
        y = v2/v1; /* tan */
        s = sqrt(2.0*k - 1);
        x = s*y + k1;
        if (x > 0.0) break; /* drop the negative value */
      }
    }
    /* compare with the gamma distribution
       r peaks at x = k1, where, y = 0 and r = 1 */
    r = (1.0 + y*y) * exp(k1 * log(x/k1) - x + k1);
    if (mtrng_rnd0(mr) <= r) break;
  }

  return x;
}


/* return the sum of the square of Gaussian random numbers  */
INLINE double mtrng_randgausssum(mtrng_t *mr, int n)
{
  double x, r;
  if (n <= 0) return 0.0;
  x = 2.0 * mtrng_randgam(mr, n/2);
  if (n % 2) { r = mtrng_grand0(mr); x += r*r; }
  return x;
}


/* random pair index (i, j) */
INLINE int mtrng_randpair(mtrng_t *mr, int n, int *j)
{
  int pid = (int) (mtrng_rnd0(mr) * n * (n - 1)), i;
  i = pid / (n - 1);
  *j = pid - i * (n - 1);
  if (*j >= i) (*j)++;
  return i;
}


/* Metropolis acceptance probability rnd0() < exp(- bet * de), assuming bet > 0
 * defined as a macro, in case r is an integer */
#define mtrng_metroacc1(mr, de, bet) \
  ((de <= 0) ? 1 : mtrng_metroacc0(mr, -bet * de))

/* Metropolis acceptance probability rnd0() < exp(r), assuming r > 0 */
INLINE int mtrng_metroacc0(mtrng_t *mr, double r)
{
  r = exp(r);
  return mtrng_rnd0(mr) < r;
}


#endif /* ZCOM_RNG__ */
#endif /* ZCOM_RNG */


#ifdef  ZCOM_RV2
#ifndef ZCOM_RV2__
#define ZCOM_RV2__
#include <stdio.h>
#include <string.h>
#include <math.h>


#ifndef FV2_T
#define FV2_T fv2_t
typedef float fv2_t[2];
#endif

#ifndef DV2_T
#define DV2_T dv2_t
typedef double dv2_t[2];
#endif

#ifndef RV2_T
#define RV2_T rv2_t
typedef real rv2_t[2];
#endif


#define fv2_print(r, nm, fmt, nl) fv2_fprint(stdout, r, nm, fmt, nl)

#define dv2_print(r, nm, fmt, nl) dv2_fprint(stdout, r, nm, fmt, nl)

#define rv2_print(r, nm, fmt, nl) rv2_fprint(stdout, r, nm, fmt, nl)

INLINE void fv2_fprint(FILE *fp, const float *r, const char *nm,
    const char *fmt, int nl)
{
  int i;
  if (nm) fprintf(fp, "%s: ", nm);
  for (i = 0; i < 2; i++)
    fprintf(fp, fmt, r[i], nl);
  fprintf(fp, "%c", (nl ? '\n' : ';'));
}

INLINE void dv2_fprint(FILE *fp, const double *r, const char *nm,
    const char *fmt, int nl)
{
  int i;
  if (nm) fprintf(fp, "%s: ", nm);
  for (i = 0; i < 2; i++)
    fprintf(fp, fmt, r[i], nl);
  fprintf(fp, "%c", (nl ? '\n' : ';'));
}

INLINE void rv2_fprint(FILE *fp, const real *r, const char *nm,
    const char *fmt, int nl)
{
  int i;
  if (nm) fprintf(fp, "%s: ", nm);
  for (i = 0; i < 2; i++)
    fprintf(fp, fmt, r[i], nl);
  fprintf(fp, "%c", (nl ? '\n' : ';'));
}


/* due to that pointer may overlap with each other,
 * be careful when using the const modifier */
INLINE float *fv2_make(float *x, float a, float b)
{
  x[0] = a;
  x[1] = b;
  return x;
}

/* due to that pointer may overlap with each other,
 * be careful when using the const modifier */
INLINE double *dv2_make(double *x, double a, double b)
{
  x[0] = a;
  x[1] = b;
  return x;
}

/* due to that pointer may overlap with each other,
 * be careful when using the const modifier */
INLINE real *rv2_make(real *x, real a, real b)
{
  x[0] = a;
  x[1] = b;
  return x;
}


#define fv2_makev(rx, x) fv2_make(rx, (float) x[0], (float) x[1])

#define dv2_makev(rx, x) dv2_make(rx, (double) x[0], (double) x[1])

#define rv2_makev(rx, x) rv2_make(rx, (real) x[0], (real) x[1])


#define fv2_zero(x) fv2_make(x, 0, 0)

#define dv2_zero(x) dv2_make(x, 0, 0)

#define rv2_zero(x) rv2_make(x, 0, 0)


INLINE float *fv2_copy(float *x, const float *src)
{
  x[0] = src[0];
  x[1] = src[1];
  return x;
}

INLINE double *dv2_copy(double *x, const double *src)
{
  x[0] = src[0];
  x[1] = src[1];
  return x;
}

INLINE real *rv2_copy(real *x, const real *src)
{
  x[0] = src[0];
  x[1] = src[1];
  return x;
}


/* use macro to avoid const qualifier of src */
#define fv2_ncopy(x, src, n) memcpy(x, src, 2*n*sizeof(float))

/* use macro to avoid const qualifier of src */
#define dv2_ncopy(x, src, n) memcpy(x, src, 2*n*sizeof(double))

/* use macro to avoid const qualifier of src */
#define rv2_ncopy(x, src, n) memcpy(x, src, 2*n*sizeof(real))


INLINE void fv2_swap(float * RESTRICT x, float * RESTRICT y)
{
  float z[2];
  fv2_copy(z, x);
  fv2_copy(x, y);
  fv2_copy(y, z);
}

INLINE void dv2_swap(double * RESTRICT x, double * RESTRICT y)
{
  double z[2];
  dv2_copy(z, x);
  dv2_copy(x, y);
  dv2_copy(y, z);
}

INLINE void rv2_swap(real * RESTRICT x, real * RESTRICT y)
{
  real z[2];
  rv2_copy(z, x);
  rv2_copy(x, y);
  rv2_copy(y, z);
}


INLINE float fv2_sqr(const float *x)
{
  return x[0] * x[0] + x[1] * x[1];
}

INLINE double dv2_sqr(const double *x)
{
  return x[0] * x[0] + x[1] * x[1];
}

INLINE real rv2_sqr(const real *x)
{
  return x[0] * x[0] + x[1] * x[1];
}


INLINE float fv2_norm(const float *x)
{
  return (float) sqrt(fv2_sqr(x));
}

INLINE double dv2_norm(const double *x)
{
  return (double) sqrt(dv2_sqr(x));
}

INLINE real rv2_norm(const real *x)
{
  return (real) sqrt(rv2_sqr(x));
}


INLINE float fv2_dot(const float *x, const float *y)
{
  return x[0] * y[0] + x[1] * y[1];
}

INLINE double dv2_dot(const double *x, const double *y)
{
  return x[0] * y[0] + x[1] * y[1];
}

INLINE real rv2_dot(const real *x, const real *y)
{
  return x[0] * y[0] + x[1] * y[1];
}


INLINE float fv2_cross(const float *x, const float *y)
{
  return x[0] * y[1] - x[1] * y[0];
}

INLINE double dv2_cross(const double *x, const double *y)
{
  return x[0] * y[1] - x[1] * y[0];
}

INLINE real rv2_cross(const real *x, const real *y)
{
  return x[0] * y[1] - x[1] * y[0];
}


INLINE float *fv2_neg(float * RESTRICT x)
{
  x[0] = -x[0];
  x[1] = -x[1];
  return x;
}

INLINE double *dv2_neg(double * RESTRICT x)
{
  x[0] = -x[0];
  x[1] = -x[1];
  return x;
}

INLINE real *rv2_neg(real * RESTRICT x)
{
  x[0] = -x[0];
  x[1] = -x[1];
  return x;
}


INLINE float *fv2_neg2(float * RESTRICT nx, const float *x)
{
  nx[0] = -x[0];
  nx[1] = -x[1];
  return nx;
}

INLINE double *dv2_neg2(double * RESTRICT nx, const double *x)
{
  nx[0] = -x[0];
  nx[1] = -x[1];
  return nx;
}

INLINE real *rv2_neg2(real * RESTRICT nx, const real *x)
{
  nx[0] = -x[0];
  nx[1] = -x[1];
  return nx;
}


INLINE float *fv2_inc(float * RESTRICT x, const float *dx)
{
  x[0] += dx[0];
  x[1] += dx[1];
  return x;
}

INLINE double *dv2_inc(double * RESTRICT x, const double *dx)
{
  x[0] += dx[0];
  x[1] += dx[1];
  return x;
}

INLINE real *rv2_inc(real * RESTRICT x, const real *dx)
{
  x[0] += dx[0];
  x[1] += dx[1];
  return x;
}


INLINE float *fv2_dec(float * RESTRICT x, const float *dx)
{
  x[0] -= dx[0];
  x[1] -= dx[1];
  return x;
}

INLINE double *dv2_dec(double * RESTRICT x, const double *dx)
{
  x[0] -= dx[0];
  x[1] -= dx[1];
  return x;
}

INLINE real *rv2_dec(real * RESTRICT x, const real *dx)
{
  x[0] -= dx[0];
  x[1] -= dx[1];
  return x;
}


INLINE float *fv2_sinc(float * RESTRICT x, const float *dx, float s)
{
  x[0] += s*dx[0];
  x[1] += s*dx[1];
  return x;
}

INLINE double *dv2_sinc(double * RESTRICT x, const double *dx, double s)
{
  x[0] += s*dx[0];
  x[1] += s*dx[1];
  return x;
}

INLINE real *rv2_sinc(real * RESTRICT x, const real *dx, real s)
{
  x[0] += s*dx[0];
  x[1] += s*dx[1];
  return x;
}


INLINE float *fv2_smul(float *x, float s)
{
  x[0] *= s;
  x[1] *= s;
  return x;
}

INLINE double *dv2_smul(double *x, double s)
{
  x[0] *= s;
  x[1] *= s;
  return x;
}

INLINE real *rv2_smul(real *x, real s)
{
  x[0] *= s;
  x[1] *= s;
  return x;
}


INLINE float *fv2_smul2(float * RESTRICT y, const float *x, float s)
{
  y[0] = x[0] * s;
  y[1] = x[1] * s;
  return y;
}

INLINE double *dv2_smul2(double * RESTRICT y, const double *x, double s)
{
  y[0] = x[0] * s;
  y[1] = x[1] * s;
  return y;
}

INLINE real *rv2_smul2(real * RESTRICT y, const real *x, real s)
{
  y[0] = x[0] * s;
  y[1] = x[1] * s;
  return y;
}


INLINE float *fv2_normalize(float *x)
{
  float r = fv2_norm(x);
  if (r > 0.f) fv2_smul(x, 1.f/r);
  return x;
}

INLINE double *dv2_normalize(double *x)
{
  double r = dv2_norm(x);
  if (r > 0.f) dv2_smul(x, 1.f/r);
  return x;
}

INLINE real *rv2_normalize(real *x)
{
  real r = rv2_norm(x);
  if (r > 0.f) rv2_smul(x, 1.f/r);
  return x;
}


INLINE float *fv2_makenorm(float *v, float x, float y)
{
  return fv2_normalize( fv2_make(v, x, y) );
}

INLINE double *dv2_makenorm(double *v, double x, double y)
{
  return dv2_normalize( dv2_make(v, x, y) );
}

INLINE real *rv2_makenorm(real *v, real x, real y)
{
  return rv2_normalize( rv2_make(v, x, y) );
}


/* for in-place difference use fv2_dec */
INLINE float *fv2_diff(float * RESTRICT c, const float *a, const float *b)
{
  c[0] = a[0] - b[0];
  c[1] = a[1] - b[1];
  return c;
}

/* for in-place difference use dv2_dec */
INLINE double *dv2_diff(double * RESTRICT c, const double *a, const double *b)
{
  c[0] = a[0] - b[0];
  c[1] = a[1] - b[1];
  return c;
}

/* for in-place difference use rv2_dec */
INLINE real *rv2_diff(real * RESTRICT c, const real *a, const real *b)
{
  c[0] = a[0] - b[0];
  c[1] = a[1] - b[1];
  return c;
}

/* distance^2 between a and b */
INLINE float fv2_dist2(const float *a, const float *b)
{
  float d0 = a[0] - b[0], d1 = a[1] - b[1];
  return d0*d0 + d1*d1;
}

/* distance^2 between a and b */
INLINE double dv2_dist2(const double *a, const double *b)
{
  double d0 = a[0] - b[0], d1 = a[1] - b[1];
  return d0*d0 + d1*d1;
}

/* distance^2 between a and b */
INLINE real rv2_dist2(const real *a, const real *b)
{
  real d0 = a[0] - b[0], d1 = a[1] - b[1];
  return d0*d0 + d1*d1;
}


/* distance between a and b */
INLINE float fv2_dist(const float *a, const float *b)
{
  float d0 = a[0] - b[0], d1 = a[1] - b[1];
  return (float) sqrt(d0*d0 + d1*d1);
}

/* distance between a and b */
INLINE double dv2_dist(const double *a, const double *b)
{
  double d0 = a[0] - b[0], d1 = a[1] - b[1];
  return (double) sqrt(d0*d0 + d1*d1);
}

/* distance between a and b */
INLINE real rv2_dist(const real *a, const real *b)
{
  real d0 = a[0] - b[0], d1 = a[1] - b[1];
  return (real) sqrt(d0*d0 + d1*d1);
}


/* c = a + b, for in-place addition use fv2_inc */
INLINE float *fv2_add(float * RESTRICT c, const float *a, const float *b)
{
  c[0] = a[0] + b[0];
  c[1] = a[1] + b[1];
  return c;
}

/* c = a + b, for in-place addition use dv2_inc */
INLINE double *dv2_add(double * RESTRICT c, const double *a, const double *b)
{
  c[0] = a[0] + b[0];
  c[1] = a[1] + b[1];
  return c;
}

/* c = a + b, for in-place addition use rv2_inc */
INLINE real *rv2_add(real * RESTRICT c, const real *a, const real *b)
{
  c[0] = a[0] + b[0];
  c[1] = a[1] + b[1];
  return c;
}


/* c = - a - b */
INLINE float *fv2_nadd(float * RESTRICT c, const float *a, const float *b)
{
  c[0] = - a[0] - b[0];
  c[1] = - a[1] - b[1];
  return c;
}

/* c = - a - b */
INLINE double *dv2_nadd(double * RESTRICT c, const double *a, const double *b)
{
  c[0] = - a[0] - b[0];
  c[1] = - a[1] - b[1];
  return c;
}

/* c = - a - b */
INLINE real *rv2_nadd(real * RESTRICT c, const real *a, const real *b)
{
  c[0] = - a[0] - b[0];
  c[1] = - a[1] - b[1];
  return c;
}


/* c = a + b * s */
INLINE float *fv2_sadd(float * RESTRICT c, const float *a, const float *b, float s)
{
  c[0] = a[0] + b[0] * s;
  c[1] = a[1] + b[1] * s;
  return c;
}

/* c = a + b * s */
INLINE double *dv2_sadd(double * RESTRICT c, const double *a, const double *b, double s)
{
  c[0] = a[0] + b[0] * s;
  c[1] = a[1] + b[1] * s;
  return c;
}

/* c = a + b * s */
INLINE real *rv2_sadd(real * RESTRICT c, const real *a, const real *b, real s)
{
  c[0] = a[0] + b[0] * s;
  c[1] = a[1] + b[1] * s;
  return c;
}


/* c = a * s1 + b * s2 */
INLINE float *fv2_lincomb2(float * RESTRICT c, const float *a, const float *b, float s1, float s2)
{
  c[0] = a[0] * s1 + b[0] * s2;
  c[1] = a[1] * s1 + b[1] * s2;
  return c;
}

/* c = a * s1 + b * s2 */
INLINE double *dv2_lincomb2(double * RESTRICT c, const double *a, const double *b, double s1, double s2)
{
  c[0] = a[0] * s1 + b[0] * s2;
  c[1] = a[1] * s1 + b[1] * s2;
  return c;
}

/* c = a * s1 + b * s2 */
INLINE real *rv2_lincomb2(real * RESTRICT c, const real *a, const real *b, real s1, real s2)
{
  c[0] = a[0] * s1 + b[0] * s2;
  c[1] = a[1] * s1 + b[1] * s2;
  return c;
}


/* cosine of the angle of x1-x2-x3 */
INLINE float fv2_cosang(const float *x1, const float *x2, const float *x3,
    float *g1, float *g2, float *g3)
{
  float a[2], b[2], ra, rb, dot;

  ra = fv2_norm(fv2_diff(a, x1, x2));
  fv2_smul(a, 1.f/ra);
  rb = fv2_norm(fv2_diff(b, x3, x2));
  fv2_smul(b, 1.f/rb);
  dot = fv2_dot(a, b);
  if (dot > 1) dot = 1; else if (dot < -1) dot = -1;
  if (g1) {
    fv2_lincomb2(g1, b, a, 1.f/ra, -dot/ra);
    fv2_lincomb2(g3, a, b, 1.f/rb, -dot/rb);
    fv2_nadd(g2, g1, g3);
  }
  return dot;
}

/* cosine of the angle of x1-x2-x3 */
INLINE double dv2_cosang(const double *x1, const double *x2, const double *x3,
    double *g1, double *g2, double *g3)
{
  double a[2], b[2], ra, rb, dot;

  ra = dv2_norm(dv2_diff(a, x1, x2));
  dv2_smul(a, 1.f/ra);
  rb = dv2_norm(dv2_diff(b, x3, x2));
  dv2_smul(b, 1.f/rb);
  dot = dv2_dot(a, b);
  if (dot > 1) dot = 1; else if (dot < -1) dot = -1;
  if (g1) {
    dv2_lincomb2(g1, b, a, 1.f/ra, -dot/ra);
    dv2_lincomb2(g3, a, b, 1.f/rb, -dot/rb);
    dv2_nadd(g2, g1, g3);
  }
  return dot;
}

/* cosine of the angle of x1-x2-x3 */
INLINE real rv2_cosang(const real *x1, const real *x2, const real *x3,
    real *g1, real *g2, real *g3)
{
  real a[2], b[2], ra, rb, dot;

  ra = rv2_norm(rv2_diff(a, x1, x2));
  rv2_smul(a, 1.f/ra);
  rb = rv2_norm(rv2_diff(b, x3, x2));
  rv2_smul(b, 1.f/rb);
  dot = rv2_dot(a, b);
  if (dot > 1) dot = 1; else if (dot < -1) dot = -1;
  if (g1) {
    rv2_lincomb2(g1, b, a, 1.f/ra, -dot/ra);
    rv2_lincomb2(g3, a, b, 1.f/rb, -dot/rb);
    rv2_nadd(g2, g1, g3);
  }
  return dot;
}


/* angle and gradients of x1-x2-x3 */
INLINE float fv2_ang(const float *x1, const float *x2, const float *x3,
    float * RESTRICT g1, float * RESTRICT g2, float * RESTRICT g3)
{
  float dot, sn;
  dot = fv2_cosang(x1, x2, x3, g1, g2, g3);
  sn = (float) sqrt(1 - dot*dot);
  if (sn > 1e-7) sn = -1/sn; else sn = 0.;
  if (g1) {
    fv2_smul(g1, sn);
    fv2_smul(g2, sn);
    fv2_smul(g3, sn);
  }
  return (float) acos(dot);
}

/* angle and gradients of x1-x2-x3 */
INLINE double dv2_ang(const double *x1, const double *x2, const double *x3,
    double * RESTRICT g1, double * RESTRICT g2, double * RESTRICT g3)
{
  double dot, sn;
  dot = dv2_cosang(x1, x2, x3, g1, g2, g3);
  sn = (double) sqrt(1 - dot*dot);
  if (sn > 1e-7) sn = -1/sn; else sn = 0.;
  if (g1) {
    dv2_smul(g1, sn);
    dv2_smul(g2, sn);
    dv2_smul(g3, sn);
  }
  return (double) acos(dot);
}

/* angle and gradients of x1-x2-x3 */
INLINE real rv2_ang(const real *x1, const real *x2, const real *x3,
    real * RESTRICT g1, real * RESTRICT g2, real * RESTRICT g3)
{
  real dot, sn;
  dot = rv2_cosang(x1, x2, x3, g1, g2, g3);
  sn = (real) sqrt(1 - dot*dot);
  if (sn > 1e-7) sn = -1/sn; else sn = 0.;
  if (g1) {
    rv2_smul(g1, sn);
    rv2_smul(g2, sn);
    rv2_smul(g3, sn);
  }
  return (real) acos(dot);
}


/* vertical distance from x to line a-b */
INLINE real rv2_vdist(const real *x, const real *a, const real *b)
{
  real nm[2], d[2], dot;

  rv2_diff(d, x, a);
  rv2_normalize(rv2_diff(nm, a, b));
  dot = rv2_dot(d, nm);
  return rv2_norm(rv2_sinc(d, nm, -dot));
}


/* determinant of a 2x2 matrix */
INLINE real rm2_det(real a[2][2])
{
  return a[0][0]*a[1][1] - a[0][1]*a[1][0];
}


/* inverse matrix b = a^(-1) */
INLINE void rm2_inv(real b[2][2], real a[2][2])
{
  real det = rm2_det(a);
  if (fabs(det) < 1e-30) det = (det < 0) ? -1e-30f: 1e-30f;
  b[0][0] =  a[1][1]/det;
  b[0][1] = -a[0][1]/det;
  b[1][0] = -a[1][0]/det;
  b[1][1] =  a[0][0]/det;
}


#define fv2_rnd0() fv2_rnd(v, 0, 1)

#define dv2_rnd0() dv2_rnd(v, 0, 1)

#define rv2_rnd0() rv2_rnd(v, 0, 1)

/* uniformly distributed random vector [a, a + b) */
INLINE float *fv2_rnd(float *v, float a, float b)
{
  b -= a;
  v[0] = a + b * (float) rnd0();
  v[1] = a + b * (float) rnd0();
  return v;
}

/* uniformly distributed random vector [a, a + b) */
INLINE double *dv2_rnd(double *v, double a, double b)
{
  b -= a;
  v[0] = a + b * (double) rnd0();
  v[1] = a + b * (double) rnd0();
  return v;
}

/* uniformly distributed random vector [a, a + b) */
INLINE real *rv2_rnd(real *v, real a, real b)
{
  b -= a;
  v[0] = a + b * (real) rnd0();
  v[1] = a + b * (real) rnd0();
  return v;
}


/* displace `x0' by a random vector in [-a, a)^2 */
INLINE real *rv2_rnddisp(real * RESTRICT x, const real *x0, real a)
{
  x[0] = x0[0] + (real) rnd(-a, a);
  x[1] = x0[1] + (real) rnd(-a, a);
  return x;
}


/* normally distributed random vector */
INLINE float *fv2_grand0(float *v)
{
  v[0] = (float) grand0();
  v[1] = (float) grand0();
  return v;
}

/* normally distributed random vector */
INLINE double *dv2_grand0(double *v)
{
  v[0] = (double) grand0();
  v[1] = (double) grand0();
  return v;
}

/* normally distributed random vector */
INLINE real *rv2_grand0(real *v)
{
  v[0] = (real) grand0();
  v[1] = (real) grand0();
  return v;
}


/* normally distributed random vector */
INLINE float *fv2_grand(float *v, float c, float r)
{
  v[0] = c + r * (float) grand0();
  v[1] = c + r * (float) grand0();
  return v;
}

/* normally distributed random vector */
INLINE double *dv2_grand(double *v, double c, double r)
{
  v[0] = c + r * (double) grand0();
  v[1] = c + r * (double) grand0();
  return v;
}

/* normally distributed random vector */
INLINE real *rv2_grand(real *v, real c, real r)
{
  v[0] = c + r * (real) grand0();
  v[1] = c + r * (real) grand0();
  return v;
}


/* displace `x0' by a normally-distributed random vector */
INLINE real *rv2_granddisp(real * RESTRICT x, const real *x0, real a)
{
  x[0] = x0[0] + (real) grand0() * a;
  x[1] = x0[1] + (real) grand0() * a;
  return x;
}


/* randomly oriented vector on the sphere of radius r */
#define fv2_rnddir(v, r) fv2_smul(fv2_rnddir0(v), r)

/* randomly oriented vector on the sphere of radius r */
#define dv2_rnddir(v, r) dv2_smul(dv2_rnddir0(v), r)

/* randomly oriented vector on the sphere of radius r */
#define rv2_rnddir(v, r) rv2_smul(rv2_rnddir0(v), r)

/* randomly oriented vector on the unit sphere */
INLINE float *fv2_rnddir0(float *v)
{
  do {
    fv2_rnd(v, -1, 1);
  } while (fv2_sqr(v) >= 1);
  return fv2_normalize(v);
}

/* randomly oriented vector on the unit sphere */
INLINE double *dv2_rnddir0(double *v)
{
  do {
    dv2_rnd(v, -1, 1);
  } while (dv2_sqr(v) >= 1);
  return dv2_normalize(v);
}

/* randomly oriented vector on the unit sphere */
INLINE real *rv2_rnddir0(real *v)
{
  do {
    rv2_rnd(v, -1, 1);
  } while (rv2_sqr(v) >= 1);
  return rv2_normalize(v);
}


#define rv2_rndball rv2_rnddisk

#define rv2_rndball0 rv2_rnddisk0

/* randomly orientied vector within the sphere of radius `r' */
#define fv2_rnddisk(v, r) fv2_smul(fv2_rnddisk0(v), r)

/* randomly orientied vector within the sphere of radius `r' */
#define dv2_rnddisk(v, r) dv2_smul(dv2_rnddisk0(v), r)

/* randomly orientied vector within the sphere of radius `r' */
#define rv2_rnddisk(v, r) rv2_smul(rv2_rnddisk0(v), r)

/* randomly vector within the unit sphere */
INLINE float *fv2_rnddisk0(float *v)
{
  do {
    fv2_rnd(v, -1, 1);
  } while (fv2_sqr(v) >= 1);
  return v;
}

/* randomly vector within the unit sphere */
INLINE double *dv2_rnddisk0(double *v)
{
  do {
    dv2_rnd(v, -1, 1);
  } while (dv2_sqr(v) >= 1);
  return v;
}

/* randomly vector within the unit sphere */
INLINE real *rv2_rnddisk0(real *v)
{
  do {
    rv2_rnd(v, -1, 1);
  } while (rv2_sqr(v) >= 1);
  return v;
}


#ifndef FM2_T
#define FM2_T fm2_t
typedef float fm2_t[2];
#endif

#ifndef DM2_T
#define DM2_T dm2_t
typedef double dm2_t[2];
#endif

#ifndef RM2_T
#define RM2_T rm2_t
typedef real rm2_t[2];
#endif


#define fm2_print(r, nm, fmt, nl) fm2_fprint(stdout, r, nm, fmt, nl)

#define dm2_print(r, nm, fmt, nl) dm2_fprint(stdout, r, nm, fmt, nl)

#define rm2_print(r, nm, fmt, nl) rm2_fprint(stdout, r, nm, fmt, nl)

INLINE void fm2_fprint(FILE *fp, float r[2][2], const char *nm,
    const char *fmt, int nl)
{
  int i, j;
  if (nm) fprintf(fp, "%s:%c", nm, (nl ? '\n' : ' '));
  for (i = 0; i < 2; i++) {
    for (j = 0; j < 2; j++) {
      fprintf(fp, fmt, r[i][j], nl);
    }
    fprintf(fp, "%s", (nl ? "\n" : "; "));
  }
}

INLINE void dm2_fprint(FILE *fp, double r[2][2], const char *nm,
    const char *fmt, int nl)
{
  int i, j;
  if (nm) fprintf(fp, "%s:%c", nm, (nl ? '\n' : ' '));
  for (i = 0; i < 2; i++) {
    for (j = 0; j < 2; j++) {
      fprintf(fp, fmt, r[i][j], nl);
    }
    fprintf(fp, "%s", (nl ? "\n" : "; "));
  }
}

INLINE void rm2_fprint(FILE *fp, real r[2][2], const char *nm,
    const char *fmt, int nl)
{
  int i, j;
  if (nm) fprintf(fp, "%s:%c", nm, (nl ? '\n' : ' '));
  for (i = 0; i < 2; i++) {
    for (j = 0; j < 2; j++) {
      fprintf(fp, fmt, r[i][j], nl);
    }
    fprintf(fp, "%s", (nl ? "\n" : "; "));
  }
}


INLINE fv2_t *fm2_make(float x[2][2], float a00, float a01, float a10, float a11)
{
  fv2_make(x[0], a00, a01);
  fv2_make(x[1], a10, a11);
  return x;
}

INLINE dv2_t *dm2_make(double x[2][2], double a00, double a01, double a10, double a11)
{
  dv2_make(x[0], a00, a01);
  dv2_make(x[1], a10, a11);
  return x;
}

INLINE rv2_t *rm2_make(real x[2][2], real a00, real a01, real a10, real a11)
{
  rv2_make(x[0], a00, a01);
  rv2_make(x[1], a10, a11);
  return x;
}


#define fm2_makem(rx, x) fm2_make(rx, (float) x[0][0], (float) x[0][1], \
    (float) x[1][0], (float) x[1][1])

#define dm2_makem(rx, x) dm2_make(rx, (double) x[0][0], (double) x[0][1], \
    (double) x[1][0], (double) x[1][1])

#define rm2_makem(rx, x) rm2_make(rx, (real) x[0][0], (real) x[0][1], \
    (real) x[1][0], (real) x[1][1])


#define fm2_zero(x) fm2_make(x, 0, 0, 0, 0)

#define dm2_zero(x) dm2_make(x, 0, 0, 0, 0)

#define rm2_zero(x) rm2_make(x, 0, 0, 0, 0)


#define fm2_one(x) fm2_make(x, 1, 0, 0, 1)

#define dm2_one(x) dm2_make(x, 1, 0, 0, 1)

#define rm2_one(x) rm2_make(x, 1, 0, 0, 1)


/* generate a random orthonormal (unitary) 2x2 matrix */
INLINE fv2_t *fm2_rnduni(float a[2][2])
{
  fv2_rnddir0(a[0]);
  fv2_make(a[1], a[0][1], -a[0][0]);
  if (rnd0() > 0.5) fv2_neg(a[1]);
  return a;
}

/* generate a random orthonormal (unitary) 2x2 matrix */
INLINE dv2_t *dm2_rnduni(double a[2][2])
{
  dv2_rnddir0(a[0]);
  dv2_make(a[1], a[0][1], -a[0][0]);
  if (rnd0() > 0.5) dv2_neg(a[1]);
  return a;
}

/* generate a random orthonormal (unitary) 2x2 matrix */
INLINE rv2_t *rm2_rnduni(real a[2][2])
{
  rv2_rnddir0(a[0]);
  rv2_make(a[1], a[0][1], -a[0][0]);
  if (rnd0() > 0.5) rv2_neg(a[1]);
  return a;
}


#endif /* ZCOM_RV2__ */
#endif /* ZCOM_RV2 */


#ifdef  ZCOM_RV3
#ifndef ZCOM_RV3__
#define ZCOM_RV3__

#ifndef FV3_T
#define FV3_T fv3_t
typedef float fv3_t[3];
#endif

#ifndef DV3_T
#define DV3_T dv3_t
typedef double dv3_t[3];
#endif

#ifndef RV3_T
#define RV3_T rv3_t
typedef real rv3_t[3];
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <float.h>


#define fv3_print(r, nm, fmt, nl) fv3_fprint(stdout, r, nm, fmt, nl)

#define dv3_print(r, nm, fmt, nl) dv3_fprint(stdout, r, nm, fmt, nl)

#define rv3_print(r, nm, fmt, nl) rv3_fprint(stdout, r, nm, fmt, nl)

INLINE void fv3_fprint(FILE *fp, float *r, const char *nm, const char *fmt, int nl)
{
  int i;

  if (nm) fprintf(fp, "%s: ", nm);
  for (i = 0; i < 3; i++) fprintf(fp, fmt, r[i]);
  fprintf(fp, "%c", nl ? '\n' : ';');
}

INLINE void dv3_fprint(FILE *fp, double *r, const char *nm, const char *fmt, int nl)
{
  int i;

  if (nm) fprintf(fp, "%s: ", nm);
  for (i = 0; i < 3; i++) fprintf(fp, fmt, r[i]);
  fprintf(fp, "%c", nl ? '\n' : ';');
}

INLINE void rv3_fprint(FILE *fp, real *r, const char *nm, const char *fmt, int nl)
{
  int i;

  if (nm) fprintf(fp, "%s: ", nm);
  for (i = 0; i < 3; i++) fprintf(fp, fmt, r[i]);
  fprintf(fp, "%c", nl ? '\n' : ';');
}


/* due to possible pointer overlap, 'const' are not add to some parameters */

INLINE float *fv3_make(float *x, float a, float b, float c)
{
  x[0] = a;
  x[1] = b;
  x[2] = c;
  return x;
}

INLINE double *dv3_make(double *x, double a, double b, double c)
{
  x[0] = a;
  x[1] = b;
  x[2] = c;
  return x;
}

INLINE real *rv3_make(real *x, real a, real b, real c)
{
  x[0] = a;
  x[1] = b;
  x[2] = c;
  return x;
}


#define fv3_makev(rv, v) fv3_make(rv, (float) v[0], (float) v[1], (float) v[2])

#define dv3_makev(rv, v) dv3_make(rv, (double) v[0], (double) v[1], (double) v[2])

#define rv3_makev(rv, v) rv3_make(rv, (real) v[0], (real) v[1], (real) v[2])


#define fv3_zero(x) fv3_make(x, 0, 0, 0)

#define dv3_zero(x) dv3_make(x, 0, 0, 0)

#define rv3_zero(x) rv3_make(x, 0, 0, 0)


INLINE float *fv3_copy(float * RESTRICT x, const float *src)
{
  x[0] = src[0];
  x[1] = src[1];
  x[2] = src[2];
  return x;
}

INLINE double *dv3_copy(double * RESTRICT x, const double *src)
{
  x[0] = src[0];
  x[1] = src[1];
  x[2] = src[2];
  return x;
}

INLINE real *rv3_copy(real * RESTRICT x, const real *src)
{
  x[0] = src[0];
  x[1] = src[1];
  x[2] = src[2];
  return x;
}


/* use macro to avoid const qualifier of src */
#define fv3_ncopy(x, src, n) memcpy(x, src, 3*n*sizeof(float))

/* use macro to avoid const qualifier of src */
#define dv3_ncopy(x, src, n) memcpy(x, src, 3*n*sizeof(double))

/* use macro to avoid const qualifier of src */
#define rv3_ncopy(x, src, n) memcpy(x, src, 3*n*sizeof(real))


INLINE void fv3_swap(float * RESTRICT x, float * RESTRICT y)
{
  float z[3];
  fv3_copy(z, x);
  fv3_copy(x, y);
  fv3_copy(y, z);
}

INLINE void dv3_swap(double * RESTRICT x, double * RESTRICT y)
{
  double z[3];
  dv3_copy(z, x);
  dv3_copy(x, y);
  dv3_copy(y, z);
}

INLINE void rv3_swap(real * RESTRICT x, real * RESTRICT y)
{
  real z[3];
  rv3_copy(z, x);
  rv3_copy(x, y);
  rv3_copy(y, z);
}


INLINE float fv3_sqr(const float *x)
{
  return x[0]*x[0] + x[1]*x[1] + x[2]*x[2];
}

INLINE double dv3_sqr(const double *x)
{
  return x[0]*x[0] + x[1]*x[1] + x[2]*x[2];
}

INLINE real rv3_sqr(const real *x)
{
  return x[0]*x[0] + x[1]*x[1] + x[2]*x[2];
}


INLINE float fv3_norm(const float *x)
{
  return (float) sqrt( fv3_sqr(x) );
}

INLINE double dv3_norm(const double *x)
{
  return (double) sqrt( dv3_sqr(x) );
}

INLINE real rv3_norm(const real *x)
{
  return (real) sqrt( rv3_sqr(x) );
}


/* if x == y, try to use sqr */
INLINE float fv3_dot(const float *x, const float *y)
{
  return x[0]*y[0] + x[1]*y[1] + x[2]*y[2];
}

/* if x == y, try to use sqr */
INLINE double dv3_dot(const double *x, const double *y)
{
  return x[0]*y[0] + x[1]*y[1] + x[2]*y[2];
}

/* if x == y, try to use sqr */
INLINE real rv3_dot(const real *x, const real *y)
{
  return x[0]*y[0] + x[1]*y[1] + x[2]*y[2];
}


INLINE float *fv3_cross(float * RESTRICT z, const float *x, const float *y)
{
  z[0] = x[1]*y[2] - x[2]*y[1];
  z[1] = x[2]*y[0] - x[0]*y[2];
  z[2] = x[0]*y[1] - x[1]*y[0];
  return z;
}

INLINE double *dv3_cross(double * RESTRICT z, const double *x, const double *y)
{
  z[0] = x[1]*y[2] - x[2]*y[1];
  z[1] = x[2]*y[0] - x[0]*y[2];
  z[2] = x[0]*y[1] - x[1]*y[0];
  return z;
}

INLINE real *rv3_cross(real * RESTRICT z, const real *x, const real *y)
{
  z[0] = x[1]*y[2] - x[2]*y[1];
  z[1] = x[2]*y[0] - x[0]*y[2];
  z[2] = x[0]*y[1] - x[1]*y[0];
  return z;
}


INLINE float *fv3_neg(float *x)
{
  x[0] = -x[0];
  x[1] = -x[1];
  x[2] = -x[2];
  return x;
}

INLINE double *dv3_neg(double *x)
{
  x[0] = -x[0];
  x[1] = -x[1];
  x[2] = -x[2];
  return x;
}

INLINE real *rv3_neg(real *x)
{
  x[0] = -x[0];
  x[1] = -x[1];
  x[2] = -x[2];
  return x;
}


INLINE float *fv3_neg2(float * RESTRICT nx, const float *x)
{
  nx[0] = -x[0];
  nx[1] = -x[1];
  nx[2] = -x[2];
  return nx;
}

INLINE double *dv3_neg2(double * RESTRICT nx, const double *x)
{
  nx[0] = -x[0];
  nx[1] = -x[1];
  nx[2] = -x[2];
  return nx;
}

INLINE real *rv3_neg2(real * RESTRICT nx, const real *x)
{
  nx[0] = -x[0];
  nx[1] = -x[1];
  nx[2] = -x[2];
  return nx;
}


INLINE float *fv3_inc(float * RESTRICT x, const float *dx)
{
  x[0] += dx[0];
  x[1] += dx[1];
  x[2] += dx[2];
  return x;
}

INLINE double *dv3_inc(double * RESTRICT x, const double *dx)
{
  x[0] += dx[0];
  x[1] += dx[1];
  x[2] += dx[2];
  return x;
}

INLINE real *rv3_inc(real * RESTRICT x, const real *dx)
{
  x[0] += dx[0];
  x[1] += dx[1];
  x[2] += dx[2];
  return x;
}


INLINE float *fv3_dec(float *x, const float *dx)
{
  x[0] -= dx[0];
  x[1] -= dx[1];
  x[2] -= dx[2];
  return x;
}

INLINE double *dv3_dec(double *x, const double *dx)
{
  x[0] -= dx[0];
  x[1] -= dx[1];
  x[2] -= dx[2];
  return x;
}

INLINE real *rv3_dec(real *x, const real *dx)
{
  x[0] -= dx[0];
  x[1] -= dx[1];
  x[2] -= dx[2];
  return x;
}


INLINE float *fv3_sinc(float * RESTRICT x, const float *dx, float s)
{
  x[0] += dx[0] * s;
  x[1] += dx[1] * s;
  x[2] += dx[2] * s;
  return x;
}

INLINE double *dv3_sinc(double * RESTRICT x, const double *dx, double s)
{
  x[0] += dx[0] * s;
  x[1] += dx[1] * s;
  x[2] += dx[2] * s;
  return x;
}

INLINE real *rv3_sinc(real * RESTRICT x, const real *dx, real s)
{
  x[0] += dx[0] * s;
  x[1] += dx[1] * s;
  x[2] += dx[2] * s;
  return x;
}


INLINE float *fv3_smul(float *x, float s)
{
  x[0] *= s;
  x[1] *= s;
  x[2] *= s;
  return x;
}

INLINE double *dv3_smul(double *x, double s)
{
  x[0] *= s;
  x[1] *= s;
  x[2] *= s;
  return x;
}

INLINE real *rv3_smul(real *x, real s)
{
  x[0] *= s;
  x[1] *= s;
  x[2] *= s;
  return x;
}


/* if y == x, just use smul */
INLINE float *fv3_smul2(float * RESTRICT y, const float *x, float s)
{
  y[0] = x[0] * s;
  y[1] = x[1] * s;
  y[2] = x[2] * s;
  return y;
}

/* if y == x, just use smul */
INLINE double *dv3_smul2(double * RESTRICT y, const double *x, double s)
{
  y[0] = x[0] * s;
  y[1] = x[1] * s;
  y[2] = x[2] * s;
  return y;
}

/* if y == x, just use smul */
INLINE real *rv3_smul2(real * RESTRICT y, const real *x, real s)
{
  y[0] = x[0] * s;
  y[1] = x[1] * s;
  y[2] = x[2] * s;
  return y;
}


INLINE float *fv3_normalize(float *x)
{
  float r = fv3_norm(x);
  if (r > 0.f) fv3_smul(x, 1.f/r);
  return x;
}

INLINE double *dv3_normalize(double *x)
{
  double r = dv3_norm(x);
  if (r > 0.f) dv3_smul(x, 1.f/r);
  return x;
}

INLINE real *rv3_normalize(real *x)
{
  real r = rv3_norm(x);
  if (r > 0.f) rv3_smul(x, 1.f/r);
  return x;
}


INLINE float *fv3_makenorm(float *v, float x, float y, float z)
{
  return fv3_normalize( fv3_make(v, x, y, z) );
}

INLINE double *dv3_makenorm(double *v, double x, double y, double z)
{
  return dv3_normalize( dv3_make(v, x, y, z) );
}

INLINE real *rv3_makenorm(real *v, real x, real y, real z)
{
  return rv3_normalize( rv3_make(v, x, y, z) );
}


/* for in-place difference use fv3_dec */
INLINE float *fv3_diff(float * RESTRICT c, const float *a, const float *b)
{
  c[0] = a[0] - b[0];
  c[1] = a[1] - b[1];
  c[2] = a[2] - b[2];
  return c;
}

/* for in-place difference use dv3_dec */
INLINE double *dv3_diff(double * RESTRICT c, const double *a, const double *b)
{
  c[0] = a[0] - b[0];
  c[1] = a[1] - b[1];
  c[2] = a[2] - b[2];
  return c;
}

/* for in-place difference use rv3_dec */
INLINE real *rv3_diff(real * RESTRICT c, const real *a, const real *b)
{
  c[0] = a[0] - b[0];
  c[1] = a[1] - b[1];
  c[2] = a[2] - b[2];
  return c;
}


/* distance^2 between a and b */
INLINE float fv3_dist2(const float *a, const float *b)
{
  float d0 = a[0] - b[0], d1 = a[1] - b[1], d2 = a[2] - b[2];
  return d0*d0 + d1*d1 + d2*d2;
}

/* distance^2 between a and b */
INLINE double dv3_dist2(const double *a, const double *b)
{
  double d0 = a[0] - b[0], d1 = a[1] - b[1], d2 = a[2] - b[2];
  return d0*d0 + d1*d1 + d2*d2;
}

/* distance^2 between a and b */
INLINE real rv3_dist2(const real *a, const real *b)
{
  real d0 = a[0] - b[0], d1 = a[1] - b[1], d2 = a[2] - b[2];
  return d0*d0 + d1*d1 + d2*d2;
}


/* distance between a and b */
INLINE float fv3_dist(const float *a, const float *b)
{
  float d0 = a[0] - b[0], d1 = a[1] - b[1], d2 = a[2] - b[2];
  return (float) sqrt(d0*d0 + d1*d1 + d2*d2);
}

/* distance between a and b */
INLINE double dv3_dist(const double *a, const double *b)
{
  double d0 = a[0] - b[0], d1 = a[1] - b[1], d2 = a[2] - b[2];
  return (double) sqrt(d0*d0 + d1*d1 + d2*d2);
}

/* distance between a and b */
INLINE real rv3_dist(const real *a, const real *b)
{
  real d0 = a[0] - b[0], d1 = a[1] - b[1], d2 = a[2] - b[2];
  return (real) sqrt(d0*d0 + d1*d1 + d2*d2);
}


/* c = a + b, for in-place addition use fv3_inc */
INLINE float *fv3_add(float * RESTRICT c, const float *a, const float *b)
{
  c[0] = a[0] + b[0];
  c[1] = a[1] + b[1];
  c[2] = a[2] + b[2];
  return c;
}

/* c = a + b, for in-place addition use dv3_inc */
INLINE double *dv3_add(double * RESTRICT c, const double *a, const double *b)
{
  c[0] = a[0] + b[0];
  c[1] = a[1] + b[1];
  c[2] = a[2] + b[2];
  return c;
}

/* c = a + b, for in-place addition use rv3_inc */
INLINE real *rv3_add(real * RESTRICT c, const real *a, const real *b)
{
  c[0] = a[0] + b[0];
  c[1] = a[1] + b[1];
  c[2] = a[2] + b[2];
  return c;
}


/* c = - a - b */
INLINE float *fv3_nadd(float * RESTRICT c, const float *a, const float *b)
{
  c[0] = - a[0] - b[0];
  c[1] = - a[1] - b[1];
  c[2] = - a[2] - b[2];
  return c;
}

/* c = - a - b */
INLINE double *dv3_nadd(double * RESTRICT c, const double *a, const double *b)
{
  c[0] = - a[0] - b[0];
  c[1] = - a[1] - b[1];
  c[2] = - a[2] - b[2];
  return c;
}

/* c = - a - b */
INLINE real *rv3_nadd(real * RESTRICT c, const real *a, const real *b)
{
  c[0] = - a[0] - b[0];
  c[1] = - a[1] - b[1];
  c[2] = - a[2] - b[2];
  return c;
}


/* c = a + b * s */
INLINE float *fv3_sadd(float * RESTRICT c, const float *a, const float *b, float s)
{
  c[0] = a[0] + b[0] * s;
  c[1] = a[1] + b[1] * s;
  c[2] = a[2] + b[2] * s;
  return c;
}

/* c = a + b * s */
INLINE double *dv3_sadd(double * RESTRICT c, const double *a, const double *b, double s)
{
  c[0] = a[0] + b[0] * s;
  c[1] = a[1] + b[1] * s;
  c[2] = a[2] + b[2] * s;
  return c;
}

/* c = a + b * s */
INLINE real *rv3_sadd(real * RESTRICT c, const real *a, const real *b, real s)
{
  c[0] = a[0] + b[0] * s;
  c[1] = a[1] + b[1] * s;
  c[2] = a[2] + b[2] * s;
  return c;
}


/* c = a * s1 + b * s2 */
INLINE float *fv3_lincomb2(float * RESTRICT c, const float *a, const float *b,
    float s1, float s2)
{
  c[0] = a[0] * s1 + b[0] * s2;
  c[1] = a[1] * s1 + b[1] * s2;
  c[2] = a[2] * s1 + b[2] * s2;
  return c;
}

/* c = a * s1 + b * s2 */
INLINE double *dv3_lincomb2(double * RESTRICT c, const double *a, const double *b,
    double s1, double s2)
{
  c[0] = a[0] * s1 + b[0] * s2;
  c[1] = a[1] * s1 + b[1] * s2;
  c[2] = a[2] * s1 + b[2] * s2;
  return c;
}

/* c = a * s1 + b * s2 */
INLINE real *rv3_lincomb2(real * RESTRICT c, const real *a, const real *b,
    real s1, real s2)
{
  c[0] = a[0] * s1 + b[0] * s2;
  c[1] = a[1] * s1 + b[1] * s2;
  c[2] = a[2] * s1 + b[2] * s2;
  return c;
}


/* angle and gradients of cos(x1-x2-x3) */
INLINE float fv3_cosang(const float *x1, const float *x2, const float *x3,
    float * RESTRICT g1, float * RESTRICT g2, float * RESTRICT g3)
{
  float a[3], b[3], ra, rb, dot;

  ra = fv3_norm(fv3_diff(a, x1, x2));
  fv3_smul(a, 1.f/ra);
  rb = fv3_norm(fv3_diff(b, x3, x2));
  fv3_smul(b, 1.f/rb);
  dot = fv3_dot(a, b);
  if (dot > 1) dot = 1; else if (dot < -1) dot = -1;
  if (g1) {
    fv3_lincomb2(g1, b, a, 1.f/ra, -dot/ra);
    fv3_lincomb2(g3, a, b, 1.f/rb, -dot/rb);
    fv3_nadd(g2, g1, g3);
  }
  return dot;
}

/* angle and gradients of cos(x1-x2-x3) */
INLINE double dv3_cosang(const double *x1, const double *x2, const double *x3,
    double * RESTRICT g1, double * RESTRICT g2, double * RESTRICT g3)
{
  double a[3], b[3], ra, rb, dot;

  ra = dv3_norm(dv3_diff(a, x1, x2));
  dv3_smul(a, 1.f/ra);
  rb = dv3_norm(dv3_diff(b, x3, x2));
  dv3_smul(b, 1.f/rb);
  dot = dv3_dot(a, b);
  if (dot > 1) dot = 1; else if (dot < -1) dot = -1;
  if (g1) {
    dv3_lincomb2(g1, b, a, 1.f/ra, -dot/ra);
    dv3_lincomb2(g3, a, b, 1.f/rb, -dot/rb);
    dv3_nadd(g2, g1, g3);
  }
  return dot;
}

/* angle and gradients of cos(x1-x2-x3) */
INLINE real rv3_cosang(const real *x1, const real *x2, const real *x3,
    real * RESTRICT g1, real * RESTRICT g2, real * RESTRICT g3)
{
  real a[3], b[3], ra, rb, dot;

  ra = rv3_norm(rv3_diff(a, x1, x2));
  rv3_smul(a, 1.f/ra);
  rb = rv3_norm(rv3_diff(b, x3, x2));
  rv3_smul(b, 1.f/rb);
  dot = rv3_dot(a, b);
  if (dot > 1) dot = 1; else if (dot < -1) dot = -1;
  if (g1) {
    rv3_lincomb2(g1, b, a, 1.f/ra, -dot/ra);
    rv3_lincomb2(g3, a, b, 1.f/rb, -dot/rb);
    rv3_nadd(g2, g1, g3);
  }
  return dot;
}


/* angle and gradients of x1-x2-x3 */
INLINE float fv3_ang(const float *x1, const float *x2, const float *x3,
    float * RESTRICT g1, float * RESTRICT g2, float * RESTRICT g3)
{
  float dot, sn;

  dot = fv3_cosang(x1, x2, x3, g1, g2, g3);
  sn = (float) sqrt(1 - dot*dot);
  if (sn < 1e-7) sn = 1; else sn = -1.f/sn;
  if (g1) {
    fv3_smul(g1, sn);
    fv3_smul(g2, sn);
    fv3_smul(g3, sn);
  }
  return (float) acos(dot);
}

/* angle and gradients of x1-x2-x3 */
INLINE double dv3_ang(const double *x1, const double *x2, const double *x3,
    double * RESTRICT g1, double * RESTRICT g2, double * RESTRICT g3)
{
  double dot, sn;

  dot = dv3_cosang(x1, x2, x3, g1, g2, g3);
  sn = (double) sqrt(1 - dot*dot);
  if (sn < 1e-7) sn = 1; else sn = -1.f/sn;
  if (g1) {
    dv3_smul(g1, sn);
    dv3_smul(g2, sn);
    dv3_smul(g3, sn);
  }
  return (double) acos(dot);
}

/* angle and gradients of x1-x2-x3 */
INLINE real rv3_ang(const real *x1, const real *x2, const real *x3,
    real * RESTRICT g1, real * RESTRICT g2, real * RESTRICT g3)
{
  real dot, sn;

  dot = rv3_cosang(x1, x2, x3, g1, g2, g3);
  sn = (real) sqrt(1 - dot*dot);
  if (sn < 1e-7) sn = 1; else sn = -1.f/sn;
  if (g1) {
    rv3_smul(g1, sn);
    rv3_smul(g2, sn);
    rv3_smul(g3, sn);
  }
  return (real) acos(dot);
}


/* vertical distance from `x' to the line extended by `a' and `b' */
INLINE float fv3_vdist(const float *x, const float *a, const float *b)
{
  float nm[3], d[3], dot;

  fv3_diff(d, x, a);
  fv3_normalize(fv3_diff(nm, a, b));
  dot = fv3_dot(d, nm);
  return fv3_norm(fv3_sinc(d, nm, -dot));
}

/* vertical distance from `x' to the line extended by `a' and `b' */
INLINE double dv3_vdist(const double *x, const double *a, const double *b)
{
  double nm[3], d[3], dot;

  dv3_diff(d, x, a);
  dv3_normalize(dv3_diff(nm, a, b));
  dot = dv3_dot(d, nm);
  return dv3_norm(dv3_sinc(d, nm, -dot));
}

/* vertical distance from `x' to the line extended by `a' and `b' */
INLINE real rv3_vdist(const real *x, const real *a, const real *b)
{
  real nm[3], d[3], dot;

  rv3_diff(d, x, a);
  rv3_normalize(rv3_diff(nm, a, b));
  dot = rv3_dot(d, nm);
  return rv3_norm(rv3_sinc(d, nm, -dot));
}


/* signed distance from x to the plane extended by a, b, c */
INLINE float fv3_vpdist(const float *x, const float *a, const float *b, const float *c)
{
  float u[3], v[3], m[3];

  fv3_diff(u, b, a);
  fv3_diff(v, c, b);
  fv3_normalize(fv3_cross(m, u, v));
  fv3_diff(u, x, a);
  return fv3_dot(u, m);
}

/* signed distance from x to the plane extended by a, b, c */
INLINE double dv3_vpdist(const double *x, const double *a, const double *b, const double *c)
{
  double u[3], v[3], m[3];

  dv3_diff(u, b, a);
  dv3_diff(v, c, b);
  dv3_normalize(dv3_cross(m, u, v));
  dv3_diff(u, x, a);
  return dv3_dot(u, m);
}

/* signed distance from x to the plane extended by a, b, c */
INLINE real rv3_vpdist(const real *x, const real *a, const real *b, const real *c)
{
  real u[3], v[3], m[3];

  rv3_diff(u, b, a);
  rv3_diff(v, c, b);
  rv3_normalize(rv3_cross(m, u, v));
  rv3_diff(u, x, a);
  return rv3_dot(u, m);
}


/* light weight dihedral */
INLINE float fv3_dih(const float *xi, const float *xj, const float *xk, const float *xl,
    float * RESTRICT gi, float * RESTRICT gj, float * RESTRICT gk, float * RESTRICT gl)
{
  float tol, phi, cosphi = 1.f;
  float nxkj, nxkj2, m2, n2;
  float xij[3], xkj[3], xkl[3], uvec[3], vvec[3], svec[3];
  float m[3], n[3]; /* the planar vector of xij x xkj,  and xkj x xkj */

  fv3_diff(xij, xi, xj);
  fv3_diff(xkj, xk, xj);
  fv3_diff(xkl, xk, xl);
  nxkj2 = fv3_sqr(xkj);
  nxkj = (float) sqrt(nxkj2);
  tol = (sizeof(float) == sizeof(float)) ? nxkj2 * 6e-8f : nxkj2 * 1e-16f;

  fv3_cross(m, xij, xkj);
  m2 = fv3_sqr(m);
  fv3_cross(n, xkj, xkl);
  n2 = fv3_sqr(n);
  if (m2 > tol && n2 > tol) {
    cosphi = fv3_dot(m, n);
    cosphi /= (float) sqrt(m2 * n2);
    if (cosphi >= 1.f) cosphi = 1.f;
    else if (cosphi < -1.f) cosphi = -1.f;
  }
  phi = (float) acos(cosphi);
  if (fv3_dot(n, xij) < 0.0) phi = -phi;

  /* optionally calculate the gradient */
  if (gi != NULL) {
    if (m2 > tol && n2 > tol) {
      fv3_smul2(gi, m, nxkj/m2);
      fv3_smul2(gl, n, -nxkj/n2);
      fv3_smul2(uvec, gi, fv3_dot(xij, xkj)/nxkj2);
      fv3_smul2(vvec, gl, fv3_dot(xkl, xkj)/nxkj2);
      fv3_diff(svec, uvec, vvec);
      fv3_diff(gj, svec, gi);
      fv3_nadd(gk, svec, gl);
    } else { /* clear the gradients */
      fv3_zero(gi);
      fv3_zero(gj);
      fv3_zero(gk);
      fv3_zero(gl);
    }
  }
  return phi;
}

/* light weight dihedral */
INLINE double dv3_dih(const double *xi, const double *xj, const double *xk, const double *xl,
    double * RESTRICT gi, double * RESTRICT gj, double * RESTRICT gk, double * RESTRICT gl)
{
  double tol, phi, cosphi = 1.f;
  double nxkj, nxkj2, m2, n2;
  double xij[3], xkj[3], xkl[3], uvec[3], vvec[3], svec[3];
  double m[3], n[3]; /* the planar vector of xij x xkj,  and xkj x xkj */

  dv3_diff(xij, xi, xj);
  dv3_diff(xkj, xk, xj);
  dv3_diff(xkl, xk, xl);
  nxkj2 = dv3_sqr(xkj);
  nxkj = (double) sqrt(nxkj2);
  tol = (sizeof(double) == sizeof(float)) ? nxkj2 * 6e-8f : nxkj2 * 1e-16f;

  dv3_cross(m, xij, xkj);
  m2 = dv3_sqr(m);
  dv3_cross(n, xkj, xkl);
  n2 = dv3_sqr(n);
  if (m2 > tol && n2 > tol) {
    cosphi = dv3_dot(m, n);
    cosphi /= (double) sqrt(m2 * n2);
    if (cosphi >= 1.f) cosphi = 1.f;
    else if (cosphi < -1.f) cosphi = -1.f;
  }
  phi = (double) acos(cosphi);
  if (dv3_dot(n, xij) < 0.0) phi = -phi;

  /* optionally calculate the gradient */
  if (gi != NULL) {
    if (m2 > tol && n2 > tol) {
      dv3_smul2(gi, m, nxkj/m2);
      dv3_smul2(gl, n, -nxkj/n2);
      dv3_smul2(uvec, gi, dv3_dot(xij, xkj)/nxkj2);
      dv3_smul2(vvec, gl, dv3_dot(xkl, xkj)/nxkj2);
      dv3_diff(svec, uvec, vvec);
      dv3_diff(gj, svec, gi);
      dv3_nadd(gk, svec, gl);
    } else { /* clear the gradients */
      dv3_zero(gi);
      dv3_zero(gj);
      dv3_zero(gk);
      dv3_zero(gl);
    }
  }
  return phi;
}

/* light weight dihedral */
INLINE real rv3_dih(const real *xi, const real *xj, const real *xk, const real *xl,
    real * RESTRICT gi, real * RESTRICT gj, real * RESTRICT gk, real * RESTRICT gl)
{
  real tol, phi, cosphi = 1.f;
  real nxkj, nxkj2, m2, n2;
  real xij[3], xkj[3], xkl[3], uvec[3], vvec[3], svec[3];
  real m[3], n[3]; /* the planar vector of xij x xkj,  and xkj x xkj */

  rv3_diff(xij, xi, xj);
  rv3_diff(xkj, xk, xj);
  rv3_diff(xkl, xk, xl);
  nxkj2 = rv3_sqr(xkj);
  nxkj = (real) sqrt(nxkj2);
  tol = (sizeof(real) == sizeof(float)) ? nxkj2 * 6e-8f : nxkj2 * 1e-16f;

  rv3_cross(m, xij, xkj);
  m2 = rv3_sqr(m);
  rv3_cross(n, xkj, xkl);
  n2 = rv3_sqr(n);
  if (m2 > tol && n2 > tol) {
    cosphi = rv3_dot(m, n);
    cosphi /= (real) sqrt(m2 * n2);
    if (cosphi >= 1.f) cosphi = 1.f;
    else if (cosphi < -1.f) cosphi = -1.f;
  }
  phi = (real) acos(cosphi);
  if (rv3_dot(n, xij) < 0.0) phi = -phi;

  /* optionally calculate the gradient */
  if (gi != NULL) {
    if (m2 > tol && n2 > tol) {
      rv3_smul2(gi, m, nxkj/m2);
      rv3_smul2(gl, n, -nxkj/n2);
      rv3_smul2(uvec, gi, rv3_dot(xij, xkj)/nxkj2);
      rv3_smul2(vvec, gl, rv3_dot(xkl, xkj)/nxkj2);
      rv3_diff(svec, uvec, vvec);
      rv3_diff(gj, svec, gi);
      rv3_nadd(gk, svec, gl);
    } else { /* clear the gradients */
      rv3_zero(gi);
      rv3_zero(gj);
      rv3_zero(gk);
      rv3_zero(gl);
    }
  }
  return phi;
}


#define fv3_rnd0(v) fv3_rnd(v, 0, 1)

#define dv3_rnd0(v) dv3_rnd(v, 0, 1)

#define rv3_rnd0(v) rv3_rnd(v, 0, 1)

/* uniformly distributed random vector [a, b) */
INLINE float *fv3_rnd(float *v, float a, float b)
{
  b -= a;
  v[0] = a + b * (float) rnd0();
  v[1] = a + b * (float) rnd0();
  v[2] = a + b * (float) rnd0();
  return v;
}

/* uniformly distributed random vector [a, b) */
INLINE double *dv3_rnd(double *v, double a, double b)
{
  b -= a;
  v[0] = a + b * (double) rnd0();
  v[1] = a + b * (double) rnd0();
  v[2] = a + b * (double) rnd0();
  return v;
}

/* uniformly distributed random vector [a, b) */
INLINE real *rv3_rnd(real *v, real a, real b)
{
  b -= a;
  v[0] = a + b * (real) rnd0();
  v[1] = a + b * (real) rnd0();
  v[2] = a + b * (real) rnd0();
  return v;
}


/* displace `x0' by a random vector in [-a, a)^3 */
INLINE real *rv3_rnddisp(real * RESTRICT x, const real *x0, real a)
{
  x[0] = x0[0] + (real) rnd(-a, a);
  x[1] = x0[1] + (real) rnd(-a, a);
  x[2] = x0[2] + (real) rnd(-a, a);
  return x;
}


/* normally distributed random vector */
INLINE float *fv3_grand0(float *v)
{
  v[0] = (float) grand0();
  v[1] = (float) grand0();
  v[2] = (float) grand0();
  return v;
}

/* normally distributed random vector */
INLINE double *dv3_grand0(double *v)
{
  v[0] = (double) grand0();
  v[1] = (double) grand0();
  v[2] = (double) grand0();
  return v;
}

/* normally distributed random vector */
INLINE real *rv3_grand0(real *v)
{
  v[0] = (real) grand0();
  v[1] = (real) grand0();
  v[2] = (real) grand0();
  return v;
}


/* normally distributed random vector */
INLINE float *fv3_grand(float *v, float c, float r)
{
  v[0] = c + r * (float) grand0();
  v[1] = c + r * (float) grand0();
  v[2] = c + r * (float) grand0();
  return v;
}

/* normally distributed random vector */
INLINE double *dv3_grand(double *v, double c, double r)
{
  v[0] = c + r * (double) grand0();
  v[1] = c + r * (double) grand0();
  v[2] = c + r * (double) grand0();
  return v;
}

/* normally distributed random vector */
INLINE real *rv3_grand(real *v, real c, real r)
{
  v[0] = c + r * (real) grand0();
  v[1] = c + r * (real) grand0();
  v[2] = c + r * (real) grand0();
  return v;
}


/* displace `x0' by a normally-distributed random vector */
INLINE real *rv3_granddisp(real * RESTRICT x, const real *x0, real a)
{
  x[0] = x0[0] + (real) grand0() * a;
  x[1] = x0[1] + (real) grand0() * a;
  x[2] = x0[2] + (real) grand0() * a;
  return x;
}


/* randomly oriented vector on the sphere of radius r */
#define fv3_rnddir(v, r) fv3_smul(fv3_rnddir0(v), r)

/* randomly oriented vector on the sphere of radius r */
#define dv3_rnddir(v, r) dv3_smul(dv3_rnddir0(v), r)

/* randomly oriented vector on the sphere of radius r */
#define rv3_rnddir(v, r) rv3_smul(rv3_rnddir0(v), r)

/* randomly oriented vector on the unit sphere */
INLINE float *fv3_rnddir0(float *v)
{
  double a, b, sq, s;

  do { /* projection on the x-y plane */
    a = 2 * rnd0() - 1;
    b = 2 * rnd0() - 1;
    sq = a * a + b * b;
  } while (sq >= 1); /* avoid sin() and cos() */

  s = 2. * sqrt(1 - sq);
  return fv3_make(v, (float) (a * s), (float) (b * s), (float) (1 - 2 * sq));
}

/* randomly oriented vector on the unit sphere */
INLINE double *dv3_rnddir0(double *v)
{
  double a, b, sq, s;

  do { /* projection on the x-y plane */
    a = 2 * rnd0() - 1;
    b = 2 * rnd0() - 1;
    sq = a * a + b * b;
  } while (sq >= 1); /* avoid sin() and cos() */

  s = 2. * sqrt(1 - sq);
  return dv3_make(v, (double) (a * s), (double) (b * s), (double) (1 - 2 * sq));
}

/* randomly oriented vector on the unit sphere */
INLINE real *rv3_rnddir0(real *v)
{
  double a, b, sq, s;

  do { /* projection on the x-y plane */
    a = 2 * rnd0() - 1;
    b = 2 * rnd0() - 1;
    sq = a * a + b * b;
  } while (sq >= 1); /* avoid sin() and cos() */

  s = 2. * sqrt(1 - sq);
  return rv3_make(v, (real) (a * s), (real) (b * s), (real) (1 - 2 * sq));
}


/* randomly orientied vector within the sphere of radius `r' */
#define fv3_rndball(v, r) fv3_smul(fv3_rndball0(v), r)

/* randomly orientied vector within the sphere of radius `r' */
#define dv3_rndball(v, r) dv3_smul(dv3_rndball0(v), r)

/* randomly orientied vector within the sphere of radius `r' */
#define rv3_rndball(v, r) rv3_smul(rv3_rndball0(v), r)

/* randomly orientied vector within the unit sphere */
INLINE float *fv3_rndball0(float *v)
{
  do {
    fv3_rnd(v, -1, 1);
  } while (fv3_sqr(v) >= 1);
  return v;
}

/* randomly orientied vector within the unit sphere */
INLINE double *dv3_rndball0(double *v)
{
  do {
    dv3_rnd(v, -1, 1);
  } while (dv3_sqr(v) >= 1);
  return v;
}

/* randomly orientied vector within the unit sphere */
INLINE real *rv3_rndball0(real *v)
{
  do {
    rv3_rnd(v, -1, 1);
  } while (rv3_sqr(v) >= 1);
  return v;
}


/* heavy-weight dihedral calculation */

/* structure for dihedral calculation */
typedef struct {
  int  szreal; /* sizeof real */
  int  pad0;   /* padding */
  real phi; /* cis is zero, clockwise positive */
  real cos; /* cos(m, n) */
  real sgn; /* (0, pi) is 1.0, otherwise -1.0 */

  real g2;
  real g[4][3]; /* gradient for each particle */

  real div; /* the divergence */
  real d4ij, d4ik, d4jj, d4jk, d4jl, d4kk, d4kl;

  unsigned int flags; /* a copy of flags used */
  int t1, t2, t3; /* GROMACS shift indices */
  const void *pbcdata; /* periodic boundary condition descriptor */
  int (*pbcdiff)(real *xij, const real *xi, const real *xj, const void *);
    /* function to handle pbc, use GROMACS convention: the last is the difference */
} dihcalc_t;

#define DIH_GRAD  0x0001
#define DIH_DIV   0x0002
/*#define DIH_CONJ  0x0004 */
/*#define DIH_PROJ  0x0008 */
#define DIH_I     0x0010
#define DIH_J     0x0020
#define DIH_K     0x0040
#define DIH_L     0x0080
#define DIH_FOUR  (DIH_I|DIH_J|DIH_K|DIH_L)
/* the four atoms involved */
#define DIH_ALL   (DIH_FOUR|DIH_GRAD|DIH_DIV)
/* only I and L, so no divergence */
#define DIH_ENDS  (DIH_GRAD|DIH_I|DIH_L)
/* polymer convention, 0 == trans */
#define DIH_POLYMER 0x1000

/* compute the dihedral angle, gradient g and divergence
 * of the field v conjugate to gradient (v.g = 1)
 *
 * if dih is NULL and flags is 0, only the dihedral angle is computed
 * optionally, the gradient and divergent are computed with flags
 * DIH_GRAD and DIH_DIV respectively (the latter requires the former)
 * routine for treating periodic boundary condition can specified by
 * assigning a function pointer to dih->pbcdiff with additional information
 * to dih->pbcdata entry, otherwise, dih->pbcdiff *must* be set to NULL
 * the procedure of computing similar to that in GROMACS
 *
 * the conjugate field v = g / [g.g], such that v.g = 1, where g = grad(phi)
 * it is not explicitly computed (since it lies along the gradient)
 * however, the denominator is saved to dih->g2
 * the calculation of the divergence of v is simplified by the fact that
 * the divergence of the gradient is always zero, i.e., div.g = 0, thus
 *     div.v = -2 [ g.(gg).g ] /[g.g]^2,
 * where gg = grad grad(phi) involves: d4ij, d4ik, ..., d4kl.
 *
 * both g and div can be computed for a subset of the four involving atoms
 * by passing `flags' a combination of DIH_I, DIH_J, DIH_K and DIH_L
 * however, *all* moments d4ij, d4ik, ... , d4kl are always calculated
 * though only the involved ones are used to produce the divergence. */
#define rv3_calcdihv(dih, x, idx, flags) \
  rv3_calcdih(dih, x[*(idx)], x[*(idx+1)], x[*(idx+2)], x[*(idx+3)], flags)

INLINE real rv3_calcdih(dihcalc_t *dih,
    const real *xi, const real *xj, const real *xk, const real *xl,
    unsigned int flags)
{
  real dot, scl, tol, vol, phi, sgn, cosphi;
  real nxkj, nxkj2, m2, n2;
  real xij[3], xkj[3], xkl[3];
  real m[3], n[3]; /* the planar vector of xij x xkj,  and xkj x xkj */

  if (dih != NULL && sizeof(real) != dih->szreal) {
    fprintf(stderr, "size real should be %d instead of %d\n",
        (int) sizeof(real), (int) dih->szreal);
    exit(1);
  }
  if (dih != NULL && dih->pbcdiff != NULL) { /* handle pbc */
    dih->t1 = (*dih->pbcdiff)(xij, xi, xj, dih->pbcdata);
    dih->t2 = (*dih->pbcdiff)(xkj, xk, xj, dih->pbcdata);
    dih->t3 = (*dih->pbcdiff)(xkl, xk, xl, dih->pbcdata);
  } else {
    rv3_diff(xij, xi, xj);
    rv3_diff(xkj, xk, xj);
    rv3_diff(xkl, xk, xl);
  }
  nxkj2 = rv3_sqr(xkj);
  nxkj = (real) sqrt(nxkj2);
  if (sizeof(real) <= 4)
    tol = nxkj2 * 6e-8f;
  else
    tol = nxkj2 * 1e-16f;

  rv3_cross(m, xij, xkj);
  m2 = rv3_sqr(m);
  rv3_cross(n, xkj, xkl);
  n2 = rv3_sqr(n);
  if (m2 > tol && n2 > tol) {
    scl = (real) sqrt(m2*n2);
    dot = rv3_dot(m, n);
    cosphi = dot/scl;
    if (cosphi >= (real) 1.) cosphi = 1.f;
    else if (cosphi < (real)(-1.)) cosphi = -1.f;
  } else {
    cosphi = 1.f;
  }
  phi = (real) acos(cosphi);
  vol = rv3_dot(n, xij);
  sgn = (vol > 0.0f) ? 1.0f : -1.0f;
  phi *= sgn;
  if (flags & DIH_POLYMER) { /* switch to polymer convention, 0 == trans */
    if (phi > 0) phi -= M_PI;
    else phi += M_PI;
  }
  if (dih != NULL) {
    dih->phi = phi;
    dih->sgn = sgn;
    dih->cos = cosphi;
    dih->flags = flags;
  }

  /* optionally calculate the gradient */
  if (dih != NULL && (flags & (DIH_GRAD|DIH_DIV))) { /* divergence implies gradient */
    /* clear divergence */
    dih->div = dih->d4ij = dih->d4ik = dih->d4jj = dih->d4jk = dih->d4jl = dih->d4kk = dih->d4kl = 0.0f;

    /* calculate the gradient of the dihedral */
    if (m2 > tol && n2 > tol) {
      real uvec[3], vvec[3], svec[3], g2all, invg2;
      unsigned doi, doj, dok, dol;

      doi = (flags & DIH_I);
      doj = (flags & DIH_J);
      dok = (flags & DIH_K);
      dol = (flags & DIH_L);

      rv3_smul2(dih->g[0], m,  nxkj/m2);
      rv3_smul2(dih->g[3], n, -nxkj/n2);

      rv3_smul2(uvec, dih->g[0], rv3_dot(xij, xkj)/nxkj2);
      rv3_smul2(vvec, dih->g[3], rv3_dot(xkl, xkj)/nxkj2);
      rv3_diff(svec, uvec, vvec);

      rv3_diff(dih->g[1], svec, dih->g[0]);
      rv3_nadd(dih->g[2], svec, dih->g[3]);

      g2all = 0.0f;
      if (doi) g2all += rv3_sqr(dih->g[0]);
      if (doj) g2all += rv3_sqr(dih->g[1]);
      if (dok) g2all += rv3_sqr(dih->g[2]);
      if (dol) g2all += rv3_sqr(dih->g[3]);
      dih->g2 = g2all;
      invg2 = 1.0f/g2all;

      if (flags & DIH_DIV) {
        real xkjv[3], nvv[3], mvv[3];
        real gjxij, gjmvv, gjxkl, gjnvv;
        real gkmvv, gknvv, gkxkl, gkxij;
        real kivkj, klvkj, ljvkj, ijvkj;
        real kikl, ijlj;
        real tmp1, tmp2;
        real sinmn;

        rv3_smul2(mvv, m, 1.0f/m2);
        rv3_smul2(nvv, n, 1.0f/n2);
        rv3_smul2(xkjv, xkj, 1.0f/nxkj);

        sinmn = vol*nxkj/(m2*n2);

        ijvkj = rv3_dot(xij, xkjv);
        kivkj = nxkj-ijvkj;
        klvkj = rv3_dot(xkl, xkjv);
        ljvkj = nxkj-klvkj;

        ijlj = ijvkj*ljvkj;
        kikl = kivkj*klvkj;

        gjxij = rv3_dot(dih->g[1], xij);
        gjxkl = rv3_dot(dih->g[1], xkl);
        gjmvv = rv3_dot(dih->g[1], mvv);
        gjnvv = rv3_dot(dih->g[1], nvv);
        gkxij = rv3_dot(dih->g[2], xij);
        gkxkl = rv3_dot(dih->g[2], xkl);
        gkmvv = rv3_dot(dih->g[2], mvv);
        gknvv = rv3_dot(dih->g[2], nvv);

        tmp1 = nxkj2*sinmn;
        tmp2 = tmp1/m2;
        dih->d4ij = kikl*tmp2;
        dih->d4ik = ijlj*tmp2;
        tmp2 = tmp1/n2;
        dih->d4jl = kikl*tmp2;
        dih->d4kl = ijlj*tmp2;

        dih->d4jj = -(gjxij*gjmvv+gjxkl*gjnvv)/nxkj
                +2.0f*(kivkj*gjmvv-klvkj*gjnvv)*(-kikl*sinmn);

        dih->d4jk = (gjxij*gkmvv+gjxkl*gknvv)/nxkj
              +(-(gjmvv*ljvkj+gkmvv*klvkj)*(ijvkj*kivkj)
                +(gjnvv*ijvkj+gknvv*kivkj)*(ljvkj*klvkj) )*sinmn;

        dih->d4kk = -(gkxkl*gknvv+gkxij*gkmvv)/nxkj
                +2.0f*(ljvkj*gknvv-ijvkj*gkmvv)*(ijlj*sinmn);

        /* summarize */
        if ((flags & DIH_FOUR) == DIH_FOUR) {
          tmp1 = dih->d4jj + dih->d4kk;
          tmp2 = dih->d4ij + dih->d4ik+dih->d4jk+dih->d4jl+dih->d4kl;
        } else {
          tmp1 = tmp2 = 0.0f;
          if (doj) { tmp1 += dih->d4jj; }
          if (dok) { tmp1 += dih->d4kk; }
          if (doi && doj) tmp2 += dih->d4ij;
          if (doi && dok) tmp2 += dih->d4ik;
          if (doj && dok) tmp2 += dih->d4jk;
          if (doj && dol) tmp2 += dih->d4jl;
          if (dok && dol) tmp2 += dih->d4kl;
        }
        dih->div = -2.0f*(tmp1+2.0f*tmp2)*(invg2*invg2);
      } /* do divergence */

    } else { /* clear the gradients */
      int j;
      for (j = 0; j < 4; j++)
        rv3_zero(dih->g[j]);
    }
  }
  return phi;
}

/* Gauss integral and solid angles */


/* compute the trihedral angle
 * http://planetmath.org/encyclopedia/TrihedralAngle.html
 * tan(omega/2) = vol/den,
 * where
 * den = r1 (r2.r3) + r2 (r1.r3) + r3 (r1.r2) + r1 r2 r3
 */
INLINE real rv3_solidang(const real *v1, const real *v2, const real *v3,
  real * RESTRICT g1, real * RESTRICT g2, real * RESTRICT g3)
{
  real vc[3];
  real r1, r2, r3;
  real ang, vol, den, v2d2, scnum, scden;
  const real eps = (real) 1e-10;

  r1 = rv3_norm(v1);
  r2 = rv3_norm(v2);
  r3 = rv3_norm(v3);
  if (g1 != NULL) rv3_zero(g1);
  if (g2 != NULL) rv3_zero(g2);
  if (g3 != NULL) rv3_zero(g3);

  /* at least two points coincide */
  if (r1 < eps || r2 < eps || r3 < eps) return 0;

  /* the numerator */
  vol = rv3_dot(v3, rv3_cross(vc, v1, v2));

  /* the denominator */
  den  = r1 * rv3_dot(v3, v2);
  den += r2 * rv3_dot(v3, v1);
  den += r3 * (r1*r2 + rv3_dot(v1, v2));

  v2d2 = vol*vol + den*den;

  /* this happens if two vector are opposite
     the solid angle could be evolved from  +/-pi or 0
     but unfortunately, we don't know which one */
  if (v2d2 < eps) return 0;

  if (g1 != NULL && g2 != NULL && g3 != NULL) { /* compute the gradients */
    scnum =  2.f*den/v2d2;
    scden = -2.f*vol/v2d2;

    /* cross products */
    rv3_smul(rv3_cross(g3, v1, v2), scnum);
    rv3_smul(rv3_cross(g1, v2, v3), scnum);
    rv3_smul(rv3_cross(g2, v3, v1), scnum);

    /* compute the contributions to the denominator */
    rv3_lincomb2(vc, v2, v3, r3, r2);
    rv3_sinc(vc, v1, (rv3_dot(v2, v3) + r2*r3)/r1);
    rv3_sinc(g1, vc, scden);

    rv3_lincomb2(vc, v1, v3, r3, r1);
    rv3_sinc(vc, v2, (rv3_dot(v1, v3) + r1*r3)/r2);
    rv3_sinc(g2, vc, scden);

    rv3_lincomb2(vc, v1, v2, r2, r1);
    rv3_sinc(vc, v3, (rv3_dot(v1, v2) + r1*r2)/r3);
    rv3_sinc(g3, vc, scden);
  }

  /* calculate tan(omega/2) */
  ang = (real) atan2(vol, den); /* 0 to pi */
  return 2*ang;
}


/* compute the Gauss integral for the two line segments, with gradients
 *    int_i \int_j (dri X drj).rij/ rij^3,
 * over two line segments rip - ri, rjp - rj, and rij = ri - rj
 * (-2 pi, 2 pi)
 * Note the sign is opposite to that of the dihedral */
INLINE real rv3_solidang2g(const real *ri, const real *rip,
    const real *rj, const real *rjp,
    real * RESTRICT gi, real * RESTRICT gip,
    real * RESTRICT gj, real * RESTRICT gjp)
{
  rv3_t v0, v1, v2, v3, g0, g1, g2, g3, g4, g5;
  real ang1, ang2;

  rv3_diff(v0, ri, rj);
  rv3_diff(v1, ri, rjp);
  rv3_diff(v2, rip, rj);
  rv3_diff(v3, rip, rjp);

  ang1 = rv3_solidang(v0, v1, v2, g0, g1, g2);
  ang2 = rv3_solidang(v2, v1, v3, g3, g4, g5);

  rv3_inc(rv3_inc(rv3_copy(gi,  g0), g1), g4);
  rv3_inc(rv3_inc(rv3_copy(gip, g2), g3), g5);
  rv3_neg(rv3_inc(rv3_inc(rv3_copy(gj,  g0), g2), g3));
  rv3_neg(rv3_inc(rv3_inc(rv3_copy(gjp, g1), g4), g5));

  return ang1 + ang2;
}


/* compute the double integral, old code
 *    \int_i \int_j (dri X drj).rij/ rij^3,
 * over two line segments rip - ri, rjp - rj, and rij = ri - rj
 * (-2 pi, 2 pi)
 * Note the sign is opposite to that of the dihedral */
INLINE real rv3_solidang2(const real *ri, const real *rip,
    const real *rj, const real *rjp)
{
  real v0[3], v1[3], v2[3], v3[3], vc[3];
  double r0, r1, r2, r3;
  double ang, vol, dn1, dn2, dn, tmp;

  r0 = rv3_norm(rv3_diff(v0, ri, rj));
  r1 = rv3_norm(rv3_diff(v1, ri, rjp));
  r2 = rv3_norm(rv3_diff(v2, rip, rj));
  r3 = rv3_norm(rv3_diff(v3, rip, rjp));

  /* avoid coplanar vectors */
  vol = rv3_dot(v0, rv3_cross(vc, v1, v2));
  if (fabs(vol) < 1e-28) return 0;

  /* calculate the denominator */
  tmp = r1*r2 + rv3_dot(v1, v2);
  /* http://planetmath.org/encyclopedia/TrihedralAngle.html
   * tan(omega/2) = vol/den,
   * where
   * den = r1 (r2.r3) + r2 (r1.r3) + r3 (r1.r2) + r1 r2 r3
   * */
  dn1  = r1 * rv3_dot(v0, v2);
  dn1 += r2 * rv3_dot(v0, v1) + r0 * tmp;
  dn2  = r1 * rv3_dot(v3, v2);
  dn2 += r2 * rv3_dot(v3, v1) + r3 * tmp;

  /* calculate tan(omega1/2 + omega2/2) */
  dn = (dn1 + dn2)/(dn1*dn2 - vol*vol);
  ang = atan(fabs(vol) * dn) + (dn < 0 ? M_PI : 0); /* 0 to pi */

  return (real) (vol > 0 ? 2*ang : -2*ang);
}
/* routines for 3x3 matrices */


#ifndef FM3_T
#define FM3_T fm3_t
typedef float fm3_t[3][3];
#endif

#ifndef DM3_T
#define DM3_T dm3_t
typedef double dm3_t[3][3];
#endif

#ifndef RM3_T
#define RM3_T rm3_t
typedef real rm3_t[3][3];
#endif


#define fm3_print(m, nm, fmt, nl) fm3_fprint(stdout, m, nm, fmt, nl)

#define dm3_print(m, nm, fmt, nl) dm3_fprint(stdout, m, nm, fmt, nl)

#define rm3_print(m, nm, fmt, nl) rm3_fprint(stdout, m, nm, fmt, nl)

INLINE void fm3_fprint(FILE *fp, float (*m)[3], const char *nm, const char *fmt, int nl)
{
  int i, j;

  if (nm) fprintf(fp, "%s:%c", nm, nl ? '\n' : ' ');
  for (i = 0; i < 3; i++) {
    for (j = 0; j < 3; j++)
      fprintf(fp, fmt, m[i][j]);
    fprintf(fp, "%s", nl ? "\n" : "; ");
  }
}

INLINE void dm3_fprint(FILE *fp, double (*m)[3], const char *nm, const char *fmt, int nl)
{
  int i, j;

  if (nm) fprintf(fp, "%s:%c", nm, nl ? '\n' : ' ');
  for (i = 0; i < 3; i++) {
    for (j = 0; j < 3; j++)
      fprintf(fp, fmt, m[i][j]);
    fprintf(fp, "%s", nl ? "\n" : "; ");
  }
}

INLINE void rm3_fprint(FILE *fp, real (*m)[3], const char *nm, const char *fmt, int nl)
{
  int i, j;

  if (nm) fprintf(fp, "%s:%c", nm, nl ? '\n' : ' ');
  for (i = 0; i < 3; i++) {
    for (j = 0; j < 3; j++)
      fprintf(fp, fmt, m[i][j]);
    fprintf(fp, "%s", nl ? "\n" : "; ");
  }
}


INLINE fv3_t *fm3_make(float rx[3][3], float a00, float a01, float a02,
    float a10, float a11, float a12, float a20, float a21, float a22)
{
  fv3_make(rx[0], a00, a01, a02);
  fv3_make(rx[1], a10, a11, a12);
  fv3_make(rx[2], a20, a21, a22);
  return rx;
}

INLINE dv3_t *dm3_make(double rx[3][3], double a00, double a01, double a02,
    double a10, double a11, double a12, double a20, double a21, double a22)
{
  dv3_make(rx[0], a00, a01, a02);
  dv3_make(rx[1], a10, a11, a12);
  dv3_make(rx[2], a20, a21, a22);
  return rx;
}

INLINE rv3_t *rm3_make(real rx[3][3], real a00, real a01, real a02,
    real a10, real a11, real a12, real a20, real a21, real a22)
{
  rv3_make(rx[0], a00, a01, a02);
  rv3_make(rx[1], a10, a11, a12);
  rv3_make(rx[2], a20, a21, a22);
  return rx;
}


#define fm3_makem(rx, x) fm3_make(rx, \
    (float) x[0][0], (float) x[0][1], (float) x[0][2], \
    (float) x[1][0], (float) x[1][1], (float) x[1][2], \
    (float) x[2][0], (float) x[2][1], (float) x[2][2])

#define dm3_makem(rx, x) dm3_make(rx, \
    (double) x[0][0], (double) x[0][1], (double) x[0][2], \
    (double) x[1][0], (double) x[1][1], (double) x[1][2], \
    (double) x[2][0], (double) x[2][1], (double) x[2][2])

#define rm3_makem(rx, x) rm3_make(rx, \
    (real) x[0][0], (real) x[0][1], (real) x[0][2], \
    (real) x[1][0], (real) x[1][1], (real) x[1][2], \
    (real) x[2][0], (real) x[2][1], (real) x[2][2])


/* zero matrix */
#define fm3_zero(x) fm3_makem(x, 0, 0, 0, 0, 0, 0, 0, 0, 0)

/* zero matrix */
#define dm3_zero(x) dm3_makem(x, 0, 0, 0, 0, 0, 0, 0, 0, 0)

/* zero matrix */
#define rm3_zero(x) rm3_makem(x, 0, 0, 0, 0, 0, 0, 0, 0, 0)


/* identity matrix */
#define fm3_one(x) fm3_makem(x, 1, 0, 0, 0, 1, 0, 0, 0, 1)

/* identity matrix */
#define dm3_one(x) dm3_makem(x, 1, 0, 0, 0, 1, 0, 0, 0, 1)

/* identity matrix */
#define rm3_one(x) rm3_makem(x, 1, 0, 0, 0, 1, 0, 0, 0, 1)


/* a = b */
INLINE fv3_t *fm3_copy(float a[3][3], float b[3][3])
{
  fv3_copy(a[0], b[0]);
  fv3_copy(a[1], b[1]);
  fv3_copy(a[2], b[2]);
  return a;
}

/* a = b */
INLINE dv3_t *dm3_copy(double a[3][3], double b[3][3])
{
  dv3_copy(a[0], b[0]);
  dv3_copy(a[1], b[1]);
  dv3_copy(a[2], b[2]);
  return a;
}

/* a = b */
INLINE rv3_t *rm3_copy(real a[3][3], real b[3][3])
{
  rv3_copy(a[0], b[0]);
  rv3_copy(a[1], b[1]);
  rv3_copy(a[2], b[2]);
  return a;
}


/* transpose */
INLINE fv3_t *fm3_trans(float a[3][3])
{
  float x;

  x = a[0][1], a[0][1] = a[1][0], a[1][0] = x;
  x = a[0][2], a[0][2] = a[2][0], a[2][0] = x;
  x = a[2][1], a[2][1] = a[1][2], a[1][2] = x;
  return a;
}

/* transpose */
INLINE dv3_t *dm3_trans(double a[3][3])
{
  double x;

  x = a[0][1], a[0][1] = a[1][0], a[1][0] = x;
  x = a[0][2], a[0][2] = a[2][0], a[2][0] = x;
  x = a[2][1], a[2][1] = a[1][2], a[1][2] = x;
  return a;
}

/* transpose */
INLINE rv3_t *rm3_trans(real a[3][3])
{
  real x;

  x = a[0][1], a[0][1] = a[1][0], a[1][0] = x;
  x = a[0][2], a[0][2] = a[2][0], a[2][0] = x;
  x = a[2][1], a[2][1] = a[1][2], a[1][2] = x;
  return a;
}


/* a = u^T v */
INLINE fv3_t *fm3_vtv(float a[3][3], const float *u, const float *v)
{
  fv3_smul2(a[0], v, u[0]);
  fv3_smul2(a[1], v, u[1]);
  fv3_smul2(a[2], v, u[2]);
  return a;
}

/* a = u^T v */
INLINE dv3_t *dm3_vtv(double a[3][3], const double *u, const double *v)
{
  dv3_smul2(a[0], v, u[0]);
  dv3_smul2(a[1], v, u[1]);
  dv3_smul2(a[2], v, u[2]);
  return a;
}

/* a = u^T v */
INLINE rv3_t *rm3_vtv(real a[3][3], const real *u, const real *v)
{
  rv3_smul2(a[0], v, u[0]);
  rv3_smul2(a[1], v, u[1]);
  rv3_smul2(a[2], v, u[2]);
  return a;
}


/* a += b */
INLINE fv3_t *fm3_inc(float a[3][3], float b[3][3])
{
  fv3_inc(a[0], b[0]);
  fv3_inc(a[1], b[1]);
  fv3_inc(a[2], b[2]);
  return a;
}

/* a += b */
INLINE dv3_t *dm3_inc(double a[3][3], double b[3][3])
{
  dv3_inc(a[0], b[0]);
  dv3_inc(a[1], b[1]);
  dv3_inc(a[2], b[2]);
  return a;
}

/* a += b */
INLINE rv3_t *rm3_inc(real a[3][3], real b[3][3])
{
  rv3_inc(a[0], b[0]);
  rv3_inc(a[1], b[1]);
  rv3_inc(a[2], b[2]);
  return a;
}


/* a += b*s */
INLINE fv3_t *fm3_sinc(float a[3][3], float b[3][3], float s)
{
  fv3_sinc(a[0], b[0], s);
  fv3_sinc(a[1], b[1], s);
  fv3_sinc(a[2], b[2], s);
  return a;
}

/* a += b*s */
INLINE dv3_t *dm3_sinc(double a[3][3], double b[3][3], double s)
{
  dv3_sinc(a[0], b[0], s);
  dv3_sinc(a[1], b[1], s);
  dv3_sinc(a[2], b[2], s);
  return a;
}

/* a += b*s */
INLINE rv3_t *rm3_sinc(real a[3][3], real b[3][3], real s)
{
  rv3_sinc(a[0], b[0], s);
  rv3_sinc(a[1], b[1], s);
  rv3_sinc(a[2], b[2], s);
  return a;
}


/* c = a b */
INLINE fv3_t *fm3_mul(float c[3][3], float a[3][3], float b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = a[i][0]*b[0][j] + a[i][1]*b[1][j] + a[i][2]*b[2][j];
  return c;
}

/* c = a b */
INLINE dv3_t *dm3_mul(double c[3][3], double a[3][3], double b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = a[i][0]*b[0][j] + a[i][1]*b[1][j] + a[i][2]*b[2][j];
  return c;
}

/* c = a b */
INLINE rv3_t *rm3_mul(real c[3][3], real a[3][3], real b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = a[i][0]*b[0][j] + a[i][1]*b[1][j] + a[i][2]*b[2][j];
  return c;
}


/* c = a b^T */
INLINE fv3_t *fm3_mult(float c[3][3], float a[3][3], float b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = fv3_dot(a[i], b[j]);
  return c;
}

/* c = a b^T */
INLINE dv3_t *dm3_mult(double c[3][3], double a[3][3], double b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = dv3_dot(a[i], b[j]);
  return c;
}

/* c = a b^T */
INLINE rv3_t *rm3_mult(real c[3][3], real a[3][3], real b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = rv3_dot(a[i], b[j]);
  return c;
}


/* c = a^T b */
INLINE fv3_t *fm3_tmul(float c[3][3], float a[3][3], float b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = a[0][i]*b[0][j] + a[1][i]*b[1][j] + a[2][i]*b[2][j];
  return c;
}

/* c = a^T b */
INLINE dv3_t *dm3_tmul(double c[3][3], double a[3][3], double b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = a[0][i]*b[0][j] + a[1][i]*b[1][j] + a[2][i]*b[2][j];
  return c;
}

/* c = a^T b */
INLINE rv3_t *rm3_tmul(real c[3][3], real a[3][3], real b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = a[0][i]*b[0][j] + a[1][i]*b[1][j] + a[2][i]*b[2][j];
  return c;
}


/* c = a v */
INLINE float *fm3_mulvec(float * RESTRICT c, float a[3][3], const float *v)
{
  c[0] = fv3_dot(a[0], v);
  c[1] = fv3_dot(a[1], v);
  c[2] = fv3_dot(a[2], v);
  return c;
}

/* c = a v */
INLINE double *dm3_mulvec(double * RESTRICT c, double a[3][3], const double *v)
{
  c[0] = dv3_dot(a[0], v);
  c[1] = dv3_dot(a[1], v);
  c[2] = dv3_dot(a[2], v);
  return c;
}

/* c = a v */
INLINE real *rm3_mulvec(real * RESTRICT c, real a[3][3], const real *v)
{
  c[0] = rv3_dot(a[0], v);
  c[1] = rv3_dot(a[1], v);
  c[2] = rv3_dot(a[2], v);
  return c;
}


/* c = a^T v */
INLINE float *fm3_tmulvec(float * RESTRICT c, float a[3][3], const float *v)
{
  c[0] = a[0][0]*v[0] + a[1][0]*v[1] + a[2][0]*v[2];
  c[1] = a[0][1]*v[0] + a[1][1]*v[1] + a[2][1]*v[2];
  c[2] = a[0][2]*v[0] + a[1][2]*v[1] + a[2][2]*v[2];
  return c;
}

/* c = a^T v */
INLINE double *dm3_tmulvec(double * RESTRICT c, double a[3][3], const double *v)
{
  c[0] = a[0][0]*v[0] + a[1][0]*v[1] + a[2][0]*v[2];
  c[1] = a[0][1]*v[0] + a[1][1]*v[1] + a[2][1]*v[2];
  c[2] = a[0][2]*v[0] + a[1][2]*v[1] + a[2][2]*v[2];
  return c;
}

/* c = a^T v */
INLINE real *rm3_tmulvec(real * RESTRICT c, real a[3][3], const real *v)
{
  c[0] = a[0][0]*v[0] + a[1][0]*v[1] + a[2][0]*v[2];
  c[1] = a[0][1]*v[0] + a[1][1]*v[1] + a[2][1]*v[2];
  c[2] = a[0][2]*v[0] + a[1][2]*v[1] + a[2][2]*v[2];
  return c;
}


/* return 0 rotation matrix around v for ang */
INLINE fv3_t *fm3_mkrot(float m[3][3], const float *v, float ang)
{
  float c = (float) cos(ang), s = (float) sin(ang), nc, n[3];

  fv3_copy(n, v);
  fv3_normalize(n);
  nc = 1 - c;
  m[0][0] = n[0] * n[0] * nc + c;
  m[0][1] = n[0] * n[1] * nc - n[2] * s;
  m[0][2] = n[0] * n[2] * nc + n[1] * s;
  m[1][0] = n[1] * n[0] * nc + n[2] * s;
  m[1][1] = n[1] * n[1] * nc + c;
  m[1][2] = n[1] * n[2] * nc - n[0] * s;
  m[2][0] = n[2] * n[0] * nc - n[1] * s;
  m[2][1] = n[2] * n[1] * nc + n[0] * s;
  m[2][2] = n[2] * n[2] * nc + c;
  return m;
}

/* return 0 rotation matrix around v for ang */
INLINE dv3_t *dm3_mkrot(double m[3][3], const double *v, double ang)
{
  double c = (double) cos(ang), s = (double) sin(ang), nc, n[3];

  dv3_copy(n, v);
  dv3_normalize(n);
  nc = 1 - c;
  m[0][0] = n[0] * n[0] * nc + c;
  m[0][1] = n[0] * n[1] * nc - n[2] * s;
  m[0][2] = n[0] * n[2] * nc + n[1] * s;
  m[1][0] = n[1] * n[0] * nc + n[2] * s;
  m[1][1] = n[1] * n[1] * nc + c;
  m[1][2] = n[1] * n[2] * nc - n[0] * s;
  m[2][0] = n[2] * n[0] * nc - n[1] * s;
  m[2][1] = n[2] * n[1] * nc + n[0] * s;
  m[2][2] = n[2] * n[2] * nc + c;
  return m;
}

/* return 0 rotation matrix around v for ang */
INLINE rv3_t *rm3_mkrot(real m[3][3], const real *v, real ang)
{
  real c = (real) cos(ang), s = (real) sin(ang), nc, n[3];

  rv3_copy(n, v);
  rv3_normalize(n);
  nc = 1 - c;
  m[0][0] = n[0] * n[0] * nc + c;
  m[0][1] = n[0] * n[1] * nc - n[2] * s;
  m[0][2] = n[0] * n[2] * nc + n[1] * s;
  m[1][0] = n[1] * n[0] * nc + n[2] * s;
  m[1][1] = n[1] * n[1] * nc + c;
  m[1][2] = n[1] * n[2] * nc - n[0] * s;
  m[2][0] = n[2] * n[0] * nc - n[1] * s;
  m[2][1] = n[2] * n[1] * nc + n[0] * s;
  m[2][2] = n[2] * n[2] * nc + c;
  return m;
}


/* rotate `x' around `u' by `ang', save the result to `y' */
INLINE float *fv3_rot(float * RESTRICT y, const float *x, const float *u, float ang)
{
  float m[3][3];

  fm3_mkrot(m, u, ang);
  fm3_mulvec(y, m, x);
  return y;
}

/* rotate `x' around `u' by `ang', save the result to `y' */
INLINE double *dv3_rot(double * RESTRICT y, const double *x, const double *u, double ang)
{
  double m[3][3];

  dm3_mkrot(m, u, ang);
  dm3_mulvec(y, m, x);
  return y;
}

/* rotate `x' around `u' by `ang', save the result to `y' */
INLINE real *rv3_rot(real * RESTRICT y, const real *x, const real *u, real ang)
{
  real m[3][3];

  rm3_mkrot(m, u, ang);
  rm3_mulvec(y, m, x);
  return y;
}


/* determinant of a 3x3 matrix */
INLINE float fm3_det(float a[3][3])
{
  return a[0][0] * (a[1][1]*a[2][2] - a[1][2]*a[2][1])
      +  a[0][1] * (a[1][2]*a[2][0] - a[1][0]*a[2][2])
      +  a[0][2] * (a[1][0]*a[2][1] - a[1][1]*a[2][0]);
}

/* determinant of a 3x3 matrix */
INLINE double dm3_det(double a[3][3])
{
  return a[0][0] * (a[1][1]*a[2][2] - a[1][2]*a[2][1])
      +  a[0][1] * (a[1][2]*a[2][0] - a[1][0]*a[2][2])
      +  a[0][2] * (a[1][0]*a[2][1] - a[1][1]*a[2][0]);
}

/* determinant of a 3x3 matrix */
INLINE real rm3_det(real a[3][3])
{
  return a[0][0] * (a[1][1]*a[2][2] - a[1][2]*a[2][1])
      +  a[0][1] * (a[1][2]*a[2][0] - a[1][0]*a[2][2])
      +  a[0][2] * (a[1][0]*a[2][1] - a[1][1]*a[2][0]);
}


/* inverse matrix b = a^(-1) */
INLINE fv3_t *fm3_inv(float b[3][3], float a[3][3])
{
  float d00, d01, d02, detm;
  d00 = a[1][1]*a[2][2] - a[1][2]*a[2][1];
  d01 = a[1][2]*a[2][0] - a[1][0]*a[2][2];
  d02 = a[1][0]*a[2][1] - a[1][1]*a[2][0];
  detm = a[0][0]*d00 + a[0][1]*d01 + a[0][2]*d02;
  if (fabs(detm) < 1e-30) detm = (detm < 0) ? -1e-30f: 1e-30f;
  b[0][0] = d00/detm;
  b[0][1] = (a[2][1]*a[0][2] - a[0][1]*a[2][2])/detm;
  b[0][2] = (a[0][1]*a[1][2] - a[0][2]*a[1][1])/detm;
  b[1][0] = d01/detm;
  b[1][1] = (a[2][2]*a[0][0] - a[2][0]*a[0][2])/detm;
  b[1][2] = (a[0][2]*a[1][0] - a[1][2]*a[0][0])/detm;
  b[2][0] = d02/detm;
  b[2][1] = (a[2][0]*a[0][1] - a[2][1]*a[0][0])/detm;
  b[2][2] = (a[0][0]*a[1][1] - a[0][1]*a[1][0])/detm;
  return b;
}

/* inverse matrix b = a^(-1) */
INLINE dv3_t *dm3_inv(double b[3][3], double a[3][3])
{
  double d00, d01, d02, detm;
  d00 = a[1][1]*a[2][2] - a[1][2]*a[2][1];
  d01 = a[1][2]*a[2][0] - a[1][0]*a[2][2];
  d02 = a[1][0]*a[2][1] - a[1][1]*a[2][0];
  detm = a[0][0]*d00 + a[0][1]*d01 + a[0][2]*d02;
  if (fabs(detm) < 1e-30) detm = (detm < 0) ? -1e-30f: 1e-30f;
  b[0][0] = d00/detm;
  b[0][1] = (a[2][1]*a[0][2] - a[0][1]*a[2][2])/detm;
  b[0][2] = (a[0][1]*a[1][2] - a[0][2]*a[1][1])/detm;
  b[1][0] = d01/detm;
  b[1][1] = (a[2][2]*a[0][0] - a[2][0]*a[0][2])/detm;
  b[1][2] = (a[0][2]*a[1][0] - a[1][2]*a[0][0])/detm;
  b[2][0] = d02/detm;
  b[2][1] = (a[2][0]*a[0][1] - a[2][1]*a[0][0])/detm;
  b[2][2] = (a[0][0]*a[1][1] - a[0][1]*a[1][0])/detm;
  return b;
}

/* inverse matrix b = a^(-1) */
INLINE rv3_t *rm3_inv(real b[3][3], real a[3][3])
{
  real d00, d01, d02, detm;
  d00 = a[1][1]*a[2][2] - a[1][2]*a[2][1];
  d01 = a[1][2]*a[2][0] - a[1][0]*a[2][2];
  d02 = a[1][0]*a[2][1] - a[1][1]*a[2][0];
  detm = a[0][0]*d00 + a[0][1]*d01 + a[0][2]*d02;
  if (fabs(detm) < 1e-30) detm = (detm < 0) ? -1e-30f: 1e-30f;
  b[0][0] = d00/detm;
  b[0][1] = (a[2][1]*a[0][2] - a[0][1]*a[2][2])/detm;
  b[0][2] = (a[0][1]*a[1][2] - a[0][2]*a[1][1])/detm;
  b[1][0] = d01/detm;
  b[1][1] = (a[2][2]*a[0][0] - a[2][0]*a[0][2])/detm;
  b[1][2] = (a[0][2]*a[1][0] - a[1][2]*a[0][0])/detm;
  b[2][0] = d02/detm;
  b[2][1] = (a[2][0]*a[0][1] - a[2][1]*a[0][0])/detm;
  b[2][2] = (a[0][0]*a[1][1] - a[0][1]*a[1][0])/detm;
  return b;
}


/* compute eigenvalues of a 3x3 matrix
 * solving a cubic equation
 * use double for internal calculation */
INLINE float *fm3_eigval(float v[3], float a[3][3])
{
  double m, p, q, pr, pr3, a00, a11, a22;

  m = (a[0][0] + a[1][1] + a[2][2])/3;
  a00 = a[0][0] - m;
  a11 = a[1][1] - m;
  a22 = a[2][2] - m;
  q = ( a00 * (a11*a22 - a[1][2]*a[2][1])
      + a[0][1] * (a[1][2]*a[2][0] - a[1][0]*a22)
      + a[0][2] * (a[1][0]*a[2][1] - a11*a[2][0]) ) / 2.0;
  p = (a00*a00 + a11*a11 + a22*a22) / 6.0
    + (a[0][1]*a[1][0] + a[1][2]*a[2][1] + a[2][0]*a[0][2]) / 3.0;
  /* solve x^3 - 3 p x  - 2 q = 0 */
  pr = sqrt(p);
  pr3 = p * pr;
  if (pr3 <= fabs(q)) {
    if (q < 0.) { /* choose phi = pi/3 */
      v[1] = v[0] = (float) (m + pr);
      v[2] = (float) (m - 2.0 * pr);
    } else { /* phi = 0 */
      v[0] = (float) (m + 2.0 * pr);
      v[2] = v[1] = (float) (m - pr);
    }
  } else {
    double phi = acos(q/pr3)/3.0; /* 0 < phi < pi/3 */

    v[0] = (float) (m + 2.0 * pr * cos(phi));  /* largest */
    v[1] = (float) (m + 2.0 * pr * cos(phi - 2*M_PI/3)); /* second largest */
    v[2] = (float) (m + 2.0 * pr * cos(phi + 2*M_PI/3)); /* smallest */
  }
  return v;
}

/* compute eigenvalues of a 3x3 matrix
 * solving a cubic equation
 * use double for internal calculation */
INLINE double *dm3_eigval(double v[3], double a[3][3])
{
  double m, p, q, pr, pr3, a00, a11, a22;

  m = (a[0][0] + a[1][1] + a[2][2])/3;
  a00 = a[0][0] - m;
  a11 = a[1][1] - m;
  a22 = a[2][2] - m;
  q = ( a00 * (a11*a22 - a[1][2]*a[2][1])
      + a[0][1] * (a[1][2]*a[2][0] - a[1][0]*a22)
      + a[0][2] * (a[1][0]*a[2][1] - a11*a[2][0]) ) / 2.0;
  p = (a00*a00 + a11*a11 + a22*a22) / 6.0
    + (a[0][1]*a[1][0] + a[1][2]*a[2][1] + a[2][0]*a[0][2]) / 3.0;
  /* solve x^3 - 3 p x  - 2 q = 0 */
  pr = sqrt(p);
  pr3 = p * pr;
  if (pr3 <= fabs(q)) {
    if (q < 0.) { /* choose phi = pi/3 */
      v[1] = v[0] = (double) (m + pr);
      v[2] = (double) (m - 2.0 * pr);
    } else { /* phi = 0 */
      v[0] = (double) (m + 2.0 * pr);
      v[2] = v[1] = (double) (m - pr);
    }
  } else {
    double phi = acos(q/pr3)/3.0; /* 0 < phi < pi/3 */

    v[0] = (double) (m + 2.0 * pr * cos(phi));  /* largest */
    v[1] = (double) (m + 2.0 * pr * cos(phi - 2*M_PI/3)); /* second largest */
    v[2] = (double) (m + 2.0 * pr * cos(phi + 2*M_PI/3)); /* smallest */
  }
  return v;
}

/* compute eigenvalues of a 3x3 matrix
 * solving a cubic equation
 * use double for internal calculation */
INLINE real *rm3_eigval(real v[3], real a[3][3])
{
  double m, p, q, pr, pr3, a00, a11, a22;

  m = (a[0][0] + a[1][1] + a[2][2])/3;
  a00 = a[0][0] - m;
  a11 = a[1][1] - m;
  a22 = a[2][2] - m;
  q = ( a00 * (a11*a22 - a[1][2]*a[2][1])
      + a[0][1] * (a[1][2]*a[2][0] - a[1][0]*a22)
      + a[0][2] * (a[1][0]*a[2][1] - a11*a[2][0]) ) / 2.0;
  p = (a00*a00 + a11*a11 + a22*a22) / 6.0
    + (a[0][1]*a[1][0] + a[1][2]*a[2][1] + a[2][0]*a[0][2]) / 3.0;
  /* solve x^3 - 3 p x  - 2 q = 0 */
  pr = sqrt(p);
  pr3 = p * pr;
  if (pr3 <= fabs(q)) {
    if (q < 0.) { /* choose phi = pi/3 */
      v[1] = v[0] = (real) (m + pr);
      v[2] = (real) (m - 2.0 * pr);
    } else { /* phi = 0 */
      v[0] = (real) (m + 2.0 * pr);
      v[2] = v[1] = (real) (m - pr);
    }
  } else {
    double phi = acos(q/pr3)/3.0; /* 0 < phi < pi/3 */

    v[0] = (real) (m + 2.0 * pr * cos(phi));  /* largest */
    v[1] = (real) (m + 2.0 * pr * cos(phi - 2*M_PI/3)); /* second largest */
    v[2] = (real) (m + 2.0 * pr * cos(phi + 2*M_PI/3)); /* smallest */
  }
  return v;
}


/* sort `s' to descending order, order `u' and `v' correspondingly */
INLINE void fv3_sort3(float s[3], float (*u)[3], float (*v)[3])
{
  float tmp;

  if (s[2] > s[1]) {
    tmp = s[1]; s[1] = s[2]; s[2] = tmp;
    if (u) fv3_swap(u[1], u[2]);
    if (v) fv3_swap(v[1], v[2]);
  }
  if (s[1] > s[0]) {
    tmp = s[0]; s[0] = s[1]; s[1] = tmp;
    if (u) fv3_swap(u[0], u[1]);
    if (v) fv3_swap(v[0], v[1]);
  }
  if (s[2] > s[1]) {
    tmp = s[1]; s[1] = s[2]; s[2] = tmp;
    if (u) fv3_swap(u[1], u[2]);
    if (v) fv3_swap(v[1], v[2]);
  }
}

/* sort `s' to descending order, order `u' and `v' correspondingly */
INLINE void dv3_sort3(double s[3], double (*u)[3], double (*v)[3])
{
  double tmp;

  if (s[2] > s[1]) {
    tmp = s[1]; s[1] = s[2]; s[2] = tmp;
    if (u) dv3_swap(u[1], u[2]);
    if (v) dv3_swap(v[1], v[2]);
  }
  if (s[1] > s[0]) {
    tmp = s[0]; s[0] = s[1]; s[1] = tmp;
    if (u) dv3_swap(u[0], u[1]);
    if (v) dv3_swap(v[0], v[1]);
  }
  if (s[2] > s[1]) {
    tmp = s[1]; s[1] = s[2]; s[2] = tmp;
    if (u) dv3_swap(u[1], u[2]);
    if (v) dv3_swap(v[1], v[2]);
  }
}

/* sort `s' to descending order, order `u' and `v' correspondingly */
INLINE void rv3_sort3(real s[3], real (*u)[3], real (*v)[3])
{
  real tmp;

  if (s[2] > s[1]) {
    tmp = s[1]; s[1] = s[2]; s[2] = tmp;
    if (u) rv3_swap(u[1], u[2]);
    if (v) rv3_swap(v[1], v[2]);
  }
  if (s[1] > s[0]) {
    tmp = s[0]; s[0] = s[1]; s[1] = tmp;
    if (u) rv3_swap(u[0], u[1]);
    if (v) rv3_swap(v[0], v[1]);
  }
  if (s[2] > s[1]) {
    tmp = s[1]; s[1] = s[2]; s[2] = tmp;
    if (u) rv3_swap(u[1], u[2]);
    if (v) rv3_swap(v[1], v[2]);
  }
}


/* return the pivot row r and column c, starting from (r0, c0)
 * cmap[r0] registers the actual column index */
INLINE double fm3_pivot_(float m[3][3], int r0, int cmap[])
{
  int i, j, r, c;
  double tmp, max;
  float t;

  /* 1. find the pivot row and column */
  max = -1;
  for (j = r0; j < 3; j++)
    for (i = r0; i < 3; i++)
      if ((tmp = fabs(m[i][j])) > max)
        r = i, c = j, max = tmp;

  /* 2. put the pivot to the left-top corner */
  /* swapping row r and r0, which doesn't affect the solution */
  if (r != r0) fv3_swap(m[r], m[r0]);

  if (c != r0) { /* swap columns c and r0 */
    for (i = 0; i < 3; i++) /* must be from row 0 */
      t = m[i][c], m[i][c] = m[i][r0], m[i][r0] = t;
    i = cmap[c], cmap[c] = cmap[r0], cmap[r0] = i;
  }
  return max;
}

/* return the pivot row r and column c, starting from (r0, c0)
 * cmap[r0] registers the actual column index */
INLINE double dm3_pivot_(double m[3][3], int r0, int cmap[])
{
  int i, j, r, c;
  double tmp, max;
  double t;

  /* 1. find the pivot row and column */
  max = -1;
  for (j = r0; j < 3; j++)
    for (i = r0; i < 3; i++)
      if ((tmp = fabs(m[i][j])) > max)
        r = i, c = j, max = tmp;

  /* 2. put the pivot to the left-top corner */
  /* swapping row r and r0, which doesn't affect the solution */
  if (r != r0) dv3_swap(m[r], m[r0]);

  if (c != r0) { /* swap columns c and r0 */
    for (i = 0; i < 3; i++) /* must be from row 0 */
      t = m[i][c], m[i][c] = m[i][r0], m[i][r0] = t;
    i = cmap[c], cmap[c] = cmap[r0], cmap[r0] = i;
  }
  return max;
}

/* return the pivot row r and column c, starting from (r0, c0)
 * cmap[r0] registers the actual column index */
INLINE double rm3_pivot_(real m[3][3], int r0, int cmap[])
{
  int i, j, r, c;
  double tmp, max;
  real t;

  /* 1. find the pivot row and column */
  max = -1;
  for (j = r0; j < 3; j++)
    for (i = r0; i < 3; i++)
      if ((tmp = fabs(m[i][j])) > max)
        r = i, c = j, max = tmp;

  /* 2. put the pivot to the left-top corner */
  /* swapping row r and r0, which doesn't affect the solution */
  if (r != r0) rv3_swap(m[r], m[r0]);

  if (c != r0) { /* swap columns c and r0 */
    for (i = 0; i < 3; i++) /* must be from row 0 */
      t = m[i][c], m[i][c] = m[i][r0], m[i][r0] = t;
    i = cmap[c], cmap[c] = cmap[r0], cmap[r0] = i;
  }
  return max;
}


/* Solve matrix equation a x = 0 by Gaussian elimination (full-pivot)
 * The matrix 'a' is destroyed, solutions are saved as *row* vectors in 'x'
 * return the number of solutions */
INLINE int fm3_solvezero(float a[3][3], float (*x)[3])
{
  double max, tol, tol1, tol2;
  int cmap[3] = {0, 1, 2};
  float a00;

  max = fm3_pivot_(a, 0, cmap); /* pivot for column 0 */

  if (max <= 0) { /* matrix is zero */
    fv3_make(x[0], 1, 0, 0);
    fv3_make(x[1], 0, 1, 0);
    fv3_make(x[2], 0, 0, 1);
    return 3;
  }

  /* normalize row 0 such that a[0][0] = 1 */
  a00 = a[0][0];
  fv3_smul(a[0], 1/a00);
  fv3_smul(a[1], 1/a00);
  fv3_smul(a[2], 1/a00);
  /* gaussian elimination */
  a[1][1] -= a[1][0] * a[0][1];
  a[1][2] -= a[1][0] * a[0][2];
  a[2][1] -= a[2][0] * a[0][1];
  a[2][2] -= a[2][0] * a[0][2];

  max = fm3_pivot_(a, 1, cmap); /* pivot for column 1 */
  tol1 = 10.*(sizeof(double) == sizeof(float) ? DBL_EPSILON : FLT_EPSILON);
  /* estimate intrinsic error: the determinant is supposed to
   * be zero now; if not, use it as the intrinsic error */
  tol2 = 3 * sqrt(fabs(a[1][1]*a[2][2] - a[1][2]*a[2][1]));
  tol = tol1 > tol2 ? tol1 : tol2;

  if (max <= tol) { /* zero */
    a[1][ cmap[0] ] = a[0][0];
    a[1][ cmap[1] ] = a[0][1];
    a[1][ cmap[2] ] = a[0][2];
    fv3_makenorm(x[0], a[1][1], -a[1][0], 0);
    fv3_normalize( fv3_cross(x[1], a[1], x[0]) );
    return 2;
  }

  x[0][ cmap[1] ] = a[1][2];
  x[0][ cmap[2] ] = -a[1][1];
  x[0][ cmap[0] ] = - a[0][1] * a[1][2] - a[0][2] * (-a[1][1]);
  fv3_normalize(x[0]);
  return 1;
}

/* Solve matrix equation a x = 0 by Gaussian elimination (full-pivot)
 * The matrix 'a' is destroyed, solutions are saved as *row* vectors in 'x'
 * return the number of solutions */
INLINE int dm3_solvezero(double a[3][3], double (*x)[3])
{
  double max, tol, tol1, tol2;
  int cmap[3] = {0, 1, 2};
  double a00;

  max = dm3_pivot_(a, 0, cmap); /* pivot for column 0 */

  if (max <= 0) { /* matrix is zero */
    dv3_make(x[0], 1, 0, 0);
    dv3_make(x[1], 0, 1, 0);
    dv3_make(x[2], 0, 0, 1);
    return 3;
  }

  /* normalize row 0 such that a[0][0] = 1 */
  a00 = a[0][0];
  dv3_smul(a[0], 1/a00);
  dv3_smul(a[1], 1/a00);
  dv3_smul(a[2], 1/a00);
  /* gaussian elimination */
  a[1][1] -= a[1][0] * a[0][1];
  a[1][2] -= a[1][0] * a[0][2];
  a[2][1] -= a[2][0] * a[0][1];
  a[2][2] -= a[2][0] * a[0][2];

  max = dm3_pivot_(a, 1, cmap); /* pivot for column 1 */
  tol1 = 10.*(sizeof(double) == sizeof(double) ? DBL_EPSILON : FLT_EPSILON);
  /* estimate intrinsic error: the determinant is supposed to
   * be zero now; if not, use it as the intrinsic error */
  tol2 = 3 * sqrt(fabs(a[1][1]*a[2][2] - a[1][2]*a[2][1]));
  tol = tol1 > tol2 ? tol1 : tol2;

  if (max <= tol) { /* zero */
    a[1][ cmap[0] ] = a[0][0];
    a[1][ cmap[1] ] = a[0][1];
    a[1][ cmap[2] ] = a[0][2];
    dv3_makenorm(x[0], a[1][1], -a[1][0], 0);
    dv3_normalize( dv3_cross(x[1], a[1], x[0]) );
    return 2;
  }

  x[0][ cmap[1] ] = a[1][2];
  x[0][ cmap[2] ] = -a[1][1];
  x[0][ cmap[0] ] = - a[0][1] * a[1][2] - a[0][2] * (-a[1][1]);
  dv3_normalize(x[0]);
  return 1;
}

/* Solve matrix equation a x = 0 by Gaussian elimination (full-pivot)
 * The matrix 'a' is destroyed, solutions are saved as *row* vectors in 'x'
 * return the number of solutions */
INLINE int rm3_solvezero(real a[3][3], real (*x)[3])
{
  double max, tol, tol1, tol2;
  int cmap[3] = {0, 1, 2};
  real a00;

  max = rm3_pivot_(a, 0, cmap); /* pivot for column 0 */

  if (max <= 0) { /* matrix is zero */
    rv3_make(x[0], 1, 0, 0);
    rv3_make(x[1], 0, 1, 0);
    rv3_make(x[2], 0, 0, 1);
    return 3;
  }

  /* normalize row 0 such that a[0][0] = 1 */
  a00 = a[0][0];
  rv3_smul(a[0], 1/a00);
  rv3_smul(a[1], 1/a00);
  rv3_smul(a[2], 1/a00);
  /* gaussian elimination */
  a[1][1] -= a[1][0] * a[0][1];
  a[1][2] -= a[1][0] * a[0][2];
  a[2][1] -= a[2][0] * a[0][1];
  a[2][2] -= a[2][0] * a[0][2];

  max = rm3_pivot_(a, 1, cmap); /* pivot for column 1 */
  tol1 = 10.*(sizeof(double) == sizeof(real) ? DBL_EPSILON : FLT_EPSILON);
  /* estimate intrinsic error: the determinant is supposed to
   * be zero now; if not, use it as the intrinsic error */
  tol2 = 3 * sqrt(fabs(a[1][1]*a[2][2] - a[1][2]*a[2][1]));
  tol = tol1 > tol2 ? tol1 : tol2;

  if (max <= tol) { /* zero */
    a[1][ cmap[0] ] = a[0][0];
    a[1][ cmap[1] ] = a[0][1];
    a[1][ cmap[2] ] = a[0][2];
    rv3_makenorm(x[0], a[1][1], -a[1][0], 0);
    rv3_normalize( rv3_cross(x[1], a[1], x[0]) );
    return 2;
  }

  x[0][ cmap[1] ] = a[1][2];
  x[0][ cmap[2] ] = -a[1][1];
  x[0][ cmap[0] ] = - a[0][1] * a[1][2] - a[0][2] * (-a[1][1]);
  rv3_normalize(x[0]);
  return 1;
}


/* given an eigenvalue, return the corresponding eigenvectors */
INLINE int fm3_eigvecs(float (*vecs)[3], float mat[3][3], float val)
{
  float m[3][3];

  fm3_copy(m, mat); /* make a matrix */
  m[0][0] -= val;
  m[1][1] -= val;
  m[2][2] -= val;
  return fm3_solvezero(m, vecs);
}

/* given an eigenvalue, return the corresponding eigenvectors */
INLINE int dm3_eigvecs(double (*vecs)[3], double mat[3][3], double val)
{
  double m[3][3];

  dm3_copy(m, mat); /* make a matrix */
  m[0][0] -= val;
  m[1][1] -= val;
  m[2][2] -= val;
  return dm3_solvezero(m, vecs);
}

/* given an eigenvalue, return the corresponding eigenvectors */
INLINE int rm3_eigvecs(real (*vecs)[3], real mat[3][3], real val)
{
  real m[3][3];

  rm3_copy(m, mat); /* make a matrix */
  m[0][0] -= val;
  m[1][1] -= val;
  m[2][2] -= val;
  return rm3_solvezero(m, vecs);
}


/* given the matrix 'mat' and its eigenvalues 'v' return eigenvalues 'vecs'
 * ideally, eigenvalues should be sorted in magnitude-descending order
 * by default, vecs are transposed as a set of column vectors
 * set 'nt' != 0 to disable it: so vecs[0] is the first eigenvector  */
INLINE fv3_t *fm3_eigsys(float v[3], float vecs[3][3], float mat[3][3], int nt)
{
  float vs[5][3] = {{0}}; /* for safety, vs needs 5 rows */
  int n = 0, nn, i = 0;

  fm3_eigval(v, mat);

  for (nn = i = 0; i < 3; i++) {
    n = fm3_eigvecs(vs + nn, mat, v[nn]);
    if (n == 0) return NULL;
    if ((nn += n) >= 3) break;
  }

  fm3_copy(vecs, vs);
  fv3_sort3(v, vecs, NULL);

  return nt ? vecs : fm3_trans(vecs);
}

/* given the matrix 'mat' and its eigenvalues 'v' return eigenvalues 'vecs'
 * ideally, eigenvalues should be sorted in magnitude-descending order
 * by default, vecs are transposed as a set of column vectors
 * set 'nt' != 0 to disable it: so vecs[0] is the first eigenvector  */
INLINE dv3_t *dm3_eigsys(double v[3], double vecs[3][3], double mat[3][3], int nt)
{
  double vs[5][3] = {{0}}; /* for safety, vs needs 5 rows */
  int n = 0, nn, i = 0;

  dm3_eigval(v, mat);

  for (nn = i = 0; i < 3; i++) {
    n = dm3_eigvecs(vs + nn, mat, v[nn]);
    if (n == 0) return NULL;
    if ((nn += n) >= 3) break;
  }

  dm3_copy(vecs, vs);
  dv3_sort3(v, vecs, NULL);

  return nt ? vecs : dm3_trans(vecs);
}

/* given the matrix 'mat' and its eigenvalues 'v' return eigenvalues 'vecs'
 * ideally, eigenvalues should be sorted in magnitude-descending order
 * by default, vecs are transposed as a set of column vectors
 * set 'nt' != 0 to disable it: so vecs[0] is the first eigenvector  */
INLINE rv3_t *rm3_eigsys(real v[3], real vecs[3][3], real mat[3][3], int nt)
{
  real vs[5][3] = {{0}}; /* for safety, vs needs 5 rows */
  int n = 0, nn, i = 0;

  rm3_eigval(v, mat);

  for (nn = i = 0; i < 3; i++) {
    n = rm3_eigvecs(vs + nn, mat, v[nn]);
    if (n == 0) return NULL;
    if ((nn += n) >= 3) break;
  }

  rm3_copy(vecs, vs);
  rv3_sort3(v, vecs, NULL);

  return nt ? vecs : rm3_trans(vecs);
}


/* SVD decomposition of a 3x3 matrix A = U S V^T */
INLINE void fm3_svd(float a[3][3], float u[3][3], float s[3], float v[3][3])
{
  int i, rank;
  float ata[3][3], us[3][3];

  /* A^T A = V S^2 V^T, so (A^T A) V = V S^2 */

  /* 1. compute A^T A and its eigenvectors, which is V */
  fm3_tmul(ata, a, a);
  fm3_eigsys(s, v, ata, 1);

  /* 2. U^T = S^{-1} V^T A^T, and each row of U^T is an eigenvector
   * since eigenvectors are to be normalized, S^{-1} is unnecessary */
  if (s[0] <= 0.0) {
    rank = 0;
    fm3_copy(u, v);
  } else {
    double tol = 10. * sqrt(DBL_EPSILON);
    /* the test i = 1 + (s[1] > s[0]*tol) + (s[2] > s[0]*tol); */
    fm3_mult(u, v, a);
    for (i = 0; i < 3; i++) {
      fv3_copy(us[i], u[i]); /* save a copy of V^T A^T before normalizing it */
      s[i] = fv3_norm(u[i]);
      if (s[i] > 0) fv3_smul(u[i], 1/s[i]);
    }
    rank = 1;
    rank += (fabs(fv3_dot(u[0], u[1])) < tol && s[1] > tol);
    rank += (fabs(fv3_dot(u[0], u[2])) < tol && fabs(fv3_dot(u[1], u[2])) < tol && s[2] > tol);
    if (rank <= 2) {
      if (rank == 1) {
        float z[3] = {0, 0, 0}, w, tmp;

        w = (float) fabs(u[0][i = 0]);
        if ((tmp = (float) fabs(u[0][1])) < w) w = tmp, i = 1;
        if ((tmp = (float) fabs(u[0][2])) < w) i = 2;
        z[i] = 1.0f; /* select the smallest element in u[0] as z */
        fv3_normalize( fv3_cross(u[1], z, u[0]) );
        s[1] = fv3_dot(u[1], us[1]); /* S = U^T (V^T A^T)^T is more accurate than sqrt(A^T A) */
        if (s[1] < 0) { s[1] = -s[1]; fv3_neg(u[1]); } /* make sure s[1] > 0 */
      }
      fv3_normalize( fv3_cross(u[2], u[0], u[1]) );
      s[2] = fv3_dot(u[2], us[2]);
      if (s[2] < 0) { s[2] = -s[2]; fv3_neg(u[2]); }
    }
    fv3_sort3(s, u, v);
  }
  fm3_trans(v);
  fm3_trans(u);
}

/* SVD decomposition of a 3x3 matrix A = U S V^T */
INLINE void dm3_svd(double a[3][3], double u[3][3], double s[3], double v[3][3])
{
  int i, rank;
  double ata[3][3], us[3][3];

  /* A^T A = V S^2 V^T, so (A^T A) V = V S^2 */

  /* 1. compute A^T A and its eigenvectors, which is V */
  dm3_tmul(ata, a, a);
  dm3_eigsys(s, v, ata, 1);

  /* 2. U^T = S^{-1} V^T A^T, and each row of U^T is an eigenvector
   * since eigenvectors are to be normalized, S^{-1} is unnecessary */
  if (s[0] <= 0.0) {
    rank = 0;
    dm3_copy(u, v);
  } else {
    double tol = 10. * sqrt(DBL_EPSILON);
    /* the test i = 1 + (s[1] > s[0]*tol) + (s[2] > s[0]*tol); */
    dm3_mult(u, v, a);
    for (i = 0; i < 3; i++) {
      dv3_copy(us[i], u[i]); /* save a copy of V^T A^T before normalizing it */
      s[i] = dv3_norm(u[i]);
      if (s[i] > 0) dv3_smul(u[i], 1/s[i]);
    }
    rank = 1;
    rank += (fabs(dv3_dot(u[0], u[1])) < tol && s[1] > tol);
    rank += (fabs(dv3_dot(u[0], u[2])) < tol && fabs(dv3_dot(u[1], u[2])) < tol && s[2] > tol);
    if (rank <= 2) {
      if (rank == 1) {
        double z[3] = {0, 0, 0}, w, tmp;

        w = (double) fabs(u[0][i = 0]);
        if ((tmp = (double) fabs(u[0][1])) < w) w = tmp, i = 1;
        if ((tmp = (double) fabs(u[0][2])) < w) i = 2;
        z[i] = 1.0f; /* select the smallest element in u[0] as z */
        dv3_normalize( dv3_cross(u[1], z, u[0]) );
        s[1] = dv3_dot(u[1], us[1]); /* S = U^T (V^T A^T)^T is more accurate than sqrt(A^T A) */
        if (s[1] < 0) { s[1] = -s[1]; dv3_neg(u[1]); } /* make sure s[1] > 0 */
      }
      dv3_normalize( dv3_cross(u[2], u[0], u[1]) );
      s[2] = dv3_dot(u[2], us[2]);
      if (s[2] < 0) { s[2] = -s[2]; dv3_neg(u[2]); }
    }
    dv3_sort3(s, u, v);
  }
  dm3_trans(v);
  dm3_trans(u);
}

/* SVD decomposition of a 3x3 matrix A = U S V^T */
INLINE void rm3_svd(real a[3][3], real u[3][3], real s[3], real v[3][3])
{
  int i, rank;
  real ata[3][3], us[3][3];

  /* A^T A = V S^2 V^T, so (A^T A) V = V S^2 */

  /* 1. compute A^T A and its eigenvectors, which is V */
  rm3_tmul(ata, a, a);
  rm3_eigsys(s, v, ata, 1);

  /* 2. U^T = S^{-1} V^T A^T, and each row of U^T is an eigenvector
   * since eigenvectors are to be normalized, S^{-1} is unnecessary */
  if (s[0] <= 0.0) {
    rank = 0;
    rm3_copy(u, v);
  } else {
    double tol = 10. * sqrt(DBL_EPSILON);
    /* the test i = 1 + (s[1] > s[0]*tol) + (s[2] > s[0]*tol); */
    rm3_mult(u, v, a);
    for (i = 0; i < 3; i++) {
      rv3_copy(us[i], u[i]); /* save a copy of V^T A^T before normalizing it */
      s[i] = rv3_norm(u[i]);
      if (s[i] > 0) rv3_smul(u[i], 1/s[i]);
    }
    rank = 1;
    rank += (fabs(rv3_dot(u[0], u[1])) < tol && s[1] > tol);
    rank += (fabs(rv3_dot(u[0], u[2])) < tol && fabs(rv3_dot(u[1], u[2])) < tol && s[2] > tol);
    if (rank <= 2) {
      if (rank == 1) {
        real z[3] = {0, 0, 0}, w, tmp;

        w = (real) fabs(u[0][i = 0]);
        if ((tmp = (real) fabs(u[0][1])) < w) w = tmp, i = 1;
        if ((tmp = (real) fabs(u[0][2])) < w) i = 2;
        z[i] = 1.0f; /* select the smallest element in u[0] as z */
        rv3_normalize( rv3_cross(u[1], z, u[0]) );
        s[1] = rv3_dot(u[1], us[1]); /* S = U^T (V^T A^T)^T is more accurate than sqrt(A^T A) */
        if (s[1] < 0) { s[1] = -s[1]; rv3_neg(u[1]); } /* make sure s[1] > 0 */
      }
      rv3_normalize( rv3_cross(u[2], u[0], u[1]) );
      s[2] = rv3_dot(u[2], us[2]);
      if (s[2] < 0) { s[2] = -s[2]; rv3_neg(u[2]); }
    }
    rv3_sort3(s, u, v);
  }
  rm3_trans(v);
  rm3_trans(u);
}


/* an old alias */
#define rotfit3 rv3_rmsd

/* Fit x to y by rotation and translation of the `x'
 * If `refl', reflection can also be used.
 * The best-fit structure is saved to `xf', if not NULL */
INLINE float fv3_rmsd(fv3_t * RESTRICT x, fv3_t * RESTRICT xf,
    fv3_t * RESTRICT y, const float *w, int n, int refl,
    float (* RESTRICT r)[3], float * RESTRICT t)
{
  int i;
  float wtot = 0, sq, dev = 0, dev0, detm;
  fv3_t xc, yc, xs, ys, sig, t_;
  float u[3][3], v[3][3], s[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, xy[3][3], r_[3][3];

  if (r == NULL) r = r_;
  if (t == NULL) t = t_;

  /* 1. compute the centers */
  fv3_zero(xc);
  fv3_zero(yc);
  if (w == NULL) {
    for (i = 0; i < n; i++) {
      fv3_inc(xc, x[i]);
      fv3_inc(yc, y[i]);
    }
    wtot = (float) n;
  } else {
    for (wtot = 0., i = 0; i < n; i++) {
      fv3_sinc(xc, x[i], w[i]);
      fv3_sinc(yc, y[i], w[i]);
      wtot += w[i];
    }
  }
  fv3_smul(xc, 1.f/wtot);
  fv3_smul(yc, 1.f/wtot);

  /* 2. compute 3x3 asymmetric covariance matrix S = (x-xc) (y-yc)^T */
  for (i = 0; i < n; i++) {
    fv3_diff(xs, x[i], xc); /* shift to the center avoid the translation */
    fv3_diff(ys, y[i], yc);
    fm3_vtv(xy, xs, ys);
    sq  = fv3_sqr(xs);
    sq += fv3_sqr(ys);
    if (w) {
      fm3_sinc(s, xy, w[i]);
      dev += w[i]*sq;
    } else {
      fm3_inc(s, xy);
      dev += sq; /* Tr(x^T x + y^T y) */
    }
  }
  dev0 = dev;

  /* 3. SVD decompose S = u sig v^T */
  fm3_svd(s, u, sig, v);

  /* 4. compute R = v u^T */
  fm3_mult(r, v, u);
  detm = fm3_det(r);

#define rmsd_dump_(title) { const char *rfmt = "%22.14f"; \
    printf("rmsd [" title "], fatal error: detm = %g, n = %d\n", detm, n); \
    fm3_print(s, "s = (x - xc) (y - yc)^T", rfmt, 1); \
    printf("det(s) = %g\n", fm3_det(s)); \
    fm3_print(u, "u", rfmt, 1); \
    printf("det(u) = %g\n", fm3_det(u)); \
    fm3_print(v, "v", rfmt, 1); \
    printf("det(v) = %g\n", fm3_det(v)); \
    fv3_print(sig, "sig", rfmt, 1); \
    fm3_print(r, "r = v.u (rotation matrix)", rfmt, 1); \
    printf("det(r) = %g\n", fm3_det(r)); \
    fm3_mult(r, u, v); fm3_print(r, "r' = u.v", rfmt, 1); \
    printf("det(r') = %g\n", fm3_det(r)); \
    exit(1); }
  if (fabs(fabs(detm) - 1) > 0.01) {
    fprintf(stderr, "detm: %g\n", detm);
    rmsd_dump_("bad svd");
  }
  if (detm < 0 && !refl) { /* to avoid a reflection */
    fm3_trans(u);
    fv3_neg(u[2]); /* flip the last eigenvector */
    fm3_mul(r, v, u);
    dev -= 2*(sig[0] + sig[1] - sig[2]);
    detm = fm3_det(r);
    if (fabs(fabs(detm) - 1) > 0.01) rmsd_dump_("bad inv.");
#undef rmsd_dump_
  } else {
    dev -= 2 * (sig[0] + sig[1] + sig[2]); /* -2 Tr(R x y^T) */
  }
  if (dev < 0) dev = 0;
  fv3_diff(t, yc, fm3_mulvec(xs, r, xc)); /* t = yc - R xc */

  /* 5. compute the rotated structure */
  if (xf || dev < dev0*0.01) { /* if there's a large cancellation recompute the deviation */
    float xfi[3];

    for (dev = 0, i = 0; i < n; i++) {
      fv3_add(xfi, fm3_mulvec(xs, r, x[i]), t); /* xfi = R x + t */
      sq = fv3_dist2(y[i], xfi);
      if (xf) fv3_copy(xf[i], xfi);
      dev +=  (w ? w[i]*sq : sq); /* recompute the deviation */
    }
  }
  return (float) sqrt(dev/wtot);
}

/* Fit x to y by rotation and translation of the `x'
 * If `refl', reflection can also be used.
 * The best-fit structure is saved to `xf', if not NULL */
INLINE double dv3_rmsd(dv3_t * RESTRICT x, dv3_t * RESTRICT xf,
    dv3_t * RESTRICT y, const double *w, int n, int refl,
    double (* RESTRICT r)[3], double * RESTRICT t)
{
  int i;
  double wtot = 0, sq, dev = 0, dev0, detm;
  dv3_t xc, yc, xs, ys, sig, t_;
  double u[3][3], v[3][3], s[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, xy[3][3], r_[3][3];

  if (r == NULL) r = r_;
  if (t == NULL) t = t_;

  /* 1. compute the centers */
  dv3_zero(xc);
  dv3_zero(yc);
  if (w == NULL) {
    for (i = 0; i < n; i++) {
      dv3_inc(xc, x[i]);
      dv3_inc(yc, y[i]);
    }
    wtot = (double) n;
  } else {
    for (wtot = 0., i = 0; i < n; i++) {
      dv3_sinc(xc, x[i], w[i]);
      dv3_sinc(yc, y[i], w[i]);
      wtot += w[i];
    }
  }
  dv3_smul(xc, 1.f/wtot);
  dv3_smul(yc, 1.f/wtot);

  /* 2. compute 3x3 asymmetric covariance matrix S = (x-xc) (y-yc)^T */
  for (i = 0; i < n; i++) {
    dv3_diff(xs, x[i], xc); /* shift to the center avoid the translation */
    dv3_diff(ys, y[i], yc);
    dm3_vtv(xy, xs, ys);
    sq  = dv3_sqr(xs);
    sq += dv3_sqr(ys);
    if (w) {
      dm3_sinc(s, xy, w[i]);
      dev += w[i]*sq;
    } else {
      dm3_inc(s, xy);
      dev += sq; /* Tr(x^T x + y^T y) */
    }
  }
  dev0 = dev;

  /* 3. SVD decompose S = u sig v^T */
  dm3_svd(s, u, sig, v);

  /* 4. compute R = v u^T */
  dm3_mult(r, v, u);
  detm = dm3_det(r);

#define rmsd_dump_(title) { const char *rfmt = "%22.14f"; \
    printf("rmsd [" title "], fatal error: detm = %g, n = %d\n", detm, n); \
    dm3_print(s, "s = (x - xc) (y - yc)^T", rfmt, 1); \
    printf("det(s) = %g\n", dm3_det(s)); \
    dm3_print(u, "u", rfmt, 1); \
    printf("det(u) = %g\n", dm3_det(u)); \
    dm3_print(v, "v", rfmt, 1); \
    printf("det(v) = %g\n", dm3_det(v)); \
    dv3_print(sig, "sig", rfmt, 1); \
    dm3_print(r, "r = v.u (rotation matrix)", rfmt, 1); \
    printf("det(r) = %g\n", dm3_det(r)); \
    dm3_mult(r, u, v); dm3_print(r, "r' = u.v", rfmt, 1); \
    printf("det(r') = %g\n", dm3_det(r)); \
    exit(1); }
  if (fabs(fabs(detm) - 1) > 0.01) {
    fprintf(stderr, "detm: %g\n", detm);
    rmsd_dump_("bad svd");
  }
  if (detm < 0 && !refl) { /* to avoid a reflection */
    dm3_trans(u);
    dv3_neg(u[2]); /* flip the last eigenvector */
    dm3_mul(r, v, u);
    dev -= 2*(sig[0] + sig[1] - sig[2]);
    detm = dm3_det(r);
    if (fabs(fabs(detm) - 1) > 0.01) rmsd_dump_("bad inv.");
#undef rmsd_dump_
  } else {
    dev -= 2 * (sig[0] + sig[1] + sig[2]); /* -2 Tr(R x y^T) */
  }
  if (dev < 0) dev = 0;
  dv3_diff(t, yc, dm3_mulvec(xs, r, xc)); /* t = yc - R xc */

  /* 5. compute the rotated structure */
  if (xf || dev < dev0*0.01) { /* if there's a large cancellation recompute the deviation */
    double xfi[3];

    for (dev = 0, i = 0; i < n; i++) {
      dv3_add(xfi, dm3_mulvec(xs, r, x[i]), t); /* xfi = R x + t */
      sq = dv3_dist2(y[i], xfi);
      if (xf) dv3_copy(xf[i], xfi);
      dev +=  (w ? w[i]*sq : sq); /* recompute the deviation */
    }
  }
  return (double) sqrt(dev/wtot);
}

/* Fit x to y by rotation and translation of the `x'
 * If `refl', reflection can also be used.
 * The best-fit structure is saved to `xf', if not NULL */
INLINE real rv3_rmsd(rv3_t * RESTRICT x, rv3_t * RESTRICT xf,
    rv3_t * RESTRICT y, const real *w, int n, int refl,
    real (* RESTRICT r)[3], real * RESTRICT t)
{
  int i;
  real wtot = 0, sq, dev = 0, dev0, detm;
  rv3_t xc, yc, xs, ys, sig, t_;
  real u[3][3], v[3][3], s[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}, xy[3][3], r_[3][3];

  if (r == NULL) r = r_;
  if (t == NULL) t = t_;

  /* 1. compute the centers */
  rv3_zero(xc);
  rv3_zero(yc);
  if (w == NULL) {
    for (i = 0; i < n; i++) {
      rv3_inc(xc, x[i]);
      rv3_inc(yc, y[i]);
    }
    wtot = (real) n;
  } else {
    for (wtot = 0., i = 0; i < n; i++) {
      rv3_sinc(xc, x[i], w[i]);
      rv3_sinc(yc, y[i], w[i]);
      wtot += w[i];
    }
  }
  rv3_smul(xc, 1.f/wtot);
  rv3_smul(yc, 1.f/wtot);

  /* 2. compute 3x3 asymmetric covariance matrix S = (x-xc) (y-yc)^T */
  for (i = 0; i < n; i++) {
    rv3_diff(xs, x[i], xc); /* shift to the center avoid the translation */
    rv3_diff(ys, y[i], yc);
    rm3_vtv(xy, xs, ys);
    sq  = rv3_sqr(xs);
    sq += rv3_sqr(ys);
    if (w) {
      rm3_sinc(s, xy, w[i]);
      dev += w[i]*sq;
    } else {
      rm3_inc(s, xy);
      dev += sq; /* Tr(x^T x + y^T y) */
    }
  }
  dev0 = dev;

  /* 3. SVD decompose S = u sig v^T */
  rm3_svd(s, u, sig, v);

  /* 4. compute R = v u^T */
  rm3_mult(r, v, u);
  detm = rm3_det(r);

#define rmsd_dump_(title) { const char *rfmt = "%22.14f"; \
    printf("rmsd [" title "], fatal error: detm = %g, n = %d\n", detm, n); \
    rm3_print(s, "s = (x - xc) (y - yc)^T", rfmt, 1); \
    printf("det(s) = %g\n", rm3_det(s)); \
    rm3_print(u, "u", rfmt, 1); \
    printf("det(u) = %g\n", rm3_det(u)); \
    rm3_print(v, "v", rfmt, 1); \
    printf("det(v) = %g\n", rm3_det(v)); \
    rv3_print(sig, "sig", rfmt, 1); \
    rm3_print(r, "r = v.u (rotation matrix)", rfmt, 1); \
    printf("det(r) = %g\n", rm3_det(r)); \
    rm3_mult(r, u, v); rm3_print(r, "r' = u.v", rfmt, 1); \
    printf("det(r') = %g\n", rm3_det(r)); \
    exit(1); }
  if (fabs(fabs(detm) - 1) > 0.01) {
    fprintf(stderr, "detm: %g\n", detm);
    rmsd_dump_("bad svd");
  }
  if (detm < 0 && !refl) { /* to avoid a reflection */
    rm3_trans(u);
    rv3_neg(u[2]); /* flip the last eigenvector */
    rm3_mul(r, v, u);
    dev -= 2*(sig[0] + sig[1] - sig[2]);
    detm = rm3_det(r);
    if (fabs(fabs(detm) - 1) > 0.01) rmsd_dump_("bad inv.");
#undef rmsd_dump_
  } else {
    dev -= 2 * (sig[0] + sig[1] + sig[2]); /* -2 Tr(R x y^T) */
  }
  if (dev < 0) dev = 0;
  rv3_diff(t, yc, rm3_mulvec(xs, r, xc)); /* t = yc - R xc */

  /* 5. compute the rotated structure */
  if (xf || dev < dev0*0.01) { /* if there's a large cancellation recompute the deviation */
    real xfi[3];

    for (dev = 0, i = 0; i < n; i++) {
      rv3_add(xfi, rm3_mulvec(xs, r, x[i]), t); /* xfi = R x + t */
      sq = rv3_dist2(y[i], xfi);
      if (xf) rv3_copy(xf[i], xfi);
      dev +=  (w ? w[i]*sq : sq); /* recompute the deviation */
    }
  }
  return (real) sqrt(dev/wtot);
}


/* generate a random orthonormal (unitary) 3x3 matrix */
INLINE fv3_t *fm3_rnduni(float a[3][3])
{
  float dot;

  fv3_rnddir0(a[0]);

  fv3_rnd(a[1], -1, 1);
  /* component of a[1] normal to a[0] */
  dot = fv3_dot(a[0], a[1]);
  fv3_normalize( fv3_sinc(a[1], a[0], -dot) );

  fv3_cross(a[2], a[0], a[1]);
  return a;
}

/* generate a random orthonormal (unitary) 3x3 matrix */
INLINE dv3_t *dm3_rnduni(double a[3][3])
{
  double dot;

  dv3_rnddir0(a[0]);

  dv3_rnd(a[1], -1, 1);
  /* component of a[1] normal to a[0] */
  dot = dv3_dot(a[0], a[1]);
  dv3_normalize( dv3_sinc(a[1], a[0], -dot) );

  dv3_cross(a[2], a[0], a[1]);
  return a;
}

/* generate a random orthonormal (unitary) 3x3 matrix */
INLINE rv3_t *rm3_rnduni(real a[3][3])
{
  real dot;

  rv3_rnddir0(a[0]);

  rv3_rnd(a[1], -1, 1);
  /* component of a[1] normal to a[0] */
  dot = rv3_dot(a[0], a[1]);
  rv3_normalize( rv3_sinc(a[1], a[0], -dot) );

  rv3_cross(a[2], a[0], a[1]);
  return a;
}


#endif /* ZCOM_RV3__ */
#endif /* ZCOM_RV3 */


#ifdef  ZCOM_MD
#ifndef ZCOM_MD__
#define ZCOM_MD__

#define md_shiftcom(x, n, d) md_shiftcomw(x, NULL, n, d)
#define md_shiftcom3d(x, n) md_shiftcomw3d(x, NULL, n)
#define md_shiftcom2d(x, n) md_shiftcomw2d(x, NULL, n)

INLINE void md_shiftcomw(real * RESTRICT x, const real * RESTRICT w, int n, int d);
/* these two are inline instead macros because they offer type checks */
INLINE void md_shiftcomw2d(rv2_t * RESTRICT x, const real * RESTRICT w, int n)
  { md_shiftcomw((real *) x, w, n, 2); }
INLINE void md_shiftcomw3d(rv3_t * RESTRICT x, const real * RESTRICT w, int n)
  { md_shiftcomw((real *) x, w, n, 3); }

INLINE void md_shiftang2d(rv2_t * RESTRICT x, rv2_t * RESTRICT v, int n);
INLINE void md_shiftang3d(rv3_t * RESTRICT x, rv3_t * RESTRICT v, int n);
INLINE void md_shiftang(real * RESTRICT x, real * RESTRICT v, int n, int d)
{
  if (d == 2) md_shiftang2d((rv2_t *) x, (rv2_t *) v, n);
  else md_shiftang3d((rv3_t *) x, (rv3_t *) v, n);
}


INLINE real md_ekin(const real *v, int nd, int dof, real * RESTRICT tkin);
INLINE real md_ekin2d(rv2_t * RESTRICT v, int n, int dof, real * RESTRICT tkin)
  { return md_ekin((const real *) v, n*2, dof, tkin); }
INLINE real md_ekin3d(rv3_t * RESTRICT v, int n, int dof, real * RESTRICT tkin)
  { return md_ekin((const real *) v, n*3, dof, tkin); }


INLINE void md_vscale(real * RESTRICT v, int nd, int dof, real tp, real ekt, real * RESTRICT ekin, real * RESTRICT tkin);
INLINE void md_vscale2d(rv2_t * RESTRICT v, int n, int dof, real tp, real ekt, real * RESTRICT ekin, real * RESTRICT tkin)
  { md_vscale((real *) v, n*2, dof, tp, ekt, ekin, tkin); }
INLINE void md_vscale3d(rv3_t * RESTRICT v, int n, int dof, real tp, real ekt, real * RESTRICT ekin, real * RESTRICT tkin)
  { md_vscale((real *) v, n*3, dof, tp, ekt, ekin, tkin); }


INLINE void md_vrescale(real * RESTRICT v, int nd, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin);
INLINE void md_vrescale2d(rv2_t * RESTRICT v, int n, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin)
  { md_vrescale((real *) v, n*2, dof, tp, dt, ekin, tkin); }
INLINE void md_vrescale3d(rv3_t * RESTRICT v, int n, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin)
  { md_vrescale((real *) v, n*3, dof, tp, dt, ekin, tkin); }


INLINE void md_vrescalex(real * RESTRICT v, int nd, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin);
INLINE void md_vrescalex2d(rv2_t * RESTRICT v, int n, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin)
  { md_vrescalex((real *) v, n*2, dof, tp, dt, ekin, tkin); }
INLINE void md_vrescalex3d(rv3_t * RESTRICT v, int n, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin)
  { md_vrescalex((real *) v, n*3, dof, tp, dt, ekin, tkin); }


/* shift the center of mass to zero */
INLINE void md_shiftcomw(real * RESTRICT x, const real * RESTRICT w, int n, int d)
{
  int i, j;
  real rc, wtot = 0;

  if (w) for (i = 0; i < n; i++) wtot += w[i];
  else wtot = (real) n;
  for (j = 0; j < d; j++) {
    for (rc = 0, i = 0; i < n; i++)
      rc += x[i*d+j]*(w ? w[i] : 1.f);
    rc /= wtot;
    for (i = 0; i < n; i++)
      x[i*d+j] -= rc;
  }
}


/* annihilate angular momentum 2d */
INLINE void md_shiftang2d(rv2_t * RESTRICT x, rv2_t * RESTRICT v, int n)
{
  int i;
  real am, r2, xc[2] = {0, 0}, xi[2];

  for (i = 0; i < n; i++) rv2_inc(xc, x[i]);
  rv2_smul(xc, 1.f/n);
  for (am = r2 = 0.f, i = 0; i < n; i++) {
    rv2_diff(xi, x[i], xc);
    am += rv2_cross(xi, v[i]);
    r2 += rv2_sqr(x[i]);
  }
  am = -am/r2;
  for (i = 0; i < n; i++) {
    rv2_diff(xi, x[i], xc);
    v[i][0] += -am*xi[1];
    v[i][1] +=  am*xi[0];
  }
}


/* annihilate angular momentum 3d
 * solve
 *   /  y^2 + z^2    -x y      -x y      \
 *   |  -x y       X^2 + z^2   -y z      |  c  =  I
 *   \  -x z         -y z     x^2 + y^2  /
 * use a velocity field
 *    v = c X r
 *   */

INLINE void md_shiftang3d(rv3_t *x, rv3_t *v, int n)
{
  int i;
  real xc[3] = {0, 0, 0}, xi[3], ang[3], am[3] = {0, 0, 0}, dv[3], mat[3][3], inv[3][3];
  real xx = 0, yy = 0, zz = 0, xy = 0, zx = 0, yz = 0;

  for (i = 0; i < n; i++) rv3_inc(xc, x[i]);
  rv3_smul(xc, 1.f/n);
  for (i = 0; i < n; i++) {
    rv3_diff(xi, x[i], xc);
    rv3_cross(ang, xi, v[i]);
    rv3_inc(am, ang);
    xx += xi[0]*xi[0];
    yy += xi[1]*xi[1];
    zz += xi[2]*xi[2];
    xy += xi[0]*xi[1];
    yz += xi[1]*xi[2];
    zx += xi[2]*xi[0];
  }
  mat[0][0] = yy+zz;
  mat[1][1] = xx+zz;
  mat[2][2] = xx+yy;
  mat[0][1] = mat[1][0] = -xy;
  mat[1][2] = mat[2][1] = -yz;
  mat[0][2] = mat[2][0] = -zx;
  rm3_inv(inv, mat);
  ang[0] = -rv3_dot(inv[0], am);
  ang[1] = -rv3_dot(inv[1], am);
  ang[2] = -rv3_dot(inv[2], am);
  /* ang is the solution of M^(-1) * I */
  for (i = 0; i < n; i++) {
    rv3_diff(xi, x[i], xc);
    rv3_cross(dv, ang, xi);
    rv3_inc(v[i], dv);
  }
}


/* return kinetic energy */
INLINE real md_ekin(const real *v, int nd, int dof, real *tkin)
{
  int i;
  real ekin;

  for (ekin = 0, i = 0; i < nd; i++) ekin += v[i]*v[i];
  if (tkin) *tkin = ekin/dof;
  return ekin *= .5f;
}


/* compute the kinetic energy for the thermostats, if ekin != NULL */
INLINE real md_getekin(real *ekin, const real *v, int nd)
{
  int i;
  real ek;

  if (ekin) {
    ek = *ekin;
  } else {
    for (ek = 0, i = 0; i < nd; i++) ek += v[i] * v[i];
    ek *= 0.5f;
  }
  return ek;
}


/* velocity scaling: for regular (no thermostat) MD during equilibration
 * `tp' is the target temperature
 * `ekt' is the observed average kinetic energy, may not be the current *ekin  */
INLINE void md_vscale(real *v, int nd, int dof, real tp, real ekt, real *ekin, real *tkin)
{
  int i;
  real ekav = .5f*tp*dof, s;

  s = (real) sqrt( ekav / ekt );
  for (i = 0; i < nd; i++)
    v[i] *= s;
  if (ekin) *ekin *= s*s;
  if (tkin) *tkin *= s*s;
}


/* velocity rescaling thermostat */
INLINE void md_vrescale(real *v, int nd, int dof, real tp, real dt, real *ekin, real *tkin)
{
  int i;
  real ekav = .5f*tp*dof, ek1, ek2, s;
  double amp;

  ek1 = md_getekin(ekin, v, nd);
  amp = 2*sqrt(ek1*ekav*dt/dof);
  ek2 = ek1 + (ekav - ek1)*dt + (real)(amp*grand0());
  if (ek2 < 0) ek2 = 0;
  s = (real) sqrt(ek2/ek1);
  for (i = 0; i < nd; i++)
    v[i] *= s;
  if (ekin) *ekin = ek2;
  if (tkin) *tkin *= s*s;
}


/* Exact velocity rescaling thermostat */
INLINE void md_vrescalex(real *v, int nd, int dof, real tp, real dt, real *ekin, real *tkin)
{
  int i;
  real ekav = .5f*tp*dof, ek1, ek2, s, c = 0, r, r2;

  if (dt < 10) c = (real) exp(-dt);
  ek1 = md_getekin(ekin, v, nd);
  r = (real) grand0();
  r2 = (real) randgausssum(dof - 1);
  ek2 = (real)( ek1 + (1 - c) * (ekav*(r2 + r*r)/dof - ek1)
    + 2 * r * sqrt(c*(1 - c) * ekav/dof*ek1) );
  if (ek2 < 0) ek2 = 0;
  s = (real) sqrt(ek2/ek1);
  for (i = 0; i < nd; i++)
    v[i] *= s;
  if (ekin) *ekin = ek2;
  if (tkin) *tkin *= s*s;
}


/* backup thermostat: velocity-rescaling according to a Monte-Carlo move */
INLINE int md_mcvrescale(real *v, int nd, int dof, real tp, real dt, real *ekin, real *tkin)
{
  int i, acc;
  real ek1, ek2, s;
  double logek1, logek2, r, r0;

  ek1 = md_getekin(ekin, v, nd);
  logek1 = log(ek1);
  logek2 = logek1 + dt*(2.f*rnd0() - 1);
  ek2 = (real) exp(logek2);
  r = (ek2 - ek1)/tp - .5*dof*(logek2 - logek1);
  if (r <= 0) {
    acc = 1;
  } else {
    r0 = rnd0();
    acc = (r0 < exp(-r));
  }
  if ( acc ) {
    s = (real) sqrt(ek2/ek1);
    for (i = 0; i < nd; i++)
      v[i] *= s;
    if (ekin) *ekin = ek2;
    if (tkin) *tkin *= s*s;
  }
  return acc;
}


INLINE int md_mcvrescale2d(rv2_t * RESTRICT v, int n, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin)
  { return md_mcvrescale((real *) v, n*2, dof, tp, dt, ekin, tkin); }
INLINE int md_mcvrescale3d(rv3_t * RESTRICT v, int n, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin)
  { return md_mcvrescale((real *) v, n*3, dof, tp, dt, ekin, tkin); }


/* Nose-Hoover thermostat */
INLINE void md_hoover(real *v, int nd, int dof, real tp, real dt,
    real *zeta, real Q, real *ekin, real *tkin)
{
  int i;
  real ek1, ek2, s, dt2 = .5f*dt;

  ek1 = md_getekin(ekin, v, nd);
  *zeta += (2.f*ek1 - dof * tp)/Q*dt2;

  s = (real) exp(-(*zeta)*dt);
  for (i = 0; i < nd; i++) v[i] *= s;
  ek2 = ek1 * (s*s);
  if (ekin) *ekin = ek2;
  if (tkin) *tkin *= s*s;

  *zeta += (2.f*ek2 - dof * tp)/Q*dt2;
}


INLINE void md_hoover2d(rv2_t *v, int n, int dof, real tp, real dt,
    real *zeta, real Q, real *ekin, real *tkin)
  { md_hoover((real *) v, n*2, dof, tp, dt, zeta, Q, ekin, tkin); }

INLINE void md_hoover3d(rv3_t *v, int n, int dof, real tp, real dt,
    real *zeta, real Q, real *ekin, real *tkin)
  { md_hoover((real *) v, n*3, dof, tp, dt, zeta, Q, ekin, tkin); }


/* Nose-Hoover chain thermostat */
INLINE void md_nhchain(real *v, int nd, int dof, real tp, real scl, real dt,
    real *zeta, const real *Q, int M, real *ekin, real *tkin)
{
  int i, j;
  real ek1, ek2, s, dt2 = .5f*dt, dt4 = .25f*dt, G, xp = 1.f;

  ek1 = md_getekin(ekin, v, nd);

  /* propagate the chain */
  for (j = M-1; j > 0; j--) {
    if (j < M-1) {
      xp = (real) exp(-dt4 * zeta[j+1]);
      zeta[j] *= xp;
    }
    G = (Q[j-1]*zeta[j-1]*zeta[j-1] - tp)/Q[j];
    zeta[j] += G * dt2;
    if (j < M-1)
      zeta[j] *= xp;
  }

  /* the first thermostat variable */
  if (M >= 2) {
    xp = (real) exp(-dt4 * zeta[1]);
    zeta[0] *= xp;
  }
  G = (scl * 2.f * ek1 - dof * tp)/Q[0];
  zeta[0] += G * dt2;
  if (M >= 2)
    zeta[0] *= xp;

  /* scale the velocities */
  s = (real) exp(-(*zeta)*dt);
  for (i = 0; i < nd; i++) v[i] *= s;
  ek2 = ek1 * (s*s);
  if (ekin) *ekin = ek2;
  if (tkin) *tkin *= s*s;

  /* the first thermostat variable */
  if (M >= 2) {
    xp = (real) exp(-dt4*zeta[1]);
    zeta[0] *= xp;
  }
  G = (scl * 2.f * ek1 - dof * tp)/Q[0];
  zeta[0] += G * dt2;
  if (M >= 2)
    zeta[0] *= xp;

  /* propagate the chain */
  for (j = M-1; j > 0; j--) {
    if (j < M-1) {
      xp = (real) exp(-dt4*zeta[j+1]);
      zeta[j] *= xp;
    }
    G = (Q[j-1]*zeta[j-1]*zeta[j-1] - tp)/Q[j];
    zeta[j] += G * dt2;
    if (j < M-1)
      zeta[j] *= xp;
  }
}


INLINE void md_nhchain2d(rv3_t *v, int n, int dof, real tp, real scl, real dt,
    real *zeta, const real *Q, int M, real *ekin, real *tkin)
  { md_nhchain((real *) v, n*2, dof, tp, scl, dt, zeta, Q, M, ekin, tkin); }

INLINE void md_nhchain3d(rv3_t *v, int n, int dof, real tp, real scl, real dt,
    real *zeta, const real *Q, int M, real *ekin, real *tkin)
  { md_nhchain((real *) v, n*3, dof, tp, scl, dt, zeta, Q, M, ekin, tkin); }


/* velocity-scaling Langevin thermostat */
INLINE void md_vslang(real *v, int nd, int dof, real tp, real dt,
    real *zeta, real zeta2, real Q, real *ekin, real *tkin)
{
  int i;
  real ek1, ek2, s, dt2 = .5f*dt, xp, amp;

  ek1 = md_getekin(ekin, v, nd);
  xp = (real) exp(-zeta2*.25*dt);
  amp = (real) sqrt(2*zeta2/Q*dt2);
  *zeta *= xp;
  *zeta += (2.f*ek1 - dof * tp)/Q*dt2;
  *zeta += amp * (real) grand0(); /* white noise */
  *zeta *= xp;

  s = (real) exp(-(*zeta)*dt);
  for (i = 0; i < nd; i++) v[i] *= s;
  ek2 = ek1 * (s*s);
  if (ekin) *ekin = ek2;
  if (tkin) *tkin *= s*s;

  *zeta *= xp;
  *zeta += (2.f*ek2 - dof * tp)/Q*dt2;
  *zeta += amp * (real) grand0(); /* white noise */
  *zeta *= xp;
}


INLINE void md_vslang2d(rv2_t *v, int n, int dof, real tp, real dt,
    real *zeta, real zeta2, real Q, real *ekin, real *tkin)
  { md_vslang((real *) v, n*2, dof, tp, dt, zeta, zeta2, Q, ekin, tkin); }

INLINE void md_vslang3d(rv3_t *v, int n, int dof, real tp, real dt,
    real *zeta, real zeta2, real Q, real *ekin, real *tkin)
  { md_vslang((real *) v, n*3, dof, tp, dt, zeta, zeta2, Q, ekin, tkin); }


/* Anderson thermostat */
INLINE void md_andersen(real *v, int n, int d, real tp)
{
  int i, j;

  tp = (real) sqrt(tp);
  i = (int)(rnd0() * n);
  for (j = 0; j < d; j++)
    v[i*d + j] = tp * (real) grand0();
}


/* Langevin thermostat */
INLINE void md_langevin(real *v, int n, int d, real tp, real dt)
{
  int i;
  real c = (real) exp(-dt), amp;

  amp = (real) sqrt((1 - c*c) * tp);
  for (i = 0; i < n*d; i++)
    v[i] = c*v[i] + amp * (real) grand0();
}


/* Nose-Hoover thermostat/barostat
 * set cutoff to half of the box */
INLINE void md_hoovertp(real *v, int n, int d, int dof, real dt,
    real tp, real pext, real *zeta, real *eta, real Q, real W,
    real vol, real vir, real ptail, int ensx,
    real *ekin, real *tkin)
{
  int i;
  real xp, pint, s, dt2 = dt*.5f, dt4 = dt*.25f;

  /* thermostat */
  *zeta += (2.f * (*ekin) + W * (*eta) * (*eta) - (dof + 1) * tp) * dt2/Q;
  xp = (real) exp(-(*zeta)*dt4); /* zeta won't change until the end */

  /* barostat */
  *eta *= xp;
  pint = (vir + 2.f * (*ekin))/ (d * vol) + ptail;
  *eta += ((pint - pext)*vol + (1 - ensx) * tp)*d*dt2/W;
  *eta *= xp;

  /* scaling velocity */
  s = (real) exp( -dt * (*zeta + *eta) );
  for (i = 0; i < d * n; i++) v[i] *= s;
  *ekin *= s*s;
  *tkin *= s*s;

  /* barostat */
  *eta *= xp;
  pint = (vir + 2.f * (*ekin))/ (d * vol) + ptail;
  *eta += ((pint - pext)*vol + (1 - ensx) * tp)*d*dt2/W;
  *eta *= xp;

  /* thermostat */
  *zeta += (2.f * (*ekin) + W * (*eta) * (*eta) - (dof + 1) * tp) * dt2/Q;
}


/* Nose-Hoover chain thermostat/barostat
 * set cutoff to half of the box */
INLINE void md_nhchaintp(real *v, int n, int d, int dof, real dt,
    real tp, real pext, real *zeta, real *eta, const real *Q, int M, real W,
    real vol, real vir, real ptail, int ensx,
    real *ekin, real *tkin)
{
  int i, j;
  real xpz, pint, s, dt2 = dt*.5f, dt4 = dt*.25f, G, xp;

  /* 1. thermostat */
  /* 1.A propagate the chain */
  for (j = M-1; j > 0; j--) {
    if (j < M-1) {
      xp = (real) exp(-dt4*zeta[j+1]);
      zeta[j] *= xp;
    }
    G = (Q[j-1]*zeta[j-1]*zeta[j-1] - tp)/Q[j];
    zeta[j] += G * dt2;
    if (j < M-1)
      zeta[j] *= xp;
  }

  /* 1.B the first thermostat variable */
  if (M >= 2) {
    xp = (real) exp(-dt4*zeta[1]);
    zeta[0] *= xp;
  }
  G = (2.f * (*ekin) + W * (*eta) * (*eta) - (dof + 1) * tp) / Q[0];
  zeta[0] += G * dt2;
  if (M >= 2)
    zeta[0] *= xp;
  xpz = (real) exp(-zeta[0]*dt4); /* zeta won't change until the end */

  /* 2. barostat */
  *eta *= xpz;
  pint = (vir + 2.f * (*ekin))/ (d * vol) + ptail;
  *eta += ((pint - pext)*vol + (1 - ensx) * tp)*d*dt2/W;
  *eta *= xpz;

  /* 3. scaling velocity */
  s = (real) exp( -dt * (zeta[0] + *eta) );
  for (i = 0; i < d * n; i++) v[i] *= s;
  *ekin *= s*s;
  *tkin *= s*s;

  /* 4. barostat */
  *eta *= xpz;
  pint = (vir + 2.f * (*ekin))/ (d * vol) + ptail;
  *eta += ((pint - pext)*vol + (1 - ensx) * tp)*d*dt2/W;
  *eta *= xpz;

  /* 5. thermostat */
  /* 5.A the first thermostat variable */
  if (M >= 2) {
    xp = (real) exp(-dt4*zeta[1]);
    zeta[0] *= xp;
  }
  G = (2.f * (*ekin) + W * (*eta) * (*eta) - (dof + 1) * tp) / Q[0];
  zeta[0] += G * dt2;
  if (M >= 2)
    zeta[0] *= xp;

  /* 5.B propagate the chain */
  for (j = M-1; j > 0; j--) {
    if (j < M-1) {
      xp = (real) exp(-dt4*zeta[j+1]);
      zeta[j] *= xp;
    }
    G = (Q[j-1]*zeta[j-1]*zeta[j-1] - tp)/Q[j];
    zeta[j] += G * dt2;
    if (j < M-1)
      zeta[j] *= xp;
  }
}


/* Langevin barostat
 *   d eta / dt = -zeta * eta
 *      + [ (Pint - Pext) * V + (1 - ensx) T ] * d / W
 *      + sqrt( 2 * zeta * T / W ) xi
 * the ideal-gas part of the pressure, Pint, is computed as \sum p^2/m / V
 * the additional volume distribution weight is V^(-ensx)
 * the scaling is r = r*s, p = p/s;
 * set the cutoff rc to half of the box */
INLINE void md_langtp(real *v, int n, int d, real dt,
    real tp, real pext, real zeta, real *eta, real W,
    real vol, real vir, real ptail, int ensx,
    real *ekin, real *tkin)
{
  int i;
  real xp, pint, s, dt2 = dt*.5f, dt4 = dt*.25f, amp;

  xp = (real) exp(-zeta*dt4);
  amp = (real) sqrt(2.f*zeta*tp/W*dt2);

  /* barostat: first half to update *eta */
  *eta *= xp;
  pint = (vir + 2.f * (*ekin))/ (d * vol) + ptail;
  *eta += ((pint - pext)*vol + (1 - ensx) * tp)*d*dt2/W;
  *eta += amp * (real) grand0(); /* random noise */
  *eta *= xp;

  /* scaling velocity */
  s = (real) exp( -dt * (*eta) );
  for (i = 0; i < d * n; i++) v[i] *= s;
  *ekin *= s*s;
  *tkin *= s*s;

  /* barostat: second half to update *eta */
  *eta *= xp;
  pint = (vir + 2.f * (*ekin))/ (d * vol) + ptail;
  *eta += ((pint - pext)*vol + (1 - ensx) * tp)*d*dt2/W;
  *eta += amp * (real) grand0(); /* random noise */
  *eta *= xp;
}


/* position Langevin barostat
 * limiting case, zeta -> inf., of the full Langevin barostat
 * barodt = dt/(W*zeta), and d * d(eta) = d lnv
 * the ideal-gas part of the pressure is computed as \sum p^2/m / V
 * the scaling is r = r*s, p = p/s;
 * set cutoff to half of the box */
INLINE void md_langtp0(real *v, int n, int d, real barodt,
    real tp, real pext, real *vol, real vir, real ptail, int ensx,
    real *ekin, real *tkin)
{
  int i;
  real pint, amp, vn, s, dlnv;

  /* compute the internal pressure */
  /* note only with half-box cutoff, the formula is accurate */
  pint = (vir + 2.f * (*ekin))/ (d * (*vol)) + ptail;

  amp = (real) sqrt(2.f * barodt);
  dlnv = ((pint - pext) * (*vol)/tp + 1 - ensx)*barodt
       + amp * (real) grand0();
  vn = *vol * (real) exp( dlnv );

  s = (real) exp( dlnv/d );
  for (i = 0; i < d * n; i++) v[i] *= s;
  *ekin *= s*s;
  *tkin *= s*s;

  *vol = vn;
}


/* position Langevin barostat, with coordinates only
 * NOTE: the first parameter is the degree of freedom
 * the scaling is r = r*s
 * set cutoff to half of the box */
INLINE void md_langp0(int dof, int d, real barodt,
    real tp, real pext, real *vol, real vir, real ptail, int ensx)
{
  real pintv, amp, dlnv;

  pintv = (vir + dof * tp)/d + ptail * (*vol);
  amp = (real) sqrt(2 * barodt);
  dlnv = ((pintv - pext * (*vol))/tp + 1 - ensx)*barodt
       + amp * (real) grand0();
  *vol *= (real) exp( dlnv );
}


/* sinc(x) = (e^x - e^(-x))/(2 x) */
INLINE double md_mysinc(double x)
{
  double x2, y;

  if (fabs(x) < 1e-2) { /* series expansion */
    x2 = x * x;
    return 1 + (1 + (1 + x2/42.0)*x2/20.0)*x2/6.0;
  }
  y = exp(x);
  return .5 * (y - 1./y)/x;
}


/* Nose-Hoover position update */
INLINE void md_hoovertpdr(real *r, const real *v, int nd,
    real *xp, real l, real eta, real dt)
{
  int i;
  real dtxp, xph, etadt2;

  /* r' = r*exp(eta*dt) + v*(exp(eta*dt) - 1)/eta
   * now exp(eta*dt) is the volume scaling factor
   * so for the reduced coordinates R = r*exp(-eta*dt)
   * R' = R + v*(1 - exp(-eta*dt))/eta; */
  etadt2 = eta * dt * .5f;
  xph = (real) exp(etadt2);
  *xp = xph * xph;
  dtxp = 1.f/xph * dt * (real) md_mysinc(etadt2) / l;
/*
  dtxp = (1 - 1/(*xp))/eta/l;
*/
  for (i = 0; i < nd; i++)
    r[i] += v[i] * dtxp;
}


#define md_mutv2d(v, n, tp, r) md_mutv((real *)(rv2_t *)v, n * 2, tp, r)
#define md_mutv3d(v, n, tp, r) md_mutv((real *)(rv3_t *)v, n * 3, tp, r)

/* mutate velocities with the probability of r */
INLINE int md_mutv(real *v, int nd, real tp, double r)
{
  int i;
  real vamp = (real) sqrt(tp);

  for (i = 0; i < nd; i++)
    if (rnd0() < r)
      v[i] = vamp * (real) grand0();
  return 0;
}


/* multiply velocities by a random unitary matrix */
INLINE int md_unimatv3d(rv3_t *v, int n)
{
  int i;
  real mat[3][3], v1[3];

  rm3_rnduni(mat);
  for (i = 0; i < n; i++) {
    rm3_mulvec(v1, mat, v[i]);
    rv3_copy(v[i], v1);
  }
  return 0;
}


/* write a position file */
INLINE int md_writepos(FILE *fp, const real *x, const real *v, int n,
    int d, real scl)
{
  int i, j;

  for (i = 0; i < n; i++) {
    for (j = 0; j < d; j++) fprintf(fp, "%16.14f ", x[i*d + j] * scl);
    if (v)
      for (j = 0; j < d; j++) fprintf(fp, "%16.14f ", v[i*d + j]);
    fprintf(fp, "\n");
  }
  return 0;
}

INLINE int md_writepos2d(FILE *fp, rv2_t *x, rv2_t *v, int n, real scl)
{
  return md_writepos(fp, (const real *) x, (const real *) v, n, 2, scl);
}

INLINE int md_writepos3d(FILE *fp, rv3_t *x, rv3_t *v, int n, real scl)
{
  return md_writepos(fp, (const real *) x, (const real *) v, n, 3, scl);
}


/* read position */
INLINE int md_readpos(FILE *fp, real *x, real *v, int n, int d, real scl)
{
  const char *fmt;
  char s[256], *p;
  int i, j, next;
  real *x0, *v0, xtmp;

  fmt = (sizeof(double) == sizeof(real)) ? "%lf%n" : "%f%n";
  /* back up the current coordinates and velocities, in case of failure */
  xnew(x0, n * d);
  memcpy(x0, x, sizeof(real) * n * d);
  if (v) {
    xnew(v0, n * d);
    memcpy(v0, v, sizeof(real) * n * d);
  }
  for (i = 0; i < n; i++) {
    if (fgets(s, sizeof s, fp) == NULL) goto ERR;
    for (p = s, j = 0; j < d; j++, p += next) {
      if (1 != sscanf(p, fmt, &xtmp, &next)) {
        fprintf(stderr, "cannot read i = %d, j = %d\n", i, j);
        goto ERR;
      }
      x[i*d + j] = xtmp / scl;
    }
    if (v != NULL) {
      for (j = 0; j < d; j++, p += next)
        if (1 != sscanf(p, fmt, v + i*d + j, &next)) {
          fprintf(stderr, "cannot read i = %d, j = %d\n", i, j);
          goto ERR;
        }
    }
  }
  free(x0);
  if (v) free(v0);
  return 0;
ERR:
  fprintf(stderr, "pos file broken on line %d!\n%s\n", i, s);
  /* recover the original coordinates and velocities */
  memcpy(x, x0, n * d * sizeof(real));
  free(x0);
  if (v) {
    memcpy(v, v0, n * d * sizeof(real));
    free(v0);
  }
  return -1;
}

INLINE int md_readpos2d(FILE *fp, rv2_t *x, rv2_t *v, int n, real scl)
{
  return md_readpos(fp, (real *) x, (real *) v, n, 2, scl);
}

INLINE int md_readpos3d(FILE *fp, rv3_t *x, rv3_t *v, int n, real scl)
{
  return md_readpos(fp, (real *) x, (real *) v, n, 3, scl);
}


#endif /* ZCOM_MD__ */
#endif /* ZCOM_MD */


#ifdef  ZCOM_ISING2
#ifndef ZCOM_ISING2__
#define ZCOM_ISING2__
/* two-dimensional Ising model */


typedef struct {
  int d, l, n;
  int M, E;
  int *s; /* 0 or 1 */
  double *logdos; /* logarithmic density of states */
  /* helper vars */
  uint32_t *uproba; /* temporary probability for MC transitions */
} ising_t;


/* set transition probability */
#define IS2_SETPROBA(is, bet) { \
  double x_ = exp(-4. * bet); \
  is->uproba[2] = (uint32_t) ((double)(0xffffffff) * x_); \
  is->uproba[4] = (uint32_t) ((double)(0xffffffff) * x_*x_); }


/* compute total energy and magnetization */
INLINE int is2_em(ising_t *is)
{
  int l, i, j, s, u, e, m, *p, *pu;

  e = m = 0;
  p = is->s;
  l = is->l;
  for (i = 0; i < l; ) {
    pu = (++i == l) ? is->s : p+l;
    for (j = 0; j < l; ) {
      m += (s = *p++);
      u = *pu++;
      e += s*(u + ((++j == l) ? *(p-l) : *p));
    }
  }
  is->M = m;
  return is->E = -e;
}


INLINE int is2_check(ising_t *is)
{
  int i, e, m;

  for (i = 0; i < is->n; i++)
    if (is->s[i] != 1 && is->s[i] != -1) {
      fprintf(stderr, "error: s[%d] = %d\n", i, is->s[i]);
      return -1;
    }
  e = is->E;
  m = is->M;
  is2_em(is);
  if  (e != is->E || e < -2*is->n || e > 2*is->n
    || m != is->M || m < -is->n   || m > is->n) {
    fprintf(stderr, "error: E = %d, %d; M = %d, %d\n",
        e, is->E, m, is->M);
    return -1;
  }
  return 0;
}


/* pick a random site, count neighbors with different spins */
INLINE int is2_pick(const ising_t *is, int *h)
{
  int id, ix, iy, l, lm, n, nm, *p;

  lm = (l = is->l) - 1;
  nm = (n = is->n) - l;
  id = (int)(rnd0() * n);
  iy = id / l, ix = id % l;
  p = is->s + id;
  *h = *p * ( ((ix != 0 ) ? *(p-1) : *(p+lm))   /* left  */
            + ((ix != lm) ? *(p+1) : *(p-lm))   /* right */
            + ((iy != 0 ) ? *(p-l) : *(p+nm))   /* down  */
            + ((iy != lm) ? *(p+l) : *(p-nm))); /* up    */
  return id;
}


/* flip site id, with h different neighbors */
INLINE int is2_flip(ising_t *is, int id, int h)
{
  die_if(id >= is->n, "bad id %d/%d\n", id, is->n);
  is->M += (is->s[id] = -is->s[id])*2;
  return is->E += h*2;
}


/* faster macros for systems with fixed (upon compiling) size
 * to use them one must define IS2_LB before including
 * IS2_PICK()/IS2_PSEQ() and IS2_FLIP() */
#ifdef  IS2_LB  /* L = 2^LB, N = L*L */
#define IS2_L   (1 << IS2_LB)
#define IS2_N   (IS2_L * IS2_L)

#define IS2_GETH(is, id, h) { \
  unsigned ix, iy; \
  iy = id / IS2_L, ix = id % IS2_L; \
  h = is->s[id] * ( is->s[iy*IS2_L + (ix+1)%IS2_L] \
                  + is->s[iy*IS2_L + (ix+IS2_L-1)%IS2_L] \
                  + is->s[(iy+1)%IS2_L*IS2_L + ix] \
                  + is->s[(iy-1+IS2_L)%IS2_L*IS2_L + ix] ); }
#define IS2_IRND(is, id)  id = rand32() >> (32 - 2*IS2_LB);
/* random picking */
#define IS2_PICK(is, id, h) { IS2_IRND(is, id); IS2_GETH(is, id, h); }
#define IS2_ISEQ(is, id)  id = (id + 1) % IS2_N;
/* sequential picking */
#define IS2_PSEQ(is, id, h) { IS2_ISEQ(is, id); IS2_GETH(is, id, h); }

#define IS2_FLIP(is, id, h) { \
  is->M += (is->s[id] = -is->s[id]) * 2; \
  is->E += h * 2; }

#else

#define IS2_PICK(is, id, h)  id = is2_pick(is, &h)
#define IS2_FLIP(is, id, h)  is2_flip(is, id, h)

#endif


INLINE int is2_load(ising_t *is, const char *fname)
{
  FILE *fp;
  int i, lx, ly, n, c;
  char s[80];

  xfopen(fp, fname, "r", return -1);
  if (fgets(s, sizeof s, fp) == NULL) {
    fprintf(stderr, "missing first line %s\n", fname);
    return -1;
  }
  if (4 != sscanf(s, "%d%d%d%d", &i, &lx, &ly, &n)
      || i != 2 || lx != ly || lx != is->l || n != is->n) {
    fprintf(stderr, "bad setting: %dD, %dx%d = %d\n", i, lx, ly, n);
    return -1;
  }
  for (i = 0; i < n; i++) {
    while ((c = fgetc(fp)) != EOF && c == '\n') ;
    if (c == EOF) break;
    is->s[i] = (c == ' ') ? -1 : 1;
  }
  if (i < n)
    fprintf(stderr, "%s: data stopped at i = %d\n", fname, i);
  fclose(fp);
  is2_em(is);
  return 0;
}


INLINE int is2_save(const ising_t *is, const char *fname)
{
  FILE *fp;
  int i, j, l, *p;

  xfopen(fp, fname, "w", return -1);
  l = is->l;
  fprintf(fp, "%d %d %d %d\n", is->d, l, l, is->n);
  for (p = is->s, i = 0; i < l; i++) {
    for (j = 0; j < l; j++, p++)
      fprintf(fp, "%c", (*p > 0) ? '#' : ' ');
    fprintf(fp, "\n");
  }
  fclose(fp);
  return 0;
}


/* initialize an lxl Ising model */
INLINE ising_t *is2_open(int l)
{
  int i, n;
  ising_t *is;

  xnew(is, 1);
  is->d = 2;
  is->l = l;
  is->n = n = l*l;
  xnew(is->s, n);
  for (i = 0; i < n; i++) is->s[i] = -1;
  is->M = -n;
  is->E = -2*n;
  xnew(is->logdos, n + 1);
  xnew(is->uproba, 2*is->d+1);
  is->uproba[0] = 0xffffffff;
  return is;
}


INLINE void is2_close(ising_t *is)
{
  if (is != NULL) {
    free(is->s);
    free(is->logdos);
    free(is->uproba);
    free(is);
  }
}


/* exact solution of ising model */
INLINE double is2_exact(ising_t *is, double beta, double *eav, double *cv)
{
  double lxh, n, ex, f, th, sech, bet2, bsqr, log2, x;
  double lnz, lnz1, lnz2, lnz3, lnz4, dz, ddz;
  double z21, z31, z41, za1;
  double dr1, dr2, dr3, dr4, ddr1, ddr2, ddr3, ddr4;
  double g, g0, dg, ddg, dg0;
  double xn2b, sh2b, coth2b;
  double lnch2b, lncc2b, lncl, lnsl, cd, cdsqr, lnddcl;
  int r, sgn4 = 1, lx, ly;

  lx = is->l;
  ly = is->l;
  lxh = .5*lx;
  n = lx * ly;
  log2 = log(2.0);
  bet2 = 2.*beta;
  bsqr = beta*beta;
  xn2b = exp(-bet2);
  if (lx == 2 && ly == 2) { /* 2x2 system */
    double lnc, lnd;
    x = 8.*beta;
    lnc = lnadd(x, -x);
    lnd = lnaddn(lnc, 6.);
    lnz = lnd + log2;
    if (eav) *eav = -8.*exp(lndif(x, -x) - lnd); /* -8*sinh(8*b)/(3+cosh(8*h)) */
    if (cv) *cv = bsqr * 384. * exp(lnaddn(lnc, 2./3) - 2.0*lnd); /* 64*(1+3cosh(8*b))/(3+cosh(8*b))^2 */
    return lnz;
  } else if (fabs(beta) < 1e-6) { /* high T approx. normal branch unstable if beta < 1e-6 */
    lnz = n * (2.*lnadd(beta, -beta) - log2);
    x = 1. + xn2b;
    if (eav) *eav = -2. * n * (1. - xn2b)/x;
    if (cv) *cv = bsqr * 8.*n*xn2b/(x*x);
    return lnz; /* +n*tanh(beta)^4 */
  }

  lnz1 = lnz2 = lnz3 = lnz4 = 0;
  dr1 = dr2 = dr3 = dr4 = 0;
  ddr1 = ddr2 = ddr3 = ddr4 = 0;
  lnch2b = lnadd(bet2, -bet2) - log2;
  coth2b = 2./(1. - xn2b*xn2b) - 1.;
  lncc2b = lnch2b + log(coth2b); /* ln[ cosh(2b) * coth(2b) ] */
  g0 = bet2 + log(2./(1. + xn2b) - 1.);
  sgn4 = (g0 >= 0) ? 1 : -1;

  sh2b = 0.5*(1./xn2b - xn2b);
  dg0 = 2. + 2./sh2b;
  x = sh2b*sh2b;
  cd = 2. - 2./x; /* cl' = cd * cosh(2b) */
  cdsqr = cd*cd;
  lnddcl = lnaddn(lncc2b, 2.0/(x * sh2b)) + 2.*log2; /* log(cl'') */

  for (r = 0; r < ly; r++) { /* for odd number */
    lncl = lnaddn(lncc2b, -cos((2.*r + 1.)*M_PI/ly));
    lnsl = lncl + 0.5*log(1. - exp(-2.*lncl));
    g = lnadd(lncl, lnsl);
    f = lxh*g;
    lnz1 += lnadd(f, -f);
    lnz2 += lndif(f, -f);

    dg = exp(lnch2b - lnsl)*cd; /* g' = cl'/sl; */
    ex = exp(-f);
    th = 2./(1. + ex*ex) - 1.;
    x = lxh*dg;
    dr1 += x*th;
    dr2 += x/th;

    /* g''=cl''/sl - cl' ^2 *cl/sl^3; */
    ddg = exp(lnddcl - lnsl);
    ddg -= exp(lnch2b*2. + lncl - 3.*lnsl)*cdsqr;
    sech = 2.0*dg/(ex + 1.0/ex); /* g' * sech(0.5*lx*g) */
    ddr1 += lxh*(ddg*th + lxh*(sech*sech));
    sech = 2.0*dg/(ex - 1.0/ex); /* g' * sech(0.5*lx*g) */
    ddr2 += lxh*(ddg/th - lxh*(sech*sech));

    if (r == 0) {
      g = g0;
    } else {
      lncl = lnaddn(lncc2b, -cos(2.0*M_PI*r/ly));
      lnsl = lncl+0.5*log(1-exp(-2*lncl));
      g = lnadd(lncl, lnsl);
      die_if (g < 0.0, "g = %g < 0.\n", g);;
    }
    f = lxh*g;
    lnz3 += lnadd(f, -f); /* log [2 cosh(f)] */
    lnz4 += (f < 0) ? lndif(-f, f) : lndif(f, -f); /* avoid neg. g0 */

    ex = exp(-f);
    th = 2./(1. + ex*ex) - 1.;
    dg = (r == 0) ? dg0 : exp(lnch2b - lnsl)*cd;
    dr3 += lxh*dg*th;
    dr4 += lxh*dg/th;

    if (r == 0) {
      ddg = -4*coth2b*coth2b*exp(-lnch2b);
    } else {
      ddg = exp(lnddcl - lnsl);
      ddg -= exp(lnch2b*2. + lncl - 3.*lnsl)*cdsqr;
    }
    sech = 2.0*dg/(ex + 1.0/ex);
    ddr3 += lxh*(ddg*th + lxh*(sech*sech));
    sech = 2.0*dg/(ex - 1.0/ex);
    ddr4 += lxh*(ddg/th - lxh*(sech*sech));
  }

  z21 = exp(lnz2 - lnz1);
  z31 = exp(lnz3 - lnz1);
  z41 = sgn4*exp(lnz4 - lnz1);
  za1 = 1.0 + z21 + z31 + z41;
  lnz = lnz1 + log(za1);
  lnz += .5*n*log(2.*sh2b) - log2;
  dz = (dr1 + z21*dr2 + z31*dr3 + z41*dr4)/za1;
  if (eav) *eav = - n*coth2b - dz;
  ddr1 += dr1*dr1;
  ddr2 += dr2*dr2;
  ddr3 += dr3*dr3;
  ddr4 += dr4*dr4;
  ddz = (ddr1 + z21*ddr2 + z31*ddr3 + z41*ddr4)/za1;
  if (cv) *cv = bsqr * (-2.*n/(sh2b*sh2b) + ddz - dz*dz);
  return lnz;
}


#define is2_loadlogdos(is, fn) \
  is2loadlogdos(is->logdos, is->l, is->l, fn)

/* load the exact logarithmic density of states from file
 * both n and m should be is->l
 * Minimal Mathematica script to generate exact DOS files

NDOS[m_, n_] := Module[{x, xp, prec = Floor[1.5 n m Log[2]/Log[10]], b, a, c2, s2, c0, s0, cn, sn},
  b = 2 x (1 - x^2);
  a[k_] := (1 + x^2)^2 - b Cos[Pi k/n];
  c0 = (1 - x)^m + x^m (1 + x)^m;
  s0 = (1 - x)^m - x^m (1 + x)^m;
  cn = (1 + x)^m + x^m (1 - x)^m;
  sn = (1 + x)^m - x^m (1 - x)^m;
  c2[k_] := (Sum[ m!/(2 j)!/(m - 2 j)! (a[k]^2 - b^2)^j a[k]^(m - 2 j), {j, 0, IntegerPart[m/2]}] + b^m)/2^(m - 1);
  s2[k_] := (Sum[ m!/(2 j)!/(m - 2 j)! (a[k]^2 - b^2)^j a[k]^(m - 2 j), {j, 0, IntegerPart[m/2]}] - b^m)/2^(m - 1);
  xp = Expand[ N[ (1/2) If[Mod[n, 2] == 0,
    Product[c2[2 k + 1], {k, 0, n/2 - 1}]
  + Product[s2[2 k + 1], {k, 0, n/2 - 1}]
  + c0 cn Product[c2[2 k], {k, 1, n/2 - 1}]
  + s0 sn Product[s2[2 k], {k, 1, n/2 - 1}],
    cn Product[c2[2 k + 1], {k, 0, (n - 3)/2}]
  + sn Product[s2[2 k + 1], {k, 0, (n - 3)/2}]
  + c0 Product[c2[2 k], {k, 1, (n - 1)/2}]
  + s0 Product[s2[2 k], {k, 1, (n - 1)/2}]], prec]];
  Take[Round[Chop[CoefficientList[xp, x]]], {1, -1, 2}]];

savels[fn_, ls_] := Module[{fp = OpenWrite[fn], i},
  For[i = 1, i <= Length[ls], i++, Write[fp, ls[[i]]]]; Close[fp]];

easydos[n_, m_] := Module[{dos = NDOS[n, m], logdos = Table[0, {n m + 1}], i},
  savels["IsingDOS" <> ToString[n] <> "x" <> ToString[m] <> ".txt", dos];
  For[i = 1, i <= n m + 1, i++,
    logdos[[i]] = If[dos[[i]] == 0, -10000, N[Log[dos[[i]]], 17]]];
  savels["islogdos" <> ToString[n] <> "x" <> ToString[m] <> ".txt", logdos]];
  */
INLINE int is2loadlogdos(double *logdos, int n, int m, const char *fn)
{
  char s[1024];
  FILE *fp;
  int i, err = 0;

  if (fn == NULL) { /* use standard file name */
    sprintf(s, "islogdos%dx%d.txt", n, m);
    fn = s;
  }
  xfopen(fp, fn, "r", return -1);

  for (i = 0; i <= n*m;  i++) {
    if (fgets(s, sizeof s, fp) == NULL) {
      printf("file %s ended at line %d/%d\n", fn, i, n*m);
      err = 1;
      break;
    }
    if (1 != sscanf(s, "%lf", &logdos[i])) {
      printf("sscanf failed in reading %s, line %d/%d\n", fn, i, n*m);
      err = 2;
      break;
    }
  }
  fclose(fp);
  return err;
}

#endif /* ZCOM_ISING2__ */
#endif /* ZCOM_ISING2 */


#ifdef  ZCOM_LJ
#ifndef ZCOM_LJ__
#define ZCOM_LJ__

typedef struct {
  int i, j, in;
  real phi, psi, xi, dx[3], dr2;
} ljpair_t;


#define LJ_SWALLPAIRS 0x100 /* flag of usesw, save all (including out-of-range) pairs */

typedef struct {
  int d; /* dimension = 3 */
  int n; /* number of particles */
  int dof; /* degrees of freedom */

  real rho;
  real l, vol; /* side length and volume */
  real rc2, rc, rcdef; /* real / preferred rc */

  real * RESTRICT x; /* reduced unit (0, 1) */
  real * RESTRICT v, * RESTRICT f;
  real epot0, epot, epots; /* potential energy: pure, with tail correction and shifted potential energy */
  int iepot;  /* integer energy for square-well potential */
  real ekin, tkin, etot;
  real vir; /* virial */
  real epot_shift, epot_tail, p_tail;

  int usesw; /* switched potential */
  real rs, a4, a5, a6, a7; /* parameters */
  ljpair_t *pr;
  int npr;
  real lap, f2, *gdg, *xdg;

  int usesq; /* square well potential */
  int esqinf;
  real ra, ra2, rb, rb2; /* -1 for (ra, rb) */
  real rmin; /* minimal pair distance */

  unsigned isclone; /* is a clone copy, don't free pointers */
} lj_t;

INLINE real lj_energy(lj_t *lj);
INLINE real lj_force(lj_t *lj);

#define lj_shiftcom(lj, v)    md_shiftcom(v, lj->n, lj->d)
#define lj_shiftang(lj, x, v) md_shiftang(x, v, lj->n, lj->d)


/* initialize a fcc lattice */
INLINE void lj_initfcc2d(lj_t *lj)
{
  int i, j, id, n1, n = lj->n;
  real a;

  n1 = (int) (pow(2*n, 1.0/lj->d) + .999999); /* # of particles per side */
  a = 1.f/n1;
  for (id = 0, i = 0; i < n1 && id < n; i++)
    for (j = 0; j < n1 && id < n; j++) {
      if ((i+j) % 2 != 0) continue;
      lj->x[id*2 + 0] = (i + .5f) * a;
      lj->x[id*2 + 1] = (j + .5f) * a;
      id++;
    }
}


/* initialize a fcc lattice */
INLINE void lj_initfcc3d(lj_t *lj)
{
  int i, j, k, id, n1, n = lj->n;
  real a;

  n1 = (int) (pow(2*n, 1.0/lj->d) + .999999); /* # of particles per side */
  a = 1.f/n1;
  for (id = 0, i = 0; i < n1 && id < n; i++)
    for (j = 0; j < n1 && id < n; j++)
      for (k = 0; k < n1 && id < n; k++) {
        if ((i+j+k) % 2 != 0) continue;
        lj->x[id*3 + 0] = (i + .5f) * a;
        lj->x[id*3 + 1] = (j + .5f) * a;
        lj->x[id*3 + 2] = (k + .5f) * a;
        id++;
      }
}


/* set density and compute tail corrections */
INLINE void lj_setrho(lj_t *lj, real rho)
{
  double irc, irc3, irc6;
  int i;

  lj->rho = rho;
  lj->l = (real) pow(1.*lj->n/rho, 1./lj->d);
  for (lj->vol = 1.f, i = 0; i < lj->d; i++) lj->vol *= lj->l;
  if ((lj->rc = lj->rcdef) > lj->l * .5f) lj->rc = lj->l * .5f;
  lj->rc2 = lj->rc * lj->rc;
  irc = 1.0/lj->rc;
  irc3 = irc * irc * irc;
  irc6 = irc3 * irc3;
  if (lj->usesw) { /* assume u(L/2) = 0 */
    lj->epot_shift = 0.f;
    lj->epot_tail = 0.f;
    lj->p_tail = 0.f;
  } else {
    lj->epot_shift = (real)( 4*irc6*(irc6 - 1) );
    if (lj->d == 3) {
      lj->epot_tail = (real)( 8*M_PI*rho*lj->n/9*(irc6 - 3)*irc3 );
      lj->p_tail = (real)( 32*M_PI*rho*rho/9*(irc6 - 1.5)*irc3 );
    } else if (lj->d == 2) {
      lj->epot_tail = (real) (M_PI*rho*lj->n*(.4*irc6 - 1)*irc3*irc);
      lj->p_tail = (real) (M_PI*rho*rho*(1.6*irc6 - 2)*irc3*irc);
    }
  }
}


INLINE real lj_pbc(real x, real l)
  { return (real)((x - ((int)((x)+1000.5) - 1000))*l) ; }


INLINE real *lj_vpbc2d(real *v, real l)
  { v[0] = lj_pbc(v[0], l); v[1] = lj_pbc(v[1], l); return v; }


INLINE real *lj_vpbc3d(real *v, real l)
  { v[0] = lj_pbc(v[0], l); v[1] = lj_pbc(v[1], l); v[2] = lj_pbc(v[2], l); return v; }


INLINE real lj_pbcdist2_2d(real *dx, const real *a, const real *b, real l)
  { return rv2_sqr(lj_vpbc2d(rv2_diff(dx, a, b), l)); }

INLINE real lj_pbcdist2_3d(real *dx, const real *a, const real *b, real l)
  { return rv3_sqr(lj_vpbc3d(rv3_diff(dx, a, b), l)); }


/* create an open structure */
INLINE lj_t *lj_open(int n, int d, real rho, real rcdef)
{
  lj_t *lj;
  int i;

  xnew(lj, 1);
  lj->n = n;
  lj->d = d;
  lj->dof = n * d - d * (d+1)/2;
  xnew(lj->f, n * d);
  xnew(lj->v, n * d);
  xnew(lj->x, n * d);

  lj->rcdef = rcdef;
  lj_setrho(lj, rho);

  lj->esqinf = 1000000;

  if (lj->d == 3) lj_initfcc3d(lj); else lj_initfcc2d(lj);

  /* init. random velocities */
  for (i = 0; i < n * d; i++) lj->v[i] = (real) (rnd0() - .5);

  lj_shiftcom(lj, lj->v);
  lj_shiftang(lj, lj->x, lj->v);
  lj->ekin = md_ekin(lj->v, lj->n * lj->d, lj->dof, &lj->tkin);

  lj->isclone = 0;

  lj_force(lj);
  return lj;
}


/* copy flags */
#define LJ_CPX   0x0001
#define LJ_CPV   0x0002
#define LJ_CPF   0x0004
#define LJ_CPPR  0x0020
#define LJ_CPGDG 0x0040
#define LJ_CPXDG 0x0080
#define LJ_CPXVF (LJ_CPX|LJ_CPV|LJ_CPF)

#define lj_copyvec(lj, t, s) memcpy(t, s, lj->d * lj->n * sizeof(real))

/* copy from src to dest
 * cannot copy vectors other than xvf */
INLINE lj_t *lj_copy(lj_t *dest, const lj_t *src, unsigned flags)
{
  /* to preserve the pointers before the memcpy(dest, src) call */
  real *x = dest->x, *v = dest->v, *f = dest->f;

  memcpy(dest, src, sizeof(lj_t));

  if (flags & LJ_CPX) lj_copyvec(src, x, src->x);
  dest->x = x;
  if (flags & LJ_CPV) lj_copyvec(src, v, src->v);
  dest->v = v;
  if (flags & LJ_CPF) lj_copyvec(src, f, src->f);
  dest->f = f;
  return dest;
}


/* make new copy */
INLINE lj_t *lj_clone(const lj_t *src, unsigned flags)
{
  int nd = src->n * src->d;
  lj_t *dest;

  xnew(dest, 1);
  memcpy(dest, src, sizeof(lj_t));
  /* unless specified in flags,
   * arrays are copied literally as pointers */
  dest->isclone = LJ_CPPR | LJ_CPGDG | LJ_CPXDG;
  if (flags & LJ_CPX) {
    xnew(dest->x, nd);
    lj_copyvec(src, dest->x, src->x);
  } else {
    dest->isclone |= LJ_CPX;
  }
  if (flags & LJ_CPV) {
    xnew(dest->v, nd);
    lj_copyvec(src, dest->v, src->v);
  } else {
    dest->isclone |= LJ_CPV;
  }
  if (flags & LJ_CPF) {
    xnew(dest->f, nd);
    lj_copyvec(src, dest->f, src->v);
  } else {
    dest->isclone |= LJ_CPF;
  }
  return dest;
}


INLINE void lj_close(lj_t *lj)
{
  if ( !(lj->isclone & LJ_CPX) ) free(lj->x);
  if ( !(lj->isclone & LJ_CPV) ) free(lj->v);
  if ( !(lj->isclone & LJ_CPF) ) free(lj->f);
  if ( !(lj->isclone & LJ_CPPR)  && lj->pr)
    free(lj->pr);
  if ( !(lj->isclone & LJ_CPGDG) && lj->gdg)
    free(lj->gdg);
  if ( !(lj->isclone & LJ_CPXDG) && lj->xdg)
    free(lj->xdg);
  free(lj);
}


/* write position (and velocity)
 * Note 1: *actual* position, not unit position is used
 * Note 2: coordinates are *not* wrapped back into the box */
INLINE int lj_writepos(lj_t *lj, const real *x, const real *v, const char *fn)
{
  FILE *fp;

  if (fn == NULL) fn = "lj.pos";
  xfopen(fp, fn, "w", return -1);
  fprintf(fp, "# %d %d %d %.14e\n", lj->d, lj->n, (v != NULL), lj->l);
  md_writepos(fp, x, v, lj->n, lj->d, lj->l);
  fclose(fp);
  return 0;
}


#define LJ_LOADBOX 0x10

/* read position file (which may include velocity) */
INLINE int lj_readpos(lj_t *lj, real *x, real *v, const char *fn, unsigned flags)
{
  char s[1024];
  FILE *fp;
  int i, j, ret = -1, hasv = 0;
  double l0;

  if (fn == NULL) fn = "lj.pos";
  xfopen(fp, fn, "r", return -1);

  if (fgets(s, sizeof s, fp) == NULL || s[0] != '#') { /* simplified format, old version */
    fprintf(stderr, "Warning: %s has no information line\n", fn);
    rewind(fp);
  } else {
    if (4 != sscanf(s + 1, "%d%d%d%lf", &i, &j, &hasv, &l0)
        || i != lj->d || j != lj->n) {
      fprintf(stderr, "first line is corrupted:\n%s", s);
      goto ERR;
    }
    if (fabs(l0 - lj->l) > 1e-5*lj->l) { /* verify the box size */
      if (flags & LJ_LOADBOX) {
        lj->l = (real) l0;
        for (lj->vol = 1, j = 0; j < lj->d; j++) lj->vol *= lj->l;
        lj_setrho(lj, lj->n/lj->vol);
      } else {
        fprintf(stderr, "box mismatch l %g, should be %g\n", l0, lj->l);
        goto ERR;
      }
    }
  }

  ret = md_readpos(fp, x, hasv ? v : NULL, lj->n, lj->d, lj->l);
ERR:
  fclose(fp);
  return ret;
}


/* compute reference thermal dynamics variables using the equation of states
   return the average potential energy
   *P:  pressure
   *Ar: Helmholtz free energy (potential part)
   *Gr: Gibbs free energy (potential part)
   Reference:
   J. Karl Johnson et al. The Lennard-Jones equation of states revisited,
   Molecular Physics (1993) Vol. 78, No 3, 591-618 */
INLINE double lj_eos3d(double rho, double T, double *P, double *Ar, double *Gr)
{
  const double x1  =  0.8623085097507421;
  const double x2  =  2.976218765822098;
  const double x3  = -8.402230115796038;
  const double x4  =  0.1054136629203555;
  const double x5  = -0.8564583828174598;
  const double x6  =  1.582759470107601;
  const double x7  =  0.7639421948305453;
  const double x8  =  1.753173414312048;
  const double x9  =  2.798291772190376e+03;
  const double x10 = -4.8394220260857657e-02;
  const double x11 =  0.9963265197721935;
  const double x12 = -3.698000291272493e+01;
  const double x13 =  2.084012299434647e+01;
  const double x14 =  8.305402124717285e+01;
  const double x15 = -9.574799715203068e+02;
  const double x16 = -1.477746229234994e+02;
  const double x17 =  6.398607852471505e+01;
  const double x18 =  1.603993673294834e+01;
  const double x19 =  6.805916615864377e+01;
  const double x20 = -2.791293578795945e+03;
  const double x21 = -6.245128304568454;
  const double x22 = -8.116836104958410e+03;
  const double x23 =  1.488735559561229e+01;
  const double x24 = -1.059346754655084e+04;
  const double x25 = -1.131607632802822e+02;
  const double x26 = -8.867771540418822e+03;
  const double x27 = -3.986982844450543e+01;
  const double x28 = -4.689270299917261e+03;
  const double x29 =  2.593535277438717e+02;
  const double x30 = -2.694523589434903e+03;
  const double x31 = -7.218487631550215e+02;
  const double x32 =  1.721802063863269e+02;
  const double gamma = 3.0;
  double a[8], b[6], c[8], d[6], G[6], F, rhop, rho2 = rho*rho, Pa = 0., Pb = 0., U;
  int i;

  a[0] = x1*T + x2*sqrt(T) + x3 + x4/T + x5/(T*T);
  a[1] = x6*T + x7 + x8/T + x9/(T*T);
  a[2] = x10*T + x11 + x12/T;
  a[3] = x13;
  a[4] = x14/T + x15/(T*T);
  a[5] = x16/T;
  a[6] = x17/T + x18/(T*T);
  a[7] = x19/(T*T);
  b[0] = (x20 + x21/T)/(T*T);
  b[1] = (x22 + x23/(T*T))/(T*T);
  b[2] = (x24 + x25/T)/(T*T);
  b[3] = (x26 + x27/(T*T))/(T*T);
  b[4] = (x28 + x29/T)/(T*T);
  b[5] = (x30 + x31/T + x32/(T*T))/(T*T);
  c[0] = x2*sqrt(T)/2 + x3 + 2*x4/T + 3*x5/(T*T);
  c[1] = x7 + 2*x8/T + 3*x9/(T*T);
  c[2] = x11 + 2*x12/T;
  c[3] = x13;
  c[4] = 2*x14/T + 3*x15/(T*T);
  c[5] = 2*x16/T;
  c[6] = 2*x17/T + 3*x18/(T*T);
  c[7] = 3*x19/(T*T);
  d[0] = (3*x20 + 4*x21/T)/(T*T);
  d[1] = (3*x22 + 5*x23/(T*T))/(T*T);
  d[2] = (3*x24 + 4*x25/T)/(T*T);
  d[3] = (3*x26 + 5*x27/(T*T))/(T*T);
  d[4] = (3*x28 + 4*x29/T)/(T*T);
  d[5] = (3*x30 + 4*x31/T + 5*x32/(T*T))/(T*T);

  F = exp(-gamma*rho*rho);
  G[0] = (1 - F)/(2*gamma);
  for (rhop = 1, i = 1; i < 6; i++) {
    rhop *= rho*rho;
    G[i] = -(F*rhop - 2*i*G[i-1])/(2*gamma);
  }

  if (Ar) *Ar = 0.;
  if (P)  Pa = Pb = 0;
  for (U = 0, i = 7; i >= 0; i--) {
    U = rho * (c[i]/(i+1) + U);
    if (Ar) *Ar = rho * (a[i]/(i+1) + (*Ar));
    if (P)  Pa  = rho * (a[i] + Pa);
  }

  for (i = 5; i >= 0; i--) {
    U += d[i]*G[i];
    if (Ar) *Ar += b[i]*G[i];
    if (P) Pb = rho2*(b[i] + Pb);
  }
  if (P) *P = rho*(T + Pa + F*Pb);
  if (Gr) *Gr = *Ar + *P/rho - T;
  return U;
}


/* initialize square well potential */
INLINE void lj_initsq(lj_t *lj, real ra, real rb)
{
  lj->ra2 = (lj->ra = ra) * ra;
  lj->rb2 = (lj->rb = rb) * rb;
  lj->usesq = 1;
  lj_energy(lj);
}


/* initialize coefficients for the switched potential */
INLINE void lj_initsw(lj_t *lj, real rs)
{
  real rs2, rs3, rs6, rs15, dr, dr2, dr3, dr4, f1, f2, f26, f13;

  lj->rs = rs;
  dr = lj->rs - lj->rc;
  die_if (dr > 0, "rs %g, rc %g\n", lj->rs, lj->rc);

  rs2 = rs*rs;
  rs3 = rs2*rs;
  rs6 = rs3*rs3;
  rs15 = rs6*rs6*rs3;
  dr2 = dr*dr;
  dr3 = dr2*dr;
  dr4 = dr3*dr;
  f1 = rs6 - 1.f;
  f2 = rs6 - 2.f;
  f13 = 2.f*rs6 - 13.f;
  f26 = 7.f*rs6 - 26.f;

  f1 *= rs3;
  f2 *= dr*rs2;
  f13 *= dr3;
  f26 *= dr2*rs;
  lj->a4 = -4.f*(35.f*f1 + 90.f*f2 + 28.f*f13 + 15.f*f26)/(dr4*rs15);
  lj->a5 = 24.f*(14.f*f1 + 39.f*f2 + 14.f*f13 + 7.f*f26)/(dr2*dr3*rs15);
  lj->a6 = -4.f*(70.f*f1 + 204.f*f2 + 84.f*f13 + 39.f*f26)/(dr3*dr3*rs15);
  lj->a7 = 16.f*(5.f*f1 + 15.f*f2 + 7.f*f13 + 3.f*f26)/(dr3*dr4*rs15);

  xnew(lj->pr, lj->n * lj->n);
  xnew(lj->gdg, lj->n * lj->d);
  lj->npr = 0;
  lj->usesw = 1;
}


/* compute the switch potential phi(r) and its derivatives
 * fscal = -phi = uij'/rij
 * psi = phi'/rij
 * xi = psi'/rij
 * Laplacian = psi*rij^2 + 3*phi = psi*rij^2 - 3*fscal */
INLINE real lj_potsw(lj_t *lj, real r, real *fscal, real *psi, real *xi)
{
  if (r < lj->rs) { /* normal lj */
    real invr2, invr6, invr8;
    invr2 = 1 / (r*r);
    invr6 = invr2 * invr2 * invr2;
    invr8 = invr6 * invr2;
    *fscal = (48 * invr6 - 24) * invr8;
    *psi = (672 * invr6 - 192) * invr8 * invr2;
    *xi = -(10752 * invr6 - 1920) * invr6 * invr6;
    return 4 * invr6 * (invr6 - 1);
  } else if (r < lj->rc) { /* polynomial */
    real dr, dr2, dr3, fs, ps, xs, invr, invr2;
    real a4 = lj->a4, a5 = lj->a5, a6 = lj->a6, a7 = lj->a7;
    invr = 1/r;
    dr = r - lj->rc;
    invr2 = invr * invr;
    dr2 = dr * dr;
    dr3 = dr2 * dr;
    fs = dr3*(4*a4 + dr*(5*a5 + dr*(6*a6 + dr*7*a7)))*invr;
    *fscal = -fs;
    ps = dr2*(12*a4 + dr*(20*a5 + dr*(30*a6 + dr*42*a7)));
    *psi = (ps - fs)*invr2;
    xs = dr*(24*a4 + dr*(60*a5 + dr*(120*a6 + dr*210*a7)));
    *xi = (xs*invr - 3*(*psi))*invr2;
    return (dr2*dr2)*(a4 + dr*(a5 + dr*(a6 + dr*a7)));
  } else { /* out of range */
    *fscal = 0;
    *psi = 0;
    *xi = 0;
    return 0;
  }
}


/* 2D energy for square, lj members are not altered */
INLINE int lj_energysq2d(lj_t *lj, rv2_t *x, real *rmin)
{
  real dx[2], dr2, ra2 = lj->ra2, rb2 = lj->rb2, l = lj->l, rm2 = 1e30;
  int i, j, iu = 0, n = lj->n, col = 0;

  for (i = 0; i < n - 1; i++) {
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_2d(dx, x[i], x[j], l);
      if (dr2 < ra2) {
        iu += lj->esqinf;
        col++;
      } else if (dr2 < rb2) {
        iu--;
      }
      if (dr2 < rm2) rm2 = dr2;
    }
  }
  if (rmin != NULL) *rmin = (real) sqrt(rm2);
  /* force the energy to zero in the hard sphere case */
  if (fabs(ra2 - rb2) < 1e-6 && col == 0) {
    iu = 0;
  }
  return iu;
}

/* 3D energy for square, lj members are not altered */
INLINE int lj_energysq3d(lj_t *lj, rv3_t *x, real *rmin)
{
  real dx[3], dr2, ra2 = lj->ra2, rb2 = lj->rb2, l = lj->l, rm2 = 1e30;
  int i, j, iu = 0, n = lj->n, col = 0;

  for (i = 0; i < n - 1; i++) {
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_3d(dx, x[i], x[j], l);
      if (dr2 < ra2) {
        iu += lj->esqinf;
        col++;
      } else if (dr2 < rb2) {
        iu--;
      }
      if (dr2 < rm2) rm2 = dr2;
    }
  }
  if (rmin != NULL) *rmin = (real) sqrt(rm2);
  /* force the energy to zero in the hard sphere case */
  if (fabs(ra2 - rb2) < 1e-6 && col == 0) {
    iu = 0;
  }
  return iu;
}


/* compute 2D energy for switched potential */
INLINE real lj_energysw2d(lj_t *lj, rv2_t *x, real *virial, real *laplace)
{
  int i, j, n = lj->n;
  real dx[2], dr2, dr, l = lj->l, d = (real) lj->d;
  real fscal, psi, xi, ep, vir, lap;

  ep = lap = vir = 0.f;
  for (i = 0; i < n; i++) {
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_2d(dx, x[i], x[j], l);
      if (dr2 > lj->rc2) continue;

      dr = (real) sqrt(dr2);
      ep += lj_potsw(lj, dr, &fscal, &psi, &xi);
      lap += 2.f*(psi*dr2 - d*fscal);
      vir += fscal*dr2;
      rv2_smul(dx, fscal);
    }
  }
  if (virial) *virial = vir;
  if (laplace) *laplace = lap;
  return ep;
}

/* compute 3D energy for switched potential */
INLINE real lj_energysw3d(lj_t *lj, rv3_t *x, real *virial, real *laplace)
{
  int i, j, n = lj->n;
  real dx[3], dr2, dr, l = lj->l, d = (real) lj->d;
  real fscal, psi, xi, ep, vir, lap;

  ep = lap = vir = 0.f;
  for (i = 0; i < n; i++) {
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_3d(dx, x[i], x[j], l);
      if (dr2 > lj->rc2) continue;

      dr = (real) sqrt(dr2);
      ep += lj_potsw(lj, dr, &fscal, &psi, &xi);
      lap += 2.f*(psi*dr2 - d*fscal);
      vir += fscal*dr2;
      rv3_smul(dx, fscal);
    }
  }
  if (virial) *virial = vir;
  if (laplace) *laplace = lap;
  return ep;
}


/* 2D compute force and virial, return energy */
INLINE real lj_energylj2d(lj_t *lj, rv2_t *x, real *virial, real *ep0, real *eps)
{
  real dx[2], dr2, dr6, ep, vir, l = lj->l, rc2 = lj->rc2;
  int i, j, prcnt = 0, n = lj->n;

  if (virial) *virial = 0.f;
  for (ep = vir = 0, i = 0; i < n - 1; i++) {
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_2d(dx, x[i], x[j], l);
      if (dr2 > rc2) continue;
      dr2 = 1.f / dr2;
      dr6 = dr2 * dr2 * dr2;
      vir += dr6 * (48.f * dr6 - 24.f); /* f.r */
      ep += 4 * dr6 * (dr6 - 1);
      prcnt++;
    }
  }
  if (ep0) *ep0 = ep;
  if (eps) *eps = ep - prcnt * lj->epot_shift; /* shifted energy */
  if (virial) *virial = vir;
  return ep + lj->epot_tail; /* unshifted energy */
}

/* 3D compute force and virial, return energy */
INLINE real lj_energylj3d(lj_t *lj, rv3_t *x, real *virial, real *ep0, real *eps)
{
  real dx[3], dr2, dr6, ep, vir, l = lj->l, rc2 = lj->rc2;
  int i, j, prcnt = 0, n = lj->n;

  if (virial) *virial = 0.f;
  for (ep = vir = 0, i = 0; i < n - 1; i++) {
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_3d(dx, x[i], x[j], l);
      if (dr2 > rc2) continue;
      dr2 = 1.f / dr2;
      dr6 = dr2 * dr2 * dr2;
      vir += dr6 * (48.f * dr6 - 24.f); /* f.r */
      ep += 4 * dr6 * (dr6 - 1);
      prcnt++;
    }
  }
  if (ep0) *ep0 = ep;
  if (eps) *eps = ep - prcnt * lj->epot_shift; /* shifted energy */
  if (virial) *virial = vir;
  return ep + lj->epot_tail; /* unshifted energy */
}


/* energy evaluation, do not change members of `lj' */
INLINE real lj_energyx2d(lj_t *lj, rv2_t *x, real *vir, int *iep, real *rmin,
    real *ep0, real *eps, real *lap)
{
  real u;
  if (lj->usesq) {
    *iep = lj_energysq2d(lj, x, rmin);
    u = (real) (*iep);
  } else if (lj->usesw) {
    u = lj_energysw2d(lj, x, vir, lap);
  } else {
    u = lj_energylj2d(lj, x, vir, ep0, eps);
  }
  if (lj->usesq || lj->usesw) {
    if (ep0) *ep0 = u;
    if (eps) *eps = u;
  }
  return u;
}

/* energy evaluation, do not change members of `lj' */
INLINE real lj_energyx3d(lj_t *lj, rv3_t *x, real *vir, int *iep, real *rmin,
    real *ep0, real *eps, real *lap)
{
  real u;
  if (lj->usesq) {
    *iep = lj_energysq3d(lj, x, rmin);
    u = (real) (*iep);
  } else if (lj->usesw) {
    u = lj_energysw3d(lj, x, vir, lap);
  } else {
    u = lj_energylj3d(lj, x, vir, ep0, eps);
  }
  if (lj->usesq || lj->usesw) {
    if (ep0) *ep0 = u;
    if (eps) *eps = u;
  }
  return u;
}


/* energy evaluation, do not change members of `lj' */
INLINE real lj_energyx(lj_t *lj, real *x, real *vir, int *iep, real *rmin,
    real *ep0, real *eps, real *lap)
{
  return lj->d == 2 ?
      lj_energyx2d(lj, (rv2_t *) x, vir, iep, rmin, ep0, eps, lap) :
      lj_energyx3d(lj, (rv3_t *) x, vir, iep, rmin, ep0, eps, lap);
}


/* compute the energy of the current configuration and set lj->epot */
INLINE real lj_energy2d(lj_t *lj)
{
  return lj->epot = lj_energyx2d(lj, (rv2_t *) lj->x, &lj->vir, &lj->iepot,
      &lj->rmin, &lj->epot0, &lj->epots, &lj->lap);
}

/* compute the energy of the current configuration and set lj->epot */
INLINE real lj_energy3d(lj_t *lj)
{
  return lj->epot = lj_energyx3d(lj, (rv3_t *) lj->x, &lj->vir, &lj->iepot,
      &lj->rmin, &lj->epot0, &lj->epots, &lj->lap);
}


INLINE real lj_energy(lj_t *lj)
{
  return (lj->d == 2) ? lj_energy2d(lj) : lj_energy3d(lj);
}


/* compute 2D switched force, save derivative information to lj->pr */
INLINE real lj_forcesw2d(lj_t *lj, rv2_t *x, rv2_t *f, ljpair_t *pr,
    int *ljnpr, real *virial, real *f2, real *laplace)
{
  int i, j, n = lj->n, npr;
  real dx[2], dr2, dr, l = lj->l, d = (real) lj->d;
  real fscal, psi, xi, ep, vir, lap;

  npr = 0;
  ep = lap = vir = 0.f;
  for (i = 0; i < n; i++) rv2_zero(f[i]);
  for (i = 0; i < n; i++) {
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_2d(dx, x[i], x[j], l);
      if (dr2 > lj->rc2) {
        /* save out-of-range pairs, so we can later
         * locate the pair from indices i and j using getpairindex() */
        if (lj->usesw & LJ_SWALLPAIRS) {
          rv2_copy(pr->dx, dx);
          pr->i = i;
          pr->j = j;
          pr->phi = pr->psi = pr->xi = 0.f;
          pr->dr2 = dr2;
          pr->in = 0;
          pr++; npr++;
        }
        continue;
      }

      rv2_copy(pr->dx, dx);
      pr->dr2 = dr2;
      dr = (real) sqrt(dr2);
      ep += lj_potsw(lj, dr, &fscal, &psi, &xi);
      pr->phi = -fscal; /* phi = u'/r */
      pr->psi = psi;  /* psi = phi'/r */
      pr->xi = xi;  /* xi = psi'/r */
      lap += 2.f*(psi*dr2 - d*fscal);
      vir += fscal*dr2; /* f.r */
      rv2_smul(dx, fscal);
      pr->i = i;
      pr->j = j;
      rv2_inc(f[i], dx);
      rv2_dec(f[j], dx);
      pr->in = 1;
      pr++; npr++;
    }
  }
  if (ljnpr) *ljnpr = npr;
  if (virial) *virial = vir;
  if (laplace) *laplace = lap;
  if (f2) for (*f2 = 0.0, i = 0; i < n; i++) *f2 += rv2_sqr(f[i]);
  return ep;
}

/* compute 3D switched force, save derivative information to lj->pr */
INLINE real lj_forcesw3d(lj_t *lj, rv3_t *x, rv3_t *f, ljpair_t *pr,
    int *ljnpr, real *virial, real *f2, real *laplace)
{
  int i, j, n = lj->n, npr;
  real dx[3], dr2, dr, l = lj->l, d = (real) lj->d;
  real fscal, psi, xi, ep, vir, lap;

  npr = 0;
  ep = lap = vir = 0.f;
  for (i = 0; i < n; i++) rv3_zero(f[i]);
  for (i = 0; i < n; i++) {
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_3d(dx, x[i], x[j], l);
      if (dr2 > lj->rc2) {
        /* save out-of-range pairs, so we can later
         * locate the pair from indices i and j using getpairindex() */
        if (lj->usesw & LJ_SWALLPAIRS) {
          rv3_copy(pr->dx, dx);
          pr->i = i;
          pr->j = j;
          pr->phi = pr->psi = pr->xi = 0.f;
          pr->dr2 = dr2;
          pr->in = 0;
          pr++; npr++;
        }
        continue;
      }

      rv3_copy(pr->dx, dx);
      pr->dr2 = dr2;
      dr = (real) sqrt(dr2);
      ep += lj_potsw(lj, dr, &fscal, &psi, &xi);
      pr->phi = -fscal; /* phi = u'/r */
      pr->psi = psi;  /* psi = phi'/r */
      pr->xi = xi;  /* xi = psi'/r */
      lap += 2.f*(psi*dr2 - d*fscal);
      vir += fscal*dr2; /* f.r */
      rv3_smul(dx, fscal);
      pr->i = i;
      pr->j = j;
      rv3_inc(f[i], dx);
      rv3_dec(f[j], dx);
      pr->in = 1;
      pr++; npr++;
    }
  }
  if (ljnpr) *ljnpr = npr;
  if (virial) *virial = vir;
  if (laplace) *laplace = lap;
  if (f2) for (*f2 = 0.0, i = 0; i < n; i++) *f2 += rv3_sqr(f[i]);
  return ep;
}


/* 2D compute force and virial, return energy */
INLINE real lj_forcelj2d(lj_t *lj, rv2_t *x, rv2_t *f, real *virial,
    real *ep0, real *eps, real *f2, real *laplace)
{
  real dx[2], fi[2], dr2, dr6, fs, tmp, ep, vir, lap, l = lj->l, rc2 = lj->rc2;
  int i, j, d, prcnt = 0, n = lj->n;

  for (i = 0; i < n; i++) rv2_zero(f[i]);
  for (ep = vir = lap = 0, i = 0; i < n - 1; i++) {
    rv2_zero(fi);
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_2d(dx, x[i], x[j], l);
      if (dr2 > rc2) continue;
      dr2 = 1.f / dr2;
      dr6 = dr2 * dr2 * dr2;
      fs = dr6 * (48.f * dr6 - 24.f); /* f.r */
      vir += fs; /* f.r */
      if (laplace) /* 2.f for it applies to both particles */
        lap += 2.f * ((168 - 12*2) * dr6 - (48 - 6*2)) * dr6 * dr2;

      fs *= dr2; /* f.r / r^2 */
      for (d = 0; d < 2; d++) {
        tmp = dx[d] * fs;
        fi[d] += tmp;
        f[j][d] -= tmp;
      }

      ep += 4 * dr6 * (dr6 - 1);
      prcnt++;
    }
    rv2_inc(f[i], fi);
  }
  if (ep0) *ep0 = ep;
  if (eps) *eps = ep - prcnt * lj->epot_shift; /* shifted energy */
  if (virial) *virial = vir;
  if (laplace) *laplace = 4*lap;
  if (f2) for (*f2 = 0.f, i = 0; i < n; i++) *f2 += rv2_sqr(f[i]);
  return ep + lj->epot_tail; /* unshifted energy */
}

/* 3D compute force and virial, return energy */
INLINE real lj_forcelj3d(lj_t *lj, rv3_t *x, rv3_t *f, real *virial,
    real *ep0, real *eps, real *f2, real *laplace)
{
  real dx[3], fi[3], dr2, dr6, fs, tmp, ep, vir, lap, l = lj->l, rc2 = lj->rc2;
  int i, j, d, prcnt = 0, n = lj->n;

  for (i = 0; i < n; i++) rv3_zero(f[i]);
  for (ep = vir = lap = 0, i = 0; i < n - 1; i++) {
    rv3_zero(fi);
    for (j = i + 1; j < n; j++) {
      dr2 = lj_pbcdist2_3d(dx, x[i], x[j], l);
      if (dr2 > rc2) continue;
      dr2 = 1.f / dr2;
      dr6 = dr2 * dr2 * dr2;
      fs = dr6 * (48.f * dr6 - 24.f); /* f.r */
      vir += fs; /* f.r */
      if (laplace) /* 2.f for it applies to both particles */
        lap += 2.f * ((168 - 12*3) * dr6 - (48 - 6*3)) * dr6 * dr2;

      fs *= dr2; /* f.r / r^2 */
      for (d = 0; d < 3; d++) {
        tmp = dx[d] * fs;
        fi[d] += tmp;
        f[j][d] -= tmp;
      }

      ep += 4 * dr6 * (dr6 - 1);
      prcnt++;
    }
    rv3_inc(f[i], fi);
  }
  if (ep0) *ep0 = ep;
  if (eps) *eps = ep - prcnt * lj->epot_shift; /* shifted energy */
  if (virial) *virial = vir;
  if (laplace) *laplace = 4*lap;
  if (f2) for (*f2 = 0.f, i = 0; i < n; i++) *f2 += rv3_sqr(f[i]);
  return ep + lj->epot_tail; /* unshifted energy */
}


INLINE real lj_force2d(lj_t *lj)
{
  if (lj->usesq) return lj->epot = lj->epot0 = lj->epots = (real) lj_energysq2d(lj,
    (rv2_t *) lj->x, &lj->rmin); /* no force for square well */
  if (lj->usesw) return lj->epot = lj->epot0 = lj->epots = lj_forcesw2d(lj,
    (rv2_t *) lj->x, (rv2_t *) lj->f,
    lj->pr, &lj->npr, &lj->vir, &lj->f2, &lj->lap);
  return lj->epot = lj_forcelj2d(lj, (rv2_t *) lj->x, (rv2_t *) lj->f,
    &lj->vir, &lj->epot0, &lj->epots, &lj->f2, &lj->lap);
}

INLINE real lj_force3d(lj_t *lj)
{
  if (lj->usesq) return lj->epot = lj->epot0 = lj->epots = (real) lj_energysq3d(lj,
    (rv3_t *) lj->x, &lj->rmin); /* no force for square well */
  if (lj->usesw) return lj->epot = lj->epot0 = lj->epots = lj_forcesw3d(lj,
    (rv3_t *) lj->x, (rv3_t *) lj->f,
    lj->pr, &lj->npr, &lj->vir, &lj->f2, &lj->lap);
  return lj->epot = lj_forcelj3d(lj, (rv3_t *) lj->x, (rv3_t *) lj->f,
    &lj->vir, &lj->epot0, &lj->epots, &lj->f2, &lj->lap);
}


INLINE real lj_force(lj_t *lj)
{
  return (lj->d == 2) ? lj_force2d(lj) : lj_force3d(lj);
}


/* calculate the configurational temperature (bc) for switched potential
 * bc = div(v), where v = g/(g.g), g = grad U,
 * udb = v . grad bc
 * bvir = x . grad bc */
INLINE real lj_bconfsw3d(lj_t *lj, real *udb)
{
  int i, j, ipr, npr = lj->npr, n = lj->n;
  ljpair_t *pr;
  real dg[3], dh[3];
  real phi, psi, xi, d = (real) lj->d;
  real dgdx, dg2, dr2, m = 0.f, h2;
  real gdlap = 0.f, gdm = 0.f, bc, invg2, invg4;
  real dlap, dgdx2;
  rv3_t *h = (rv3_t *) lj->gdg, *f = (rv3_t *) lj->f;

  if (udb) for (i = 0; i < n; i++) rv3_zero(h[i]);

  for (ipr = 0; ipr < npr; ipr++) {
    pr = lj->pr + ipr;
    i = pr->i;
    j = pr->j;
    phi = pr->phi;
    psi = pr->psi;

    dg2 = rv3_sqr( rv3_diff(dg, f[j], f[i]) );
    dgdx = rv3_dot(dg, pr->dx);
    m += psi*(dgdx2 = dgdx*dgdx) + phi*dg2; /* M = g g : grad grad U */
    if (udb) {
      dr2 = pr->dr2;
      xi = pr->xi;
      dlap = xi*dr2 + (2.f + d)*psi;
      gdlap += dlap * dgdx; /* 0.5 g . grad laplace U */
      gdm += (xi*dgdx2 + 3.f*psi*dg2)*dgdx; /* g g g : grad grad grad U, first larger */
      rv3_lincomb2(dh, pr->dx, dg, psi * dgdx, phi);
      rv3_sinc(h[i], dh,  2.f);
      rv3_sinc(h[j], dh, -2.f);
    }
  }
  m *= 2.f;
  gdlap *= 2.f;
  gdm *= 2.f;
  invg2 = 1.f/lj->f2;
  invg4 = invg2 * invg2;
  bc = (lj->lap - m*invg2)*invg2; /* configuration temperature */

  if (udb) {
    for (h2 = 0.f, i = 0; i < n; i++) h2 += rv3_sqr(h[i]);
    /* (1/g) \partial^2 g/ \partial E^2 = <bc*bc + udb>
       \partial bc/ \partial E = <d(bc)^2 + udb> */
    *udb = invg4*(gdlap - (lj->lap*m + h2 + gdm)*invg2 + 2.f*m*m*invg4);
  }

  return bc;
}


/* return r r : grad grad U, must be called after force */
INLINE real lj_vir2sw3d(lj_t *lj)
{
  int ipr, npr = lj->npr;
  real vir2 = 0.f;

  for (ipr = 0; ipr < npr; ipr++) {
    ljpair_t *pr = lj->pr + ipr;
    vir2 += (pr->psi * pr->dr2 + pr->phi) * pr->dr2;
  }
  return vir2;
}


/* compute pressure */
INLINE real lj_calcp(lj_t *lj, real tp)
{ return (lj->dof * tp + lj->vir) / (lj->d * lj->vol) + lj->p_tail; }


/* compute pressure, ideal gas part from the kinetic energy  */
INLINE real lj_calcpk(lj_t *lj)
{ return (2.f * lj->ekin + lj->vir) / (lj->d * lj->vol) + lj->p_tail; }


/* Lennard-Jones system: Monte Carlo routines */


/* randomly displace particle i with random amplitude */
INLINE int lj_randmv2d(lj_t *lj, real *xi, real amp)
{
  int i, d;

  i = (int)(rnd0() * lj->n);
  amp /= lj->l;
  rv2_copy(xi, lj->x + i*2);
  for (d = 0; d < 2; d++) /* displacement */
    xi[d] += (real)(amp * (2.*rnd0() - 1.));
  return i;
}

/* randomly displace particle i with random amplitude */
INLINE int lj_randmv3d(lj_t *lj, real *xi, real amp)
{
  int i, d;

  i = (int)(rnd0() * lj->n);
  amp /= lj->l;
  rv3_copy(xi, lj->x + i*3);
  for (d = 0; d < 3; d++) /* displacement */
    xi[d] += (real)(amp * (2.*rnd0() - 1.));
  return i;
}


/* compute energy data for a 2D pair with a square well potential */
INLINE int lj_pairsq2d(const real *xi, const real *xj, real l,
    real ra2, real rb2, real *pdr2, int inf)
{
  real dx[2], dr2;
  dr2 = lj_pbcdist2_2d(dx, xi, xj, l);
  if (pdr2) *pdr2 = dr2;
  if (dr2 < ra2) return -inf;
  else if (dr2 < rb2) return 1;
  else return 0;
}

/* compute energy data for a 3D pair with a square well potential */
INLINE int lj_pairsq3d(const real *xi, const real *xj, real l,
    real ra2, real rb2, real *pdr2, int inf)
{
  real dx[3], dr2;
  dr2 = lj_pbcdist2_3d(dx, xi, xj, l);
  if (pdr2) *pdr2 = dr2;
  if (dr2 < ra2) return -inf;
  else if (dr2 < rb2) return 1;
  else return 0;
}


/* return the energy change (square well) from displacing x[i] to xi */
INLINE int lj_depotsq2d(lj_t *lj, int i, const real *xi, real *rm)
{
  int j, n = lj->n, npr = 0, inf = lj->esqinf, recalc = 0;
  real l = lj->l, ra2 = lj->ra2, rb2 = lj->rb2;
  real r2o, r2n, rm2o = 0, rm2 = 0;
  rv2_t *x = (rv2_t *) lj->x;
  const real tol = 1e-5;

  if (rm) rm2o = rm2 = (*rm) * (*rm);
  for (j = 0; j < n; j++) { /* pair */
    if (j == i) continue;
    npr -= lj_pairsq2d(x[i], x[j], l, ra2, rb2, &r2o, inf);
    npr += lj_pairsq2d(xi,   x[j], l, ra2, rb2, &r2n, inf);
    if (fabs(r2o - rm2o) < tol) { /* need to re-compute rmin */
      recalc |= 1;
    }
    if (r2n < rm2) { /* new rmin is found */
      recalc |= 2; /* no need to recalc */
      rm2 = r2n;
    }
  }

  /* in order to compute the minimal distance,
   * we need to occasionally recompute the entire system */
  if (recalc == 1) { /* 0, 2, 3 are safe */
    rv2_t xio;
    rv2_copy(xio, x[i]);
    rv2_copy(x[i], xi); /* apply xi */
    lj_energysq2d(lj, x, rm);
    rv2_copy(x[i], xio); /* recover */
  } else {
    if (rm) *rm = (real) sqrt(rm2);
  }

  /* hard sphere, no collision */
  if (fabs(ra2 - rb2) < 2e-6 && npr > -inf/10 && npr < inf/10) {
    npr = 0; /* number of neighbors */
  }
  return -npr; /* increased number of pairs == decreased energy */
}

/* return the energy change (square well) from displacing x[i] to xi */
INLINE int lj_depotsq3d(lj_t *lj, int i, const real *xi, real *rm)
{
  int j, n = lj->n, npr = 0, inf = lj->esqinf, recalc = 0;
  real l = lj->l, ra2 = lj->ra2, rb2 = lj->rb2;
  real r2o, r2n, rm2o = 0, rm2 = 0;
  rv3_t *x = (rv3_t *) lj->x;
  const real tol = 1e-5;

  if (rm) rm2o = rm2 = (*rm) * (*rm);
  for (j = 0; j < n; j++) { /* pair */
    if (j == i) continue;
    npr -= lj_pairsq3d(x[i], x[j], l, ra2, rb2, &r2o, inf);
    npr += lj_pairsq3d(xi,   x[j], l, ra2, rb2, &r2n, inf);
    if (fabs(r2o - rm2o) < tol) { /* need to re-compute rmin */
      recalc |= 1;
    }
    if (r2n < rm2) { /* new rmin is found */
      recalc |= 2; /* no need to recalc */
      rm2 = r2n;
    }
  }

  /* in order to compute the minimal distance,
   * we need to occasionally recompute the entire system */
  if (recalc == 1) { /* 0, 2, 3 are safe */
    rv3_t xio;
    rv3_copy(xio, x[i]);
    rv3_copy(x[i], xi); /* apply xi */
    lj_energysq3d(lj, x, rm);
    rv3_copy(x[i], xio); /* recover */
  } else {
    if (rm) *rm = (real) sqrt(rm2);
  }

  /* hard sphere, no collision */
  if (fabs(ra2 - rb2) < 2e-6 && npr > -inf/10 && npr < inf/10) {
    npr = 0; /* number of neighbors */
  }
  return -npr; /* increased number of pairs == decreased energy */
}


/* commit a particle displacement for a square well potential */
INLINE void lj_commitsq2d(lj_t *lj, int i, const real *xi, int du)
{
  rv2_copy(lj->x + i*2, xi);
  lj->iepot += du;
  lj->epot += du;
}

/* commit a particle displacement for a square well potential */
INLINE void lj_commitsq3d(lj_t *lj, int i, const real *xi, int du)
{
  rv3_copy(lj->x + i*3, xi);
  lj->iepot += du;
  lj->epot += du;
}


/* Metropolis for a square well */
INLINE int lj_metrosq2d(lj_t *lj, real amp, real bet)
{
  int i, du;
  real xi[2], rm;

  i = lj_randmv2d(lj, xi, amp);
  rm = lj->rmin;
  du = lj_depotsq2d(lj, i, xi, &rm);
  /* patch for bet = 0 */
  if (bet >= 0 && du > lj->esqinf/2) return 0;
  if (metroacc1(du, bet)) {
    lj_commitsq2d(lj, i, xi, du);
    lj->rmin = rm;
    return 1;
  }
  return 0;
}

/* Metropolis for a square well */
INLINE int lj_metrosq3d(lj_t *lj, real amp, real bet)
{
  int i, du;
  real xi[3], rm;

  i = lj_randmv3d(lj, xi, amp);
  rm = lj->rmin;
  du = lj_depotsq3d(lj, i, xi, &rm);
  /* patch for bet = 0 */
  if (bet >= 0 && du > lj->esqinf/2) return 0;
  if (metroacc1(du, bet)) {
    lj_commitsq3d(lj, i, xi, du);
    lj->rmin = rm;
    return 1;
  }
  return 0;
}


/* compute energy data for a particle pair, with switched potential  */
INLINE int lj_pairsw2d(lj_t *lj, real *xi, real *xj, real *u, real *vir)
{
  real dx[2], dr2, dr, fscal, psi, ksi;
  dr2 = lj_pbcdist2_2d(dx, xi, xj, lj->l);
  if (dr2 < lj->rc2) {
    dr = (real) sqrt(dr2);
    *u = lj_potsw(lj, dr, &fscal, &psi, &ksi);
    *vir = fscal * dr2; /* f.r */
    return 1;
  } else return 0;
}

/* compute energy data for a particle pair, with switched potential  */
INLINE int lj_pairsw3d(lj_t *lj, real *xi, real *xj, real *u, real *vir)
{
  real dx[3], dr2, dr, fscal, psi, ksi;
  dr2 = lj_pbcdist2_3d(dx, xi, xj, lj->l);
  if (dr2 < lj->rc2) {
    dr = (real) sqrt(dr2);
    *u = lj_potsw(lj, dr, &fscal, &psi, &ksi);
    *vir = fscal * dr2; /* f.r */
    return 1;
  } else return 0;
}


/* return the energy change from displacing x[i] to xi */
INLINE real lj_depotsw2d(lj_t *lj, int i, real *xi, real *vir)
{
  int j, n = lj->n;
  real u = 0.f, du = 0.f, dvir = 0.f;
  rv2_t *x = (rv2_t *) lj->x;

  *vir = 0.0f;
  for (j = 0; j < n; j++) { /* pair */
    if (j == i) continue;
    if (lj_pairsw2d(lj, x[i], x[j], &du, &dvir)) {
      u -= du;
      *vir -= dvir;
    }
    if (lj_pairsw2d(lj, xi, x[j], &du, &dvir)) {
      u += du;
      *vir += dvir;
    }
  }
  return u;
}

/* return the energy change from displacing x[i] to xi */
INLINE real lj_depotsw3d(lj_t *lj, int i, real *xi, real *vir)
{
  int j, n = lj->n;
  real u = 0.f, du = 0.f, dvir = 0.f;
  rv3_t *x = (rv3_t *) lj->x;

  *vir = 0.0f;
  for (j = 0; j < n; j++) { /* pair */
    if (j == i) continue;
    if (lj_pairsw3d(lj, x[i], x[j], &du, &dvir)) {
      u -= du;
      *vir -= dvir;
    }
    if (lj_pairsw3d(lj, xi, x[j], &du, &dvir)) {
      u += du;
      *vir += dvir;
    }
  }
  return u;
}


/* commit a particle displacement
 * like energysw2d, it does not set pair data, lj->pr
 * call lj_forcesw2d() if it is needed */
INLINE void lj_commitsw2d(lj_t *lj, int i, const real *xi, real du, real dvir)
{
  rv2_copy(lj->x + i*2, xi);
  lj->epot += du;
  lj->vir += dvir;
}

/* commit a particle displacement
 * like energysw3d, it does not set pair data, lj->pr
 * call lj_forcesw3d() if it is needed */
INLINE void lj_commitsw3d(lj_t *lj, int i, const real *xi, real du, real dvir)
{
  rv3_copy(lj->x + i*3, xi);
  lj->epot += du;
  lj->vir += dvir;
}


/* Metropolis algorithm */
INLINE int lj_metrosw2d(lj_t *lj, real amp, real bet)
{
  int i;
  real xi[2], du, dvir;

  i = lj_randmv2d(lj, xi, amp);
  du = lj_depotsw2d(lj, i, xi, &dvir);
  if (metroacc1(du, bet)) {
    lj_commitsw2d(lj, i, xi, du, dvir);
    return 1;
  }
  return 0;
}

/* Metropolis algorithm */
INLINE int lj_metrosw3d(lj_t *lj, real amp, real bet)
{
  int i;
  real xi[3], du, dvir;

  i = lj_randmv3d(lj, xi, amp);
  du = lj_depotsw3d(lj, i, xi, &dvir);
  if (metroacc1(du, bet)) {
    lj_commitsw3d(lj, i, xi, du, dvir);
    return 1;
  }
  return 0;
}


/* compute energy data for a 2D Lennard-Jones pair */
INLINE int lj_pairlj2d(real *xi, real *xj, real l, real rc2,
    real *u, real *vir)
{
  real dx[2], dr2, invr2, invr6;
  dr2 = lj_pbcdist2_2d(dx, xi, xj, l);
  if (dr2 < rc2) {
    invr2 = 1.0f / dr2;
    invr6 = invr2 * invr2 * invr2;
    *vir = invr6 * (48.f * invr6 - 24.f); /* f.r */
    *u  = 4.f * invr6 * (invr6 - 1.f);
    return 1;
  } else return 0;
}

/* compute energy data for a 3D Lennard-Jones pair */
INLINE int lj_pairlj3d(real *xi, real *xj, real l, real rc2,
    real *u, real *vir)
{
  real dx[3], dr2, invr2, invr6;
  dr2 = lj_pbcdist2_3d(dx, xi, xj, l);
  if (dr2 < rc2) {
    invr2 = 1.0f / dr2;
    invr6 = invr2 * invr2 * invr2;
    *vir = invr6 * (48.f * invr6 - 24.f); /* f.r */
    *u  = 4.f * invr6 * (invr6 - 1.f);
    return 1;
  } else return 0;
}


/* return the energy change from displacing x[i] to xi */
INLINE real lj_depotlj2d(lj_t *lj, int i, real *xi, real *vir)
{
  int j, n = lj->n;
  real l = lj->l, rc2 = lj->rc2, u = 0.f, du = 0.f, dvir = 0.f;
  rv2_t *x = (rv2_t *) lj->x;

  *vir = 0.0f;
  for (j = 0; j < n; j++) { /* pair */
    if (j == i) continue;
    if (lj_pairlj2d(x[i], x[j], l, rc2, &du, &dvir)) {
      u -= du;
      *vir -= dvir;
    }
    if (lj_pairlj2d(xi, x[j], l, rc2, &du, &dvir)) {
      u += du;
      *vir += dvir;
    }
  }
  return u;
}

/* return the energy change from displacing x[i] to xi */
INLINE real lj_depotlj3d(lj_t *lj, int i, real *xi, real *vir)
{
  int j, n = lj->n;
  real l = lj->l, rc2 = lj->rc2, u = 0.f, du = 0.f, dvir = 0.f;
  rv3_t *x = (rv3_t *) lj->x;

  *vir = 0.0f;
  for (j = 0; j < n; j++) { /* pair */
    if (j == i) continue;
    if (lj_pairlj3d(x[i], x[j], l, rc2, &du, &dvir)) {
      u -= du;
      *vir -= dvir;
    }
    if (lj_pairlj3d(xi, x[j], l, rc2, &du, &dvir)) {
      u += du;
      *vir += dvir;
    }
  }
  return u;
}


/* commit a particle displacement */
INLINE void lj_commitlj2d(lj_t *lj, int i, const real *xi, real du, real dvir)
{
  rv2_copy(lj->x + i*2, xi);
  lj->epot += du;
  lj->vir += dvir;
}

/* commit a particle displacement */
INLINE void lj_commitlj3d(lj_t *lj, int i, const real *xi, real du, real dvir)
{
  rv3_copy(lj->x + i*3, xi);
  lj->epot += du;
  lj->vir += dvir;
}


INLINE int lj_metrolj2d(lj_t *lj, real amp, real bet)
{
  int i;
  real xi[2], du = 0.f, dvir = 0.f;

  i = lj_randmv2d(lj, xi, amp);
  du = lj_depotlj2d(lj, i, xi, &dvir);
  if (metroacc1(du, bet)) {
    lj_commitlj2d(lj, i, xi, du, dvir);
    return 1;
  }
  return 0;
}

INLINE int lj_metrolj3d(lj_t *lj, real amp, real bet)
{
  int i;
  real xi[3], du = 0.f, dvir = 0.f;

  i = lj_randmv3d(lj, xi, amp);
  du = lj_depotlj3d(lj, i, xi, &dvir);
  if (metroacc1(du, bet)) {
    lj_commitlj3d(lj, i, xi, du, dvir);
    return 1;
  }
  return 0;
}


/* return the pair energy between two particles at xi and xj */
INLINE int lj_pair2d(lj_t *lj, real *xi, real *xj, real *u, real *vir)
{
  if (lj->usesq) return lj_pairsq2d(xi, xj, lj->l, lj->ra2, lj->rb2, NULL, lj->esqinf);
  if (lj->usesw) return lj_pairsw2d(lj, xi, xj, u, vir);
  return lj_pairlj2d(xi, xj, lj->l, lj->rc2, u, vir);
}

/* return the pair energy between two particles at xi and xj */
INLINE int lj_pair3d(lj_t *lj, real *xi, real *xj, real *u, real *vir)
{
  if (lj->usesq) return lj_pairsq3d(xi, xj, lj->l, lj->ra2, lj->rb2, NULL, lj->esqinf);
  if (lj->usesw) return lj_pairsw3d(lj, xi, xj, u, vir);
  return lj_pairlj3d(xi, xj, lj->l, lj->rc2, u, vir);
}


/* return the pair energy between two particles at xi and xj */
INLINE int lj_pair(lj_t *lj, real *xi, real *xj, real *u, real *vir)
{
  return lj->d == 2 ?  lj_pair2d(lj, xi, xj, u, vir) : lj_pair3d(lj, xi, xj, u, vir);
}


/* return the energy change from displacing x[i] to xi */
INLINE real lj_depot2d(lj_t *lj, int i, real *xi, real *vir, real *rmin)
{
  if (lj->usesq) return (real) lj_depotsq2d(lj, i, xi, rmin);
  if (lj->usesw) return lj_depotsw2d(lj, i, xi, vir);
  return lj_depotlj2d(lj, i, xi, vir);
}

/* return the energy change from displacing x[i] to xi */
INLINE real lj_depot3d(lj_t *lj, int i, real *xi, real *vir, real *rmin)
{
  if (lj->usesq) return (real) lj_depotsq3d(lj, i, xi, rmin);
  if (lj->usesw) return lj_depotsw3d(lj, i, xi, vir);
  return lj_depotlj3d(lj, i, xi, vir);
}


/* return the energy change from displacing x[i] to xi */
INLINE real lj_depot(lj_t *lj, int i, real *xi, real *vir, real *rmin)
{
  return lj->d == 2 ?  lj_depot2d(lj, i, xi, vir, rmin)
    : lj_depot3d(lj, i, xi, vir, rmin);
}


/* this is defined as a macro for du is `int' in sq case, but real in other cases */
#define lj_commit2d(lj, i, xi, du, dvir) { \
  (lj->usesq) ? lj_commitsq2d(lj, i, xi, du) : \
  (lj->usesw) ? lj_commitsw2d(lj, i, xi, du, dvir) : \
                lj_commitlj2d(lj, i, xi, du, dvir); }

/* this is defined as a macro for du is `int' in sq case, but real in other cases */
#define lj_commit3d(lj, i, xi, du, dvir) { \
  (lj->usesq) ? lj_commitsq3d(lj, i, xi, du) : \
  (lj->usesw) ? lj_commitsw3d(lj, i, xi, du, dvir) : \
                lj_commitlj3d(lj, i, xi, du, dvir); }

/* commit a move */
#define  lj_commit(lj, i, xi, du, dvir) \
  (lj->d == 2 ? lj_commit2d(lj, i, xi, du, dvir) \
              : lj_commit3d(lj, i, xi, du, dvir); )

INLINE int lj_metro2d(lj_t *lj, real amp, real bet)
{
  if (lj->usesq) return lj_metrosq2d(lj, amp, bet);
  if (lj->usesw) return lj_metrosw2d(lj, amp, bet);
  return lj_metrolj2d(lj, amp, bet);
}

INLINE int lj_metro3d(lj_t *lj, real amp, real bet)
{
  if (lj->usesq) return lj_metrosq3d(lj, amp, bet);
  if (lj->usesw) return lj_metrosw3d(lj, amp, bet);
  return lj_metrolj3d(lj, amp, bet);
}


/* Metropolis algorithm */
INLINE int lj_metro(lj_t *lj, real amp, real bet)
{ return lj->d == 2 ? lj_metro2d(lj, amp, bet) : lj_metro3d(lj, amp, bet); }


/* return the energy change of locally displacing a single atom */
INLINE real lj_dupertl2d(lj_t *lj, real amp)
{
  real dvir, xi[2], rmin;
  int i;

  i = lj_randmv2d(lj, xi, amp);
  return lj_depot2d(lj, i, xi, &dvir, &rmin);
}

/* return the energy change of locally displacing a single atom */
INLINE real lj_dupertl3d(lj_t *lj, real amp)
{
  real dvir, xi[3], rmin;
  int i;

  i = lj_randmv3d(lj, xi, amp);
  return lj_depot3d(lj, i, xi, &dvir, &rmin);
}


INLINE real lj_dupertl(lj_t *lj, real amp)
{ return lj->d == 2 ? lj_dupertl2d(lj, amp) : lj_dupertl3d(lj, amp); }


/* return the energy change by random displacements of all atoms */
INLINE real lj_dupertg2d(lj_t *lj, real amp)
{
  int i, d, iep;
  rv2_t *nx;
  real du, vir, rmin, ep0, eps, lap;

  xnew(nx, lj->n);
  amp /= lj->l; /* convert to the reduced unit */
  for (i = 0; i < lj->n; i++)
    for (d = 0; d < 2; d++)
      nx[i][d] = (real) (lj->x[i*2 + d] + amp * (2*rnd0() - 1));
  du = lj_energyx2d(lj, nx, &vir, &iep, &rmin, &ep0, &eps, &lap) - lj->epot;
  free(nx);
  return du;
}

/* return the energy change by random displacements of all atoms */
INLINE real lj_dupertg3d(lj_t *lj, real amp)
{
  int i, d, iep;
  rv3_t *nx;
  real du, vir, rmin, ep0, eps, lap;

  xnew(nx, lj->n);
  amp /= lj->l; /* convert to the reduced unit */
  for (i = 0; i < lj->n; i++)
    for (d = 0; d < 3; d++)
      nx[i][d] = (real) (lj->x[i*3 + d] + amp * (2*rnd0() - 1));
  du = lj_energyx3d(lj, nx, &vir, &iep, &rmin, &ep0, &eps, &lap) - lj->epot;
  free(nx);
  return du;
}


INLINE real lj_dupertg(lj_t *lj, real amp)
{ return lj->d == 2 ? lj_dupertg2d(lj, amp) : lj_dupertg3d(lj, amp); }


/* return the energy caused by inserting a random atom
   the tail correction is not applied */
INLINE real lj_duinsert2d(lj_t *lj, real *xt)
{
  int j, n = lj->n;
  real xt0[2], u, du, dvir;

  if (xt == NULL) for (xt = xt0, j = 0; j < 3; j++) xt[j] = (real) rnd0();
  for (u = 0.f, j = 0; j < n; j++) /* pair energy */
    if (lj_pair(lj, xt, lj->x + 2*j, &du, &dvir))
      u += du;
  return u;
}

/* return the energy caused by inserting a random atom
   the tail correction is not applied */
INLINE real lj_duinsert3d(lj_t *lj, real *xt)
{
  int j, n = lj->n;
  real xt0[3], u, du, dvir;

  if (xt == NULL) for (xt = xt0, j = 0; j < 3; j++) xt[j] = (real) rnd0();
  for (u = 0.f, j = 0; j < n; j++) /* pair energy */
    if (lj_pair(lj, xt, lj->x + 3*j, &du, &dvir))
      u += du;
  return u;
}


INLINE real lj_duinsert(lj_t *lj, real *xt)
{ return lj->d == 2 ? lj_duinsert2d(lj, xt) : lj_duinsert3d(lj, xt); }

/* Lennard-Jones system: molecular dynamics routines */


/* velocity scaling for regular (no thermostat) MD during equilibration
 * `tp' is the target temperature
 * `ekt' is the observed average kinetic energy over several steps */
#define lj_vscale(lj, tp, ekt) \
  md_vscale(lj->v, lj->n * lj->d, lj->dof, tp, ekt, &lj->ekin, &lj->tkin)

#define lj_vrescale(lj, tp, thermdt) \
  md_vrescale(lj->v, lj->n * lj->d, lj->dof, tp, thermdt, &lj->ekin, &lj->tkin)

#define lj_vrescalex(lj, tp, thermdt) \
  md_vrescalex(lj->v, lj->n * lj->d, lj->dof, tp, thermdt, &lj->ekin, &lj->tkin);

#define lj_mcvrescale(lj, tp, thermdt) \
  md_mcvrescale(lj->v, lj->n * lj->d, lj->dof, tp, thermdt, &lj->ekin, &lj->tkin);


#define lj_vv(lj, dt) lj_vvx(lj, 1.f, dt)

/* velocity Verlet */
INLINE void lj_vvx(lj_t *lj, real fscal, real dt)
{
  int i, nd = lj->n * lj->d;
  real dtl = dt / lj->l, dthf = dt * .5f * fscal;

  for (i = 0; i < nd; i++) { /* VV part 1 */
    lj->v[i] += lj->f[i] * dthf;
    lj->x[i] += lj->v[i] * dtl;
  }
  lj_force(lj); /* calculate the new force */
  for (i = 0; i < nd; i++) /* VV part 2 */
    lj->v[i] += lj->f[i] * dthf;

  lj->ekin = md_ekin(lj->v, nd, lj->dof, &lj->tkin);
}


/* Nose-Hoover thermostat/barostat
 * set cutoff to half of the box */
#define lj_hoovertp(lj, dt, tp, pext, zeta, eta, Q, W, ensx) \
  md_hoovertp(lj->v, lj->n, lj->d, lj->dof, dt, tp, pext, zeta, eta, \
      Q, W, lj->vol, lj->vir, lj->p_tail, ensx, &lj->ekin, &lj->tkin)


/* Nose-Hoover chain thermostat/barostat
 * set cutoff to half of the box */
#define lj_nhchaintp(lj, dt, tp, pext, zeta, eta, Q, M, W, ensx) \
  md_nhchaintp(lj->v, lj->n, lj->d, lj->dof, dt, tp, pext, zeta, eta, \
      Q, M, W, lj->vol, lj->vir, lj->p_tail, ensx, &lj->ekin, &lj->tkin)


/* Langevin barostat, with kinetic-energy scaling */
#define lj_langtp(lj, dt, tp, pext, zeta, eta, W, ensx) \
  md_langtp(lj->v, lj->n, lj->d, dt, tp, pext, zeta, eta, \
      W, lj->vol, lj->vir, lj->p_tail, ensx, &lj->ekin, &lj->tkin)


/* position Langevin barostat, with kinetic-energy scaling */
INLINE void lj_langtp0(lj_t *lj, real barodt, real tp, real pext, int ensx)
{
  md_langtp0(lj->v, lj->n, lj->d, barodt, tp, pext, &lj->vol,
      lj->vir, lj->p_tail, ensx, &lj->ekin, &lj->tkin);
  lj_setrho(lj, lj->n/lj->vol);
  lj_force(lj);
}


/* old interface */
#define lj_lgvvolmove(lj, barodt, tp, p) \
  lj_langp0(lj, barodt, tp, p, 0)

/* Langevin barostat, with coordinates only, barodt ~ 1e-5 for n = 108 */
INLINE void lj_langp0(lj_t *lj, real barodt, real tp, real pext, int ensx)
{
  md_langp0(lj->dof, lj->d, barodt, tp, pext, &lj->vol, lj->vir, lj->p_tail, ensx);
  lj_setrho(lj, lj->n/lj->vol);
  lj_force(lj);
}

/* velocity Verlet with the scaling step in the Nose-Hoover barostat */
INLINE void lj_vv_hoovertp(lj_t *lj, real dt, real eta)
{
  int i, nd = lj->n*lj->d;
  real dt2 = dt * .5f, xp;

  for (i = 0; i < nd; i++) /* VV part 1 */
    lj->v[i] += lj->f[i] * dt2;

  /* position update with scaling */
  md_hoovertpdr(lj->x, lj->v, nd, &xp, lj->l, eta, dt);
  lj->l *= xp;
  lj_setrho(lj, lj->rho/(xp*xp*xp));
  lj_force(lj); /* calculate the new force */

  for (i = 0; i < nd; i++) /* VV part 2 */
    lj->v[i] += lj->f[i] * dt2;

  lj->ekin = md_ekin(lj->v, nd, lj->dof, &lj->tkin);
}


/* Berendsen barostat: as a backup for constant pressure simulation */
INLINE void lj_pberendsen(lj_t *lj, real barodt, real tp, real pext)
{
  int i;
  real pint, vn, lo = lj->l, s, dlnv;

  pint = (lj->vir + 2.f * lj->ekin)/ (lj->d * lj->vol) + lj->p_tail;

  /* proposed change of log V */
  dlnv = (pint - pext)*lj->vol/tp*barodt;
  if (dlnv < -0.1) dlnv = -0.1; else if (dlnv > 0.1) dlnv = 0.1;
  vn = log(lj->vol) + dlnv;
  vn = exp(vn);
  lj_setrho(lj, lj->n/vn);
  s = lo/lj->l;
  for (i = 0; i < lj->d * lj->n; i++) lj->v[i] *= s;
  lj->ekin *= s*s;
  lj->tkin *= s*s;
}


/* In Monte Carlo barostats, we compute the energy directly */
#define LJ_FIXEDRC 0x4000

#define lj_mcprescale(lj, lnvamp, tp, pext, vmin, vmax, ensx) \
  lj_mctp(lj, lnvamp, tp, pext, vmin, vmax, ensx, 0)

/* Monte Carlo barostat, with kinetic-energy scaling
 * the ideal gas part computed as \sum p^2/m / V
 * the scaling is r = r*s, p = p/s;
 * set cutoff to half of the box */
INLINE int lj_mctp(lj_t *lj, real lnvamp, real tp, real pext,
    real vmin, real vmax, int ensx, unsigned flags)
{
  int acc = 0, i, d = lj->d;
  real lnlo, lnln, lo, ln, vo, vn, s, epo, bet = 1.f/tp;
  double dex;
  lj_t *lj1;

  vo = lj->vol;
  lo = lj->l;
  lnlo = (real) log(lo);
  lnln = (real) (lnlo + lnvamp/d * (2.f * rnd0() - 1.f));
  ln = (real) exp(lnln);
  for (vn = 1, i = 0; i < d; i++) vn *= ln;
  if (vn < vmin || vn >= vmax)
    return 0;
  if ((flags & LJ_FIXEDRC) && ln < lj->rc * 2)
    return 0; /* box too small */

  epo = lj->epot;
  lj1 = lj_clone(lj, LJ_CPF); /* make a copy */
  lj_setrho(lj, lj->n/vn);
  lj_force(lj); /* we change force here */
  dex = bet * (lj->epot - epo + pext * (vn - vo))
      + bet * (pow(vo/vn, 2.0/d) - 1)*lj->ekin
      + d * (lnlo - lnln) * (1 - ensx);
  if (metroacc1(dex, 1.f)) { /* scale the velocities */
    s = lo/lj->l;
    for (i = 0; i < d * lj->n; i++) lj->v[i] *= s;
    lj->ekin *= s*s;
    lj->tkin *= s*s;
    acc = 1;
  } else {
    lj_copy(lj, lj1, LJ_CPF); /* restore force etc. */
  }
  lj_close(lj1);
  return acc;
}


/* Monte Carlo barostat for the square-well potential, for coordinates only
 * suppose lj->rmin has been correctly set
 * use lnvamp 0.03 ~ 0.06 for 256 system */
INLINE int lj_mcpsq(lj_t *lj, real lnvamp, real tp, real pext,
    real vmin, real vmax, int ensx, unsigned flags)
{
  int acc = 0, i, d = lj->d, iep;
  real lnlo, lnln, vo, vn, lo, ln, rmn = 0, epo, bet = 1.f/tp;
  double dex;

  (void) flags;
  vo = lj->vol;
  lo = lj->l;
  lnlo = (real) log(lo);
  lnln = (real) (lnlo + lnvamp/d * (2.f * rnd0() - 1.f));
  ln = (real) exp(lnln);
  for (vn = 1, i = 0; i < d; i++) vn *= ln;
  if (vn < vmin || vn >= vmax) return 0;

  /* check if there is a clash */
  rmn = lj->rmin * ln / lo;
  if (ln < lo) {
    if (ln < lj->rb * 2) return 0; /* box too small */
    if (rmn < lj->ra) return 0;
  }

  /* compute the change of the square-well energy */
  epo = lj->epot;
  lj_setrho(lj, lj->n/vn); /* commit to the new box */
  if (fabs(lj->ra - lj->rb) < 1e-6) { /* skip the energy calculation */
    iep = 0;
  } else {
    if (d == 3) {
      iep = lj_energysq3d(lj, (rv3_t *) lj->x, &rmn);
    } else {
      iep = lj_energysq2d(lj, (rv2_t *) lj->x, &rmn);
    }
  }
  dex = bet * ((real) iep - epo + pext * (vn - vo))
      - (lj->dof + (1 - ensx) * d) * (lnln - lnlo);
  if (rmn > lj->ra && metroacc1(dex, 1.0)) {
    lj->iepot = iep;
    lj->epot = iep;
    lj->rmin = rmn;
    acc = 1;
  } else {
    lj_setrho(lj, lj->n/vo);
  }
  return acc;
}


/* Monte Carlo barostat, coordinates only
 * use lnvamp 0.03 ~ 0.06 for 256 system */
INLINE int lj_mcplj(lj_t *lj, real lnvamp, real tp, real pext,
    real vmin, real vmax, int ensx, unsigned flags)
{
  int acc = 0, i, d = lj->d;
  real lnlo, lnln, lo, ln, vo, vn, epo, bet = 1.f/tp;
  double dex;
  lj_t *lj1;

  vo = lj->vol;
  lo = lj->l;
  lnlo = (real) log(lo);
  lnln = (real) (lnlo + lnvamp/d * (2.f * rnd0() - 1.f));
  ln = (real) exp(lnln);
  for (vn = 1, i = 0; i < d; i++) vn *= ln;
  if (vn < vmin || vn >= vmax)
    return 0;
  if ((flags & LJ_FIXEDRC) && ln < lj->rc * 2)
    return 0; /* box too small */

  epo = lj->epot;
  lj1 = lj_clone(lj, LJ_CPF); /* save a copy */
  lj_setrho(lj, lj->n/vn); /* commit to the new box */
  lj_force(lj);
  dex = bet * (lj->epot - epo + pext * (vn - vo))
      - (lj->dof + (1 - ensx) * d) * (lnln - lnlo);
  if (metroacc1(dex, 1.0)) {
    acc = 1;
  } else {
    lj_copy(lj, lj1, LJ_CPF);
  }
  lj_close(lj1);
  return acc;
}


/* old interface */
#define lj_volmove(lj, lnlamp, tp, p) \
  lj_mcp(lj, lnlamp*lj->d, tp, p, 0, 1e300, 0, LJ_FIXEDRC)

/* Monte Carlo barostat, coordinates only */
INLINE int lj_mcp(lj_t *lj, real lnvamp, real tp, real pext,
    real vmin, real vmax, int ensx, unsigned flags)
{
  if (lj->usesq) { /* use the specialized square-well version */
    return lj_mcpsq(lj, lnvamp, tp, pext, vmin, vmax, ensx, flags);
  } else { /* use the generic version */
    return lj_mcplj(lj, lnvamp, tp, pext, vmin, vmax, ensx, flags);
  }
}

/* Lennard-Jones system: compute the radial distribution function (RDF)
 * using a specially normalized histogram, cf. hist/testrdf.c */


typedef struct {
  hist_t *rdf; /* histogram for radial distribution function */
  int nfr; /* number of frames in rdf */
  lj_t *lj;
} ljrdf_t;


/* open an ljrdf structure, `rmax' can be 0 */
INLINE ljrdf_t *ljrdf_open(lj_t *lj, double dr, double rmax)
{
  ljrdf_t *ljr;

  xnew(ljr, 1);
  ljr->nfr = 0;
  ljr->lj = lj;
  if (rmax <= 0) rmax = lj->l * .5;
  ljr->rdf = hs_open(1, 0, rmax, dr);
  return ljr;
}


INLINE void ljrdf_close(ljrdf_t *ljr)
{
  hs_close(ljr->rdf);
  free(ljr);
}


/* add pairs to the RDF data */
INLINE int ljrdf_add(ljrdf_t *ljr, unsigned flags)
{
  lj_t *lj = ljr->lj;
  int i, j;
  real rc2, dr2, dx[3];
  double dr;

  rc2 = ljr->lj->l/2;
  rc2 = rc2 * rc2;
  for (i = 0; i < lj->n; i++) {
    for (j = i + 1; j < lj->n; j++) {
      if (lj->d == 2)
        dr2 = lj_pbcdist2_2d(dx, lj->x + 2*i, lj->x + 2*j, lj->l);
      else
        dr2 = lj_pbcdist2_3d(dx, lj->x + 3*i, lj->x + 3*j, lj->l);
      if (dr2 >= rc2) continue;
      dr = sqrt(dr2);
      hs_add(ljr->rdf, &dr, 1.0, flags);
    }
  }
  return ++ljr->nfr; /* number of frames */
}


/* header information in writing rdf */
INLINE int ljrdf_fwheader(FILE *fp, void *pdata)
{
  ljrdf_t *ljr = (ljrdf_t *) pdata;
  fprintf(fp, "RDF %d %d %d %.10e | ",
      ljr->nfr, ljr->lj->d, ljr->lj->n, ljr->lj->l);
  return 0;
}


/* header information in reading rdf */
INLINE int ljrdf_frheader(const char *s, void *pdata)
{
  ljrdf_t *ljr = (ljrdf_t *) pdata;
  lj_t *lj = ljr->lj;
  int ret, d, n;
  double l;

  ret = sscanf(s, " RDF %d%d%d%lf | ", &(ljr->nfr), &d, &n, &l);
  die_if (d != lj->d, "dimension mismatch %d vs. %d (file)\n", lj->d, d);
  die_if (n != lj->n, "# of particle mismatch %d vs. %d (file)\n", lj->n, n);
  die_if (fabs(l - lj->l) > 1e-3, "box size mismatch %d vs. %d (file)\n", lj->l, l);
  return (ret == 4) ? 0 : 1;
}


/* normalization */
INLINE double ljrdf_norm(int row, int i, double xmin, double dx, void *pdata)
{
  int npr;
  double x, vsph;
  ljrdf_t *ljr = (ljrdf_t *) pdata;
  lj_t *lj = ljr->lj;

  (void) row;
  x = xmin + i * dx;
  if (lj->d == 2) vsph = 2. * M_PI * dx * (2*x + dx);
  else vsph = 4. * M_PI * dx * (x*(x + dx) + dx*dx/3.);
  npr = lj->n * (lj->n - 1)/2;
  return lj->vol / (vsph * npr * ljr->nfr);
}


/* save rdf, flags can have HIST_NOZEROES */
INLINE int ljrdf_save(ljrdf_t *ljr, const char *fn, unsigned flags)
{
  return hs_savex(ljr->rdf, fn, ljrdf_fwheader, ljrdf_norm, ljr, flags);
}


/* load rdf, flags can have HIST_ADDITION and/or HIST_VERBOSE */
INLINE int ljrdf_load(ljrdf_t *ljr, const char *fn, unsigned flags)
{
  return hs_loadx(ljr->rdf, fn, ljrdf_frheader, ljrdf_norm, ljr, flags);
}


#endif /* ZCOM_LJ__ */
#endif /* ZCOM_LJ */


/* finishing up */
#ifdef __INTEL_COMPILER
  #pragma warning pop
#elif defined(__GNUC__) && ( __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
  /* diagonistic push and pop are added in GCC 4.6 */
  #pragma GCC diagnostic pop
#endif

#ifdef _MSC_VER
  #pragma warning(pop)
#endif
