/*
  common routines

  Copyright (c) 2006-2011 Cheng Zhang

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation; either version 2
  of the License, or (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.

  Usage:

  1.  It is designed quick programming.
      For simple use, just include this file and all functions will be available.
      But there might be compiler warnings for unused functions.

  2.  You should be able to include this file multiple times in a single file
      without a problem (otherwise a bug!).

  3.  Function are static by default. To export functions,
      e.g., to make it easier to debug, or to avoid warning of unused functions,
      define ZCOM_XFUNCS before the first inclusion.

  4.  To hand-pick specific set of modules,
        #define ZCOM_PICK
        #define ZCOM_RNG
        #define ZCOM_ARGOPT
      before including this file. Other modules will not be compiled.

  5.  If the compiler supports keywords inline and restrict,
        #define INLINE inline
        #define RESRICT restrict
      before including this file.

  6.  Define HAVEVAM if the compiler supports variable-argument macros.

  7.  The def module defines `real' as a double, to override
        typedef float real;
        #define HAVEREAL 1
      before including this file (or equivalently define HAVE_REAL)
*/

/* ZCOM_PICK or ZCOM_NONE is used include only subset of modules to
 * 1. reduce the # of warnings for unused functions
 * 2. accelerate the compiling
 * 3. avoid multiple inclusions
 * By default, ZCOM_PICK is undefined, so everything is used. */
#ifdef ZCOM_NONE  /* equivalent to ZCOM_PICK */
#define ZCOM_PICK
#endif

#ifndef ZCOM_PICK
  #ifndef ZCOM_DEF
  #define ZCOM_DEF
  #endif
  #ifndef ZCOM_UTIL
  #define ZCOM_UTIL
  #endif
  #ifndef ZCOM_SS
  #define ZCOM_SS
  #endif
  #ifndef ZCOM_ENDN
  #define ZCOM_ENDN
  #endif
  #ifndef ZCOM_BIO
  #define ZCOM_BIO
  #endif
  #ifndef ZCOM_RNG
  #define ZCOM_RNG
  #endif
  #ifndef ZCOM_RV2
  #define ZCOM_RV2
  #endif
  #ifndef ZCOM_RV3
  #define ZCOM_RV3
  #endif
  #ifndef ZCOM_EIG
  #define ZCOM_EIG
  #endif
  #ifndef ZCOM_LU
  #define ZCOM_LU
  #endif
  #ifndef ZCOM_SVD
  #define ZCOM_SVD
  #endif
  #ifndef ZCOM_ROTFIT
  #define ZCOM_ROTFIT
  #endif
  #ifndef ZCOM_SAVGOL
  #define ZCOM_SAVGOL
  #endif
  #ifndef ZCOM_SPECFUNC
  #define ZCOM_SPECFUNC
  #endif
  #ifndef ZCOM_ARGOPT
  #define ZCOM_ARGOPT
  #endif
  #ifndef ZCOM_CFG
  #define ZCOM_CFG
  #endif
  #ifndef ZCOM_TRACE
  #define ZCOM_TRACE
  #endif
  #ifndef ZCOM_LOG
  #define ZCOM_LOG
  #endif
  #ifndef ZCOM_AV
  #define ZCOM_AV
  #endif
  #ifndef ZCOM_HIST
  #define ZCOM_HIST
  #endif
  #ifndef ZCOM_MDS
  #define ZCOM_MDS
  #endif
  #ifndef ZCOM_PDB
  #define ZCOM_PDB
  #endif
  #ifndef ZCOM_CLUS
  #define ZCOM_CLUS
  #endif
  #ifndef ZCOM_ISING2
  #define ZCOM_ISING2
  #endif
  #ifndef ZCOM_POTTS2
  #define ZCOM_POTTS2
  #endif
  #ifndef ZCOM_MD
  #define ZCOM_MD
  #endif
  #ifndef ZCOM_LJ
  #define ZCOM_LJ
  #endif
  #ifndef ZCOM_ABPRO
  #define ZCOM_ABPRO
  #endif
  #ifndef ZCOM_CAGO
  #define ZCOM_CAGO
  #endif
  #ifndef ZCOM_TMH
  #define ZCOM_TMH
  #endif
#endif

/* build dependencies */
#ifdef ZCOM_TMH
  #define ZCOM_HIST
  #define ZCOM_RNG
  #define ZCOM_UTIL
#endif

#ifdef ZCOM_CAGO
  #define ZCOM_MD
  #define ZCOM_PDB
  #define ZCOM_AV
  #define ZCOM_ROTFIT
#endif

#ifdef ZCOM_ABPRO
  #define ZCOM_MD
  #define ZCOM_UTIL
#endif

#ifdef ZCOM_LJ
  #define ZCOM_MD
  #define ZCOM_UTIL
#endif

#ifdef ZCOM_MD
  #define ZCOM_RV3
  #define ZCOM_RV2
  #define ZCOM_RNG
#endif

#ifdef ZCOM_POTTS2
  #define ZCOM_RNG
  #define ZCOM_UTIL
#endif

#ifdef ZCOM_ISING2
  #define ZCOM_RNG
  #define ZCOM_UTIL
#endif

#ifdef ZCOM_CLUS
  #define ZCOM_MDS
  #define ZCOM_RNG
#endif

#ifdef ZCOM_PDB
  #define ZCOM_RV3
  #define ZCOM_UTIL
#endif

#ifdef ZCOM_MDS
  #define ZCOM_EIG
#endif

#ifdef ZCOM_LOG
  #define ZCOM_SS
#endif

#ifdef ZCOM_TRACE
  #define ZCOM_SS
#endif

#ifdef ZCOM_CFG
  #define ZCOM_SS
#endif

#ifdef ZCOM_ARGOPT
  #define ZCOM_UTIL
  #define ZCOM_DEF
#endif

#ifdef ZCOM_SAVGOL
  #define ZCOM_LU
#endif

#ifdef ZCOM_ROTFIT
  #define ZCOM_RV3
#endif

#ifdef ZCOM_SVD
  #define ZCOM_UTIL
  #define ZCOM_DEF
#endif

#ifdef ZCOM_LU
  #define ZCOM_DEF
#endif

#ifdef ZCOM_EIG
  #define ZCOM_UTIL
  #define ZCOM_DEF
#endif

#ifdef ZCOM_RV3
  #define ZCOM_DEF
#endif

#ifdef ZCOM_RV2
  #define ZCOM_DEF
#endif

#ifdef ZCOM_BIO
  #define ZCOM_ENDN
#endif


/* manage storage class: static is the safer choice
   to avoid naming conclict.  Example:
   both m.c and n.c include this file,
   m.c --> m.o, n.c --> n.o, m.o+n.o --> a.out
   static is the only way to avoid naming conflict in this case.

   In case that this file is included multiple times,
   ZCOM_XFUNCS should be defined before the first inclusion,
   otherwise it won't be effective in deciding storage class. */
#ifndef STRCLS
  #ifndef ZCOM_XFUNCS
    #define STRCLS static
  #else
    #define STRCLS
  #endif
#endif

/* inline keyword */
#ifndef INLINE
  #if defined(__GNUC__) || defined(__xlC__)
    #define INLINE STRCLS __inline__
  #elif defined(_MSC_VER) || defined(__BORLANDC__)
    #define INLINE __inline STRCLS
  #elif defined(__STDC_VERSION__) && (STDC_VERSION__ >= 199901L)
    #define INLINE STRCLS inline
  #else
    #define INLINE STRCLS
  #endif
#endif

/* restrict keyword */
#ifndef RESTRICT
  #if (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__xlC__))
    #define RESTRICT __restrict
  #elif defined(__STDC_VERSION__) && (STDC_VERSION__ >= 199901L)
    #define RESTRICT restrict
  #else
    #define RESTRICT
  #endif
#endif

/* macros with variable-length arguments */
#ifndef HAVEVAM
  #if (  (defined(__GNUC__) && (__GNUC__ >= 3))   \
      || (defined(__xlC__)  && (__xlC__ >= 0x0700)) \
      || (defined(_MSC_VER) && (_MSC_VER >= 1400)) )
    #define HAVEVAM 1
  #endif
#endif

#ifdef __INTEL_COMPILER
  #pragma warning(disable:981) /* unspecified order warning */
  #pragma warning(disable:177) /* unreferenced function */
#endif

#ifdef _MSC_VER
  #pragma warning(disable:4127) /* conditional expression constant */
  #pragma warning(disable:4505) /* unreferenced function */
  #pragma warning(disable:4514) /* unreferenced inline */
  #pragma warning(disable:4710) /* not inlined */
#endif

/* In addition to ZCOM_ABC, we have to define another macro ZCOM_ABC__
 * in order to avoid multiple inclusions.
 * A single ZCOM_ABC__ won't do because different module-set may be selected */

#ifdef  ZCOM_DEF
#ifndef ZCOM_DEF__
#define ZCOM_DEF__

/* define a real type */
#ifdef HAVE_REAL
  #ifndef HAVEREAL
  #define HAVEREAL HAVE_REAL
  #endif
#endif

#ifndef HAVEREAL
  #define HAVEREAL 1
  typedef double real;
#endif

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#endif /* ZCOM_DEF__ */
#endif /* ZCOM_DEF */

#ifdef  ZCOM_UTIL
#ifndef ZCOM_UTIL__
#define ZCOM_UTIL__

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdarg.h>
#include <math.h>

#ifndef xnew
#define xnew(x, n) \
  if (#n[0] != '1' && (n) <= 0) { \
    fprintf(stderr, "cannot allocate %d objects for %s\n", (int) (n), #x); \
    exit(1); \
  } else if ((x = calloc(n, sizeof(*(x)))) == NULL) { \
    fprintf(stderr, "no memory for %s x %u\n", #x, (unsigned) (n)); \
    exit(1); }
#endif

#ifndef xrenew
#define xrenew(x, n) \
  if ((n) <= 0) { \
    fprintf(stderr, "cannot allocate %d objects for %s\n", (int) (n), #x); \
    exit(1); \
  } else if ((x = realloc(x, (n)*sizeof(*(x)))) == NULL) { \
    fprintf(stderr, "no memory for %s x %u\n", #x, (unsigned) (n)); \
    exit(1); }
#endif

/* print an error message */
INLINE void perrmsg__(const char *file, int line, const char *why,
    int err, const char *fmt, va_list args)
{
  if (err) fprintf(stderr, "error: ");
  vfprintf(stderr, fmt, args);
  if (fmt[strlen(fmt) - 1] != '\n')
    fprintf(stderr, "\n"); /* add a new line if needed */
  if (err) {
    if (file != NULL) fprintf(stderr, "file: %s\n", file);
    if (line > 0) fprintf(stderr, "line: %d\n", line);
    if (why != NULL && strcmp(why, "1") != 0)
      fprintf(stderr, "cond: %s\n", why);
  }
}

#ifdef HAVEVAM

INLINE void perrmsg_(const char *file, int line, const char *why,
    int cond, int err, const char *fmt, ...)
{
  va_list args;

  if (cond) {
    va_start(args, fmt);
    perrmsg__(file, line, why, err, fmt, args);
    va_end(args);
    if (err) exit(1);
  }
}

#define die_if(cond, fmt, ...) \
  perrmsg_(__FILE__, __LINE__, #cond, cond, 1, fmt, ## __VA_ARGS__)
#define msg_if(cond, fmt, ...) \
  perrmsg_(__FILE__, __LINE__, #cond, cond, 0, fmt, ## __VA_ARGS__)
#define fatal(fmt, ...)  die_if(1, fmt, ## __VA_ARGS__)

#else /* !HAVEVAM */

#define PERRMSG__(c, x) {                     \
  va_list args;                               \
  if ((#c[0] == '1' && #c[1] == '\0') || c) { \
    va_start(args, fmt);                      \
    perrmsg__(NULL, -1, NULL, x, fmt, args);  \
    va_end(args);                             \
    if (#x[0] == '1') exit(1);                \
  } }
INLINE void die_if(int cond, const char *fmt, ...) PERRMSG__(cond, 1)
#ifdef USE_MSG_IF
INLINE void msg_if(int cond, const char *fmt, ...) PERRMSG__(cond, 0)
#endif
#ifdef USE_FATAL
STRCLS void fatal(const char *fmt, ...) PERRMSG__(1, 1)
#endif
#undef PERRMSG__

#endif /* HAVEVAM */

#define xfopen(fp, fn, fmt, err) \
  if ((fp = fopen(fn, fmt)) == NULL) { \
    fprintf(stderr, "cannot open file %s\n", fn); err; }

INLINE int fexists(const char *fn)
{
  FILE *fp;
  if ((fp = fopen(fn, "r")) == NULL) return 0;
  else { fclose(fp); return 1; }
}

/* sqrt(x*x + y*y) */
INLINE double dblhypot(double x, double y)
{
  double t;
  x = fabs(x);
  y = fabs(y);
  if (x <= 0.) return y;
  else if (y <= 0.) return x;
  if (x < y) t = x, x = y, y = t;
  t = y/x;
  return x*sqrt(1+t*t);
}

/* round x to a multiple dx  */
INLINE double dblround(double x, double dx)
{
  if (x > 0) return dx * (int)(x/dx+.5-1e-14);
  else return -dx * (int)(-x/dx+.5-1e-14);
}

INLINE double dblsqr(double x) { return x*x; }

INLINE double dblmax(double x, double y) { return x > y ? x : y; }
INLINE double dblmin(double x, double y) { return x < y ? x : y; }
/* confine x within [xmin, xmax] */
INLINE double dblconfine(double x, double xmin, double xmax)
  { return x < xmin ? xmin : x > xmax ? xmax : x; }

#ifndef LNADD_DEFINED
#define LNADD_DEFINED
#define LN_BIG 50.0

/* log(exp(a) + exp(b)) */
INLINE double lnadd(double a, double b)
{
  double c;
  if (a < b) { c = a; a = b; b = c; } /* ensure a >= b */
  return ((c = a-b) > LN_BIG) ? a : a + log(1 + exp(-c));
}

/* log(exp(a)-exp(b)), only works for a>b */
INLINE double lndif(double a, double b)
{
  double c;
  die_if (a < b, "lndif: %g < %g\n", a, b);
  return ((c = a-b) > LN_BIG) ? a : a + log(1 - exp(-c));
}

/* log(exp(a)+b) */
INLINE double lnaddn(double a, double b)
{
  return (a > LN_BIG) ? a : a + log(1 + b*exp(-a));
}

#undef LN_BIG
#endif  /* LNADD_DEFINED */

/* string manipulation */
#define ZSTR_XSPACEL  0x0001
#define ZSTR_XSPACER  0x0002
#define ZSTR_XSPACE   (ZSTR_XSPACEL|ZSTR_XSPACER)
#define ZSTR_COPY     0x0004
#define ZSTR_CAT      0x0008
#define ZSTR_CASE     0x0100
#define ZSTR_UPPER_   0x0200
#define ZSTR_UPPER    (ZSTR_CASE|ZSTR_UPPER_)
#define ZSTR_LOWER    ZSTR_CASE

/* remove leading and trailing spaces */
#define strip(s)  stripx(s, ZSTR_XSPACE)
#define lstrip(s) stripx(s, ZSTR_XSPACEL)
#define rstrip(s) stripx(s, ZSTR_XSPACER)
INLINE char *stripx(char *s, unsigned flags)
{
  char *p;

  if (flags & ZSTR_XSPACEL) { /* remove leading spaces */
    for (p = s; isspace(*p); p++) ;
    if (*p == '\0') *s = '\0';
    else memmove(s, p, strlen(p)+1);
  }
  if (flags & ZSTR_XSPACER) /* remove trailing spaces */
    for (p = s + strlen(s) - 1; p >= s && isspace(*p); p--)
      *p = '\0';
  return s;
}

/* in the follows, size_s means the buffer size of s, i.e., sizeof(s) for static strings */
/* copy the string and convert it to upper/lower case */
#define strcpy2u(s, t, size_s) strcnv(s, t, size_s - 1, ZSTR_COPY|ZSTR_UPPER)
#define strcpy2l(s, t, size_s) strcnv(s, t, size_s - 1, ZSTR_COPY|ZSTR_LOWER)
#define strcpy_sf(s, t, size_s) strcnv(s, t, size_s - 1, ZSTR_COPY)
/* concatenate strings, the last parameter is the buffer size of s,
 * unlike strncat(), in which it's the number of characters from *t* to be copied.  */
#define strcat_sf(s, t, size_s) strcnv(s, t, size_s - 1, ZSTR_CAT)
/* safely copy/cat strings with case conversion
 * unlike strncpy(), s is always null-terminated on return: it copies at most
 * len nonblank characters, i.e., s[len] = '\0' for the longest output */
INLINE char *strcnv(char *s, const char *t, size_t len, unsigned flags)
{
  size_t i = 0, j;
  unsigned docase = flags & ZSTR_CASE, up = flags & ZSTR_UPPER_;

  if (len == 0 || s == NULL || t == NULL) return s;
  if (flags & ZSTR_CAT) while(s[i]) i++;
  for (j = 0; i < len; i++, j++) {
    if (docase && t[j]) {
      if (up) s[i] = (char) (unsigned char) toupper((unsigned char) t[j]);
      else    s[i] = (char) (unsigned char) tolower((unsigned char) t[j]);
    } else s[i] = t[j];
    if (t[j] == 0) break;
  }
  if (i == len) s[i] = '\0';
  if (flags & ZSTR_XSPACE) stripx(s, flags); /* call strip */
  return s;
}

/* compare strings without case */
#define strcmpnc(s, t) strncmpnc(s, t, -1)
INLINE int strncmpnc(const char *s, const char *t, int n)
{
  int i, cs, ct;

  if (s == NULL || t == NULL) return 0;
  for (i = 0; ; i++) {
    if (i >= n) return 0;
    cs = s[i];
    ct = t[i];
    if (cs == 0 || ct == 0) break;
    cs = toupper( (unsigned char) cs );
    ct = toupper( (unsigned char) ct );
    if (cs != ct) break;
  }
  return cs-ct;
}

#endif /* ZCOM_UTIL__ */
#endif /* ZCOM_UTIL */

#ifdef  ZCOM_SS
#ifndef ZCOM_SS__
#define ZCOM_SS__

#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <stdlib.h>

enum { SSCAT = 1, SSDELETE = 2, SSSHRINK = 3, SSSINGLE = 0x1000 };

#define ssnew(n)       sscpycatx(NULL, NULL, (n),    0)
#define ssdup(t)       sscpycatx(NULL, (t),   0,     0)
#define sscpy(s, t)    sscpycatx(&(s), (t),   0,     0)
#define sscat(s, t)    sscpycatx(&(s), (t),   0, SSCAT)
#define ssdel(s)       ssmanage((s), SSDELETE|SSSINGLE)
#define ssdelete(s)    { ssdel(s); (s)=NULL; }
#define ssshrink(s)    ssmanage((s), SSSHRINK|SSSINGLE)
#define ssdelall()     ssmanage(NULL, SSDELETE)
#define ssshrinkall()  ssmanage(NULL, SSHRINK)
#define ssfgets(s, pn, fp)    ssfgetx(&(s), (pn), '\n', (fp))
#define ssfgetall(s, pn, fp)  ssfgetx(&(s), (pn), EOF, (fp))

STRCLS int   ssmanage(char *, unsigned);
STRCLS char *sscpycatx(char **, const char *, size_t, unsigned);
STRCLS char *ssfgetx(char **, size_t *, int, FILE *fp);


#ifndef SSMINSIZ /* to override the block size, define it before inclusion */
#define SSMINSIZ 256 /* change this value to 1 for debugging */
#endif
#ifndef SSHASHBITS
#define SSHASHBITS 8
#endif
#define SSHASHSIZ   (1 << SSHASHBITS)
#define SSOVERALLOC 1
#define sscalcsize_(n) (((n)/SSMINSIZ + 1) * SSMINSIZ) /* size for n nonblank characters */

struct ssheader {
  size_t size;
  size_t hashval;
  struct ssheader *next;
} ssbase_[SSHASHSIZ] = {{ 0u, 0u, NULL }};

/* we use the string address instead of that of the pointer
 * to struct ssheader to compute the Hash value,
 * because the former is more frequently used in e.g. looking-up
 * */
static size_t sshashval_(const char *p)
{
  size_t val = (size_t) p * 1664525u + 1013904223u;
  return (val >> (sizeof(size_t)*8-SSHASHBITS)) & ((1<<SSHASHBITS)-1);
}

/*
 * return the *previous* header to the one that associates with s
 * first locate the list from the Hash value, then enumerate the linked list.
 * */
static struct ssheader *sslistfind_(const char *s)
{
  struct ssheader *hp, *head;

  if (s == NULL) return NULL;
  head = ssbase_ + sshashval_(s);
  if (head->next == NULL) return NULL; /* uninitialized head node */
  for (hp = head; hp->next != head; hp = hp->next)
    if ((char *)(hp->next + 1) == s)
      return hp;
  return NULL;
}

/*
 * simply add the entry h at the begining of the list
 * we do not accept a precalculated hash value,
 * since realloc might have changed it
 * */
static struct ssheader *sslistadd_(struct ssheader *h)
{
  struct ssheader *head;

  head = ssbase_ + sshashval_( (char *)(h+1) );
  if (head->next == NULL) /* initialize the base */
    head->next = head;
  h->next = head->next;
  head->next = h;
  return head;
}

/* remove hp->next */
static void sslistremove_(struct ssheader *hp, int f)
{
  struct ssheader *h = hp->next;

  hp->next = h->next;
  if (f) free(h);
}

/* (re)allocate memory for (*php)->next, update list, return the new string
 * n is the number of nonempty characters, obtained e.g. from strlen().
 * create a new header if *php is NULL, in this case, the first character
 * of the string is '\0'
 * */
static char *ssresize_(struct ssheader **php, size_t n, unsigned flags)
{
  struct ssheader *h = NULL, *hp;
  size_t size;

  if (php == NULL) {
    fprintf(stderr, "ssresize_: php is NULL, n = %u", (unsigned) n);
    return NULL;
  }

  /* we use the following if to assign hp and h, so the order is crucial */
  if ((hp = *php) == NULL || (h = hp->next)->size < n + 1 || !(flags & SSOVERALLOC)) {
    size = sscalcsize_(n);
    if (h == NULL || size != h->size) {
      /* since realloc will change the hash value of h
       * we have to remove the old entry first without free()
       * hp->next will be freed by realloc */
      if (hp != NULL)
        sslistremove_(hp, 0);
      if ((h = realloc(h, sizeof(*h)+size)) == NULL) {
        fprintf(stderr, "ssresize_: no memory for %u\n", (unsigned) size);
        return NULL;
      }
      if (hp == NULL) /* clear the first byte if we start from nothing */
        *(char *)(h + 1) = '\0';  /* h + 1 is the beginning of the string */
      *php = hp = sslistadd_(h);
      hp->next->size = size;
    }
  }
  return (char *)(hp->next + 1);
}

static void ssmanage_low_(struct ssheader *hp, unsigned opt)
{
  if (opt == SSDELETE)
    sslistremove_(hp, 1);
  else if (opt == SSSHRINK)
    ssresize_(&hp, strlen((char *)(hp->next+1)), 0);
}

/* delete a string, shrink memory, etc ... */
int ssmanage(char *s, unsigned flags)
{
  struct ssheader *hp, *head;
  unsigned opt = flags & 0xFF;
  size_t i;

  if (flags & SSSINGLE) {
    if (s == NULL || (hp = sslistfind_(s)) == NULL) {
      if (s) fprintf(stderr, "ssmanage: unknown address %p (%s)\n",  s, s);
      return -1;
    }
    ssmanage_low_(hp, opt);
  } else {
    for (i = 0; i < SSHASHSIZ; i++)
      for (hp = head = ssbase_+i; hp->next && hp->next != head; hp = hp->next)
        /* we must not operate on h itself, which renders the iterator h invalid */
        ssmanage_low_(hp, opt);
  }
  return 0;
}

/*
 * copy/cat t to *ps
 *
 * If (flags & SSCAT) == 0:
 * copy t to *ps, if ps is not NULL, and return the result
 * if ps or *ps is NULL, we return a string created from t
 *   *ps is set to the same value if ps is not NULL
 * otherwise, we update the record that corresponds to *ps
 *
 * minsize: to request a minimal size for the resulting buffer
 *
 * If flags & SSCAT:
 * append t after *ps. Equivalent to cpy if ps or *ps is NULL.
 * */
char *sscpycatx(char **ps, const char *t, size_t minsize, unsigned flags)
{
  struct ssheader *hp = NULL;
  size_t size = 0u, sizes = 0u;
  char *s = NULL, *p;

  /* both ps and *ps can be NULL, in which cases we leave hp as NULL */
  if (ps != NULL && (s = *ps) != NULL && (hp = sslistfind_(s)) == NULL) {
    fprintf(stderr, "sscpycatx: unknown address %p (%s)\n", s, s);
    return NULL;
  }
  if (t != NULL)
    while (t[size]) /* compute the length of t */
      size++;
  if (flags & SSCAT) {
    if (s != NULL)  /* s is also NULL, if ps itself is NULL */
      while (s[sizes]) /* compute the length of s */
        sizes++;
    size += sizes;
  }  /* sizes is always 0 in case of copying */
  if (size < minsize)
    size = minsize;
  if ((s = ssresize_(&hp, size, SSOVERALLOC)) == NULL) { /* change size */
    return NULL;
  }
  if (t != NULL)
    for (p = s + sizes; (*p++ = *t++); ) /* copy/cat the string */
      ;
  if (ps != NULL)
    *ps = s;
  return s;
}

/* get a string *ps from file fp
 * *ps can be NULL, in which case memory is allocated
 * *pn is number of characters read (including '\n', but not the terminal null)
 * delim is the '\n' for reading a singe line
 * */
char *ssfgetx(char **ps, size_t *pn, int delim, FILE *fp)
{
  size_t n, max;
  int c;
  char *s;
  struct ssheader *hp;

  if (ps == NULL || fp == NULL)
    return NULL;
  if ((s = *ps) == NULL) /* allocate an initial buffer if *ps is NULL */
    if ((s = sscpycatx(ps, NULL, 0, 0u)) == NULL)
      return NULL;
  if ((hp = sslistfind_(s)) == NULL) {
    fprintf(stderr, "ssfgetx: unknown address %p (%s)\n", s, s);
    return NULL;
  }
  max = hp->next->size-1;
  for (n = 0; (c = fgetc(fp)) != EOF; ) {
    if (n+1 > max) { /* request space for n+1 nonblank characters */
      if ((*ps = s = ssresize_(&hp, n+1, SSOVERALLOC)) == NULL)
        return NULL;
      max = hp->next->size - 1;
    }
    s[n++] = (char)(unsigned char) c;
    if (c == delim)
      break;
  }
  s[n] = '\0';
  if (pn != NULL)
    *pn = n;
  return (n > 0) ? s : NULL;
}
#endif /* ZCOM_SS__ */
#endif /* ZCOM_SS */

#ifdef  ZCOM_ENDN
#ifndef ZCOM_ENDN__
#define ZCOM_ENDN__


#include <stdio.h>
#include <string.h>

STRCLS int endn_system(void);
STRCLS size_t endn_fwrite(void *ptr, size_t size, size_t n, FILE *fp, int endn);
STRCLS size_t endn_fread(void *ptr, size_t size, size_t n, FILE *fp, int flip);
STRCLS int endn_rmatch(void *src, const void *ref, size_t size, FILE *fp);
STRCLS int endn_rmatchi(int *src, int iref, FILE *fp);


/* return the system endian, 1: big endian, 0: little endian */
int endn_system(void)
{
  unsigned feff = 0xFEFF; /* assume unsigned is at least 16-bit */
  unsigned char *p;

  p  = (unsigned char *) &feff;
  return (*p == 0xFF) ? 0 : 1;
}

/* change endianness in-place for n items of size in ptr */
__inline void endn_flip(void *ptr, size_t size, size_t n)
{
  unsigned char *p = (unsigned char *) ptr, ch;
  size_t i, r, half = size/2;

  for (; n > 0; n--, p += size) {
    /* reverse bytes for each object */
    for (i = 0; i < half; i++) {
      r = size - i - 1;
      ch   = p[i];
      p[i] = p[r];
      p[r] = ch;
    }
  }
}

/* write data in ptr to file with a specific endian 'endn'
 * `ptr' is not const, because it needs to change its endian */
size_t endn_fwrite(void *ptr, size_t size, size_t n, FILE *fp, int endn)
{
  static int endsys = -1;

  /* initial determine the machine's endianess */
  if (endsys < 0) endsys = endn_system();
  if (endn == endsys) return fwrite(ptr, size, n, fp);

  endn_flip(ptr, size, n);
  n = fwrite(ptr, size, n, fp);
  endn_flip(ptr, size, n);
  return n;
}

/* read an object test object *src, compared with *ref
 * return 0 if they are identical without endian change
 * return 1 if changing the endianness of *src matches *ref
 * otherwise return -1 */
int endn_rmatch(void *src, const void *ref, size_t size, FILE *fp)
{
  if (1 != fread(src, size, 1, fp))
    return -1;
#ifdef ENDN_DBG
  if (size == sizeof(int))
    printf("A: 0x%X vs. 0x%X size = %u, cmp = %d\n",
      *(int *)src, *(int *)ref, (unsigned)size,
      memcmp(src, ref, size));
#endif
  if (memcmp(src, ref,  size) == 0)
    return 0;
  /* alter the endianness, and test again */
  endn_flip(src, size, 1);
#ifdef ENDN_DBG
  if (size == sizeof(int))
    printf("B: 0x%X vs. 0x%X size = %u, cmp = %d\n",
      *(int *)src, *(int *)ref, (unsigned)size,
      memcmp(src, ref, size));
#endif
  return (memcmp(src, ref, size) == 0) ? 1 : -1;
}

/* special case of endn_rmatchi for integer, convenient because
 * iref could be e.g. sizeof(int), which has no address */
int endn_rmatchi(int *src, int iref, FILE *fp)
{
  return endn_rmatch(src, &iref, sizeof(int), fp);
}

/* read data from file to ptr with endianness changed if 'flip' is 1
 * flip can be initialized by calling endn_rmatch() for a test object */
size_t endn_fread(void *ptr, size_t size, size_t n, FILE *fp, int flip)
{
  n = fread(ptr, size, n, fp);
  if (flip) endn_flip(ptr, size, n);
  return n;
}

#endif /* ZCOM_ENDN__ */
#endif /* ZCOM_ENDN */

#ifdef  ZCOM_BIO
#ifndef ZCOM_BIO__
#define ZCOM_BIO__


#include <stdio.h>

/*
 * Helper macros for reading binary files with endianness
 * support.  However, sizeof(int) must remain the same
 * between system and file.
 *
 * To use these macros in a function:
 * 1. define the following variables in your function
 *   FILE *fp;
 *   int endn, err;
 *   (no need for to define `endn' or `err' in writing a file)
 *
 * 2. define a label ERR for error exit
 *
 * 3. in reading a file, use BIO_INITENDIAN to determine
 *    the correct endianness
 * */

#ifndef BIO_ENDNDEF
#define BIO_ENDNDEF 1  /* big endian */
#endif

/* string for printing file name and line number */
#define BIO_FLFMT_ "file: %s, line: %d"

/* check type */
#define BIO_CHECKTP_(x, tp)                                           \
  if (sizeof(x) != sizeof(tp)) {                                      \
    fprintf(stderr, "%s is not %s\n", #x, #tp);                       \
    goto ERR;                                                         \
  }

/* initialize file endian state to variable 'endn'
 * endn = 1: a conversion is needed from file's endianess to system's
 * endn = 0: otherwise
 * read an int variable x,
 * determine endian by comparing the value of x with ref
 * quit if neither endians makes x == ref */
#define BIO_INIT_ENDIAN(x, ref) {                                     \
  BIO_CHECKTP_(x, int)                                                \
  if ((endn = endn_rmatchi(&(x), ref, fp)) < 0) {                     \
    fprintf(stderr, "%s 0x%X cannot match %s 0x%X\n",                 \
      #x, (unsigned) x, #ref, (unsigned) ref);                        \
    goto ERR;                                                         \
  } }

/* read an array of size n, set err, fix endian */
#define BIO_RATOM_(arr, n)                                                      \
  if ((n) > 0 && endn_fread(arr, sizeof(*(arr)), n, fp, endn) != (size_t) n) {  \
    fprintf(stderr, "error while reading %s, size %u, "                         \
        BIO_FLFMT_ "\n", #arr, (unsigned) n, __FILE__, __LINE__);               \
    err = 1;                                                                    \
  } else { err = 0; }

/* read an array, set error */
#define BIO_RNA_(arr, n, tp) BIO_CHECKTP_(*(arr), tp) BIO_RATOM_(arr, n)
/* read a single variable x of type tp, set err if error occurs */
#define BIO_R1A_(x, tp) BIO_RNA_(&(x), 1, tp)

/* goto ERR if error occurs during reading */
#define BIO_RNB_(arr, n, tp) { BIO_RNA_(arr, n, tp); if (err) goto ERR; }
#define BIO_R1B_(x, tp) { BIO_R1A_(x, tp); if (err) goto ERR; }

/* most common: int and double cases */
#define BIO_RI_ERR(x)     BIO_R1A_(x, int)
#define BIO_RI(x)         BIO_R1B_(x, int)
#define BIO_RIARR(x, n)   BIO_RNB_(x, n, int)

#define BIO_RD_ERR(x)     BIO_R1A_(x, double)
#define BIO_RD(x)         BIO_R1B_(x, double)
#define BIO_RDARR(x, n)   BIO_RNB_(x, n, double)

/* match a temperory int x with the reference var */
#define BIO_MI(x, var)                                                \
  if ((x) != (var)) {                                                 \
    fprintf(stderr, "%s mismatch, expect: %d, read: %d "              \
        BIO_FLFMT_ "\n", #var, (int) var, x, __FILE__, __LINE__);     \
    goto ERR; }

/* match a temperory double x with the reference var */
#define BIO_MD(x, var, eps)                                           \
  if (fabs((x) - (var)) > eps) {                                      \
    fprintf(stderr, "%s mismatch, expect: %g, read: %g "              \
        BIO_FLFMT_ "\n", #var, var, x, __FILE__, __LINE__);           \
    goto ERR; }

/* read an int to x, match it with xref */
#define BIO_RMI(x, xref)       BIO_RI(x); BIO_MI(x, xref)
/* read a double to x, match it with xref */
#define BIO_RMD(x, xref, eps)  BIO_RD(x); BIO_MD(x, xref, eps)

/* write an array of size n with endian being BIO_ENDNDEF
 * we do not set err, directly goto ERR */
#define BIO_WATOM_(arr, n)                                            \
  if ((n) > 0 &&  (size_t) (n) !=                                     \
      endn_fwrite(arr, sizeof(*(arr)), n, fp, BIO_ENDNDEF) ) {        \
    fprintf(stderr, "error while reading %s, size %u, "               \
        BIO_FLFMT_ "\n", #arr, (unsigned) n, __FILE__, __LINE__);     \
    goto ERR;                                                         \
  }

/* write an array, go to ERR if error occurs */
#define BIO_WNB_(arr, n, tp) BIO_CHECKTP_(*(arr), tp) BIO_WATOM_(arr, n)
/* write a single variable, go to ERR if error occurs */
#define BIO_W1B_(x, tp) BIO_WNB_(&(x), 1, tp)

#define BIO_WI(x)           BIO_W1B_(x, int)
#define BIO_WIARR(x, n)     BIO_WNB_(x, n, int)
#define BIO_WD(x)           BIO_W1B_(x, double)
#define BIO_WDARR(x, n)     BIO_WNB_(x, n, double)


#endif /* ZCOM_BIO__ */
#endif /* ZCOM_BIO */

#ifdef  ZCOM_RNG
#ifndef ZCOM_RNG__
#define ZCOM_RNG__

#include <stdio.h>
#include <string.h>
#include <math.h>

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
  #include <inttypes.h>
#elif defined(_MSC_VER) || defined(__BORLANDC__)
  typedef unsigned uint32_t;
  typedef unsigned __int64 uint64_t;
#else
  #include <inttypes.h>
#endif

#ifndef PRIu32
  #if (defined(_MSC_VER) && (_MSC_VER >= 1300)) || defined(__BORLANDC__)
    #define PRIu32 "I32u"
  #else
    #define PRIu32 "u"
  #endif
#endif

#ifndef PRIu64
  #if defined(_MSC_VER) || defined(__BORLANDC__)
    #define PRIu64 "I64u"
  #else
    #define PRIu64 "llu"
  #endif
#endif

#define rand32()  mtrand()
#define rnd0()    ((1.0/4294967296.0) * rand32()) /* double, [0, 1) */

#define MTFILE    "MTSEED"  /* default file */
#define MTSEED    5489UL    /* default seed */
STRCLS int mtsave(const char *fname);
STRCLS int mtload(const char *fname, uint32_t seed);
STRCLS uint32_t mtrand(void);
STRCLS double grand0(void);


/* Mersenne Twister was developped by Makoto Matsumoto and Takuji Nishimura */
#define MT_N 624
#define MT_M 397
#define MT_UMASK 0x80000000UL /* most significant w-r bits */
#define MT_LMASK 0x7fffffffUL /* least significant r bits */

int mtidx_ = -1; /* index in mt_, -1: uninitialized */
uint32_t mt_[MT_N]; /* array for the mt state vector */

/* save the current mt state to file */
int mtsave(const char *fname)
{
  FILE *fp;
  int k;

  if (mtidx_ < 0) return 1; /* RNG was never used, so it cannot be saved */
  if (fname == NULL) fname = MTFILE;
  if ((fp = fopen(fname, "w")) == NULL) {
    fprintf(stderr, "cannot save to %s.\n", fname);
    return 1;
  }
  fprintf(fp, "MTSEED\n%d\n", mtidx_);
  for (k = 0; k < MT_N; k++) fprintf(fp, "%"PRIu32"\n", mt_[k]);
  fclose(fp);
  return 0;
}

/* load mt state from `fname', or if it fails, use `seed' to initialize mt  */
int mtload(const char *fname, uint32_t seed)
{
  static char s[64];
  int k, z, err = 1;
  FILE *fp;

  if (fname == NULL) fname = MTFILE;
  if ((fp = fopen(fname, "r")) != NULL) { /* try to load from file */
    if (fgets(s, sizeof s, fp) == NULL) {
      fprintf(stderr, "%s is empty\n", fname);
    } else if (strncmp(s, "MTSEED", 6) != 0) { /* to check the first line */
      fprintf(stderr, "mtrand: corrupted file.\n");
    } else if (fscanf(fp, "%d", &mtidx_) != 1) {
      fprintf(stderr, "no index in %s\n", fname);
    } else {
      if (mtidx_ < 0) mtidx_ = MT_N; /* request updating */
      for (z = 1, k = 0; k < MT_N; k++) {
        if (fscanf(fp, "%"PRIu32, &mt_[k]) != 1) break;
        if (mt_[k] != 0) z = 0; /* a non-zero number */
      }
      if (k != MT_N) fprintf(stderr, "%s incomplete %d/%d\n", fname, k, MT_N);
      else err = z; /* clear error, if array is nonzero */
    }
    fclose(fp);
  }

  if (err) { /* initialize from seed */
    if (seed == 0) seed = MTSEED;
    mt_[0] = seed & 0xffffffffUL;
    for (k = 1; k < MT_N; k++) /* the final mask is for 64-bit machines */
      mt_[k] = (1812433253UL * (mt_[k-1] ^ (mt_[k-1]>>30)) + k) & 0xffffffffUL;
    mtidx_ = MT_N; /* request updating */
  }
  return (mtidx_ < 0);
}

/* return an unsigned random number */
uint32_t mtrand(void)
{
  uint32_t x;
  static const uint32_t mag01[2] = {0, 0x9908b0dfUL}; /* MATRIX_A */
  int k;

  if (mtidx_ < 0) mtload(NULL, 0);
  if (mtidx_ >= MT_N) { /* generate MT_N words at one time */
    for (k = 0; k < MT_N - MT_M; k++) {
      x = (mt_[k] & MT_UMASK) | (mt_[k+1] & MT_LMASK);
      mt_[k] = mt_[k+MT_M] ^ (x>>1) ^ mag01[x&1UL];
    }
    for (; k < MT_N-1; k++) {
      x = (mt_[k] & MT_UMASK) | (mt_[k+1] & MT_LMASK);
      mt_[k] = mt_[k+(MT_M-MT_N)] ^ (x>>1) ^ mag01[x&1UL];
    }
    x = (mt_[MT_N-1] & MT_UMASK) | (mt_[0] & MT_LMASK);
    mt_[MT_N-1] = mt_[MT_M-1] ^ (x>>1) ^ mag01[x&1UL];
    mtidx_ = 0;
  }
  x = mt_[ mtidx_++ ];
  /* tempering */
  x ^= (x >> 11);
  x ^= (x <<  7) & 0x9d2c5680UL;
  x ^= (x << 15) & 0xefc60000UL;
  x ^= (x >> 18);
  return x;
}

#undef MT_N
#undef MT_M
#undef MT_UMASK
#undef MT_LMASK

/* Gaussian distribution with zero mean and unit variance */
double grand0(void)
{
  double x, y, u, v, q;
  do {
    u = 1 - rnd0();
    v = 1.7156*(rnd0() - .5);  /* >= 2*sqrt(2/e) */
    x = u - 0.449871;
    y = fabs(v) + 0.386595;
    q = x*x  + y*(0.196*y - 0.25472*x);
    if (q < 0.27597) break;
  } while (q > 0.27846 || v*v > -4*u*u*log(u));
  return v/u;
}

#endif /* ZCOM_RNG__ */
#endif /* ZCOM_RNG */

#ifdef  ZCOM_RV2
#ifndef ZCOM_RV2__
#define ZCOM_RV2__

#ifndef RV2_T
#define RV2_T rv2_t
  typedef real rv2_t[2];
  typedef const real crv2_t[2];
  typedef real rm2_t[2][2];
#endif

#include <stdio.h>
#include <string.h>
#include <math.h>

/* due to that pointer may overlap with each other,
 * be careful when using the const modifier */
INLINE real *rv2_make(real *x, real a, real b) { x[0] = a; x[1] = b; return x; }
INLINE real *rv2_zero(real *x) { return rv2_make(x, 0, 0); }
INLINE real *rv2_copy(real *x, const real *src) { x[0] = src[0]; x[1] = src[1]; return x; }
/* use macro to avoid const qualifier of src */
#define rv2_ncopy(x, src, n) memcpy(x, src, n*sizeof(x[0]))

INLINE real rv2_sqr(const real *x) { return x[0]*x[0]+x[1]*x[1]; }
INLINE real rv2_norm(const real *x) { return (real)sqrt(x[0]*x[0]+x[1]*x[1]); }

INLINE real rv2_dot(const real *x, const real *y) { return x[0]*y[0]+x[1]*y[1]; }

INLINE real rv2_cross(const real *x, const real *y)
{
  return x[0]*y[1]-x[1]*y[0];
}

INLINE real *rv2_neg(real *x)
{
  x[0] = -x[0];
  x[1] = -x[1];
  return x;
}

INLINE real *rv2_neg2(real *nx, const real *x)
{
  nx[0] = -x[0];
  nx[1] = -x[1];
  return nx;
}

INLINE real *rv2_inc(real *x, const real *dx)
{
  x[0] += dx[0];
  x[1] += dx[1];
  return x;
}

INLINE real *rv2_dec(real *x, const real *dx)
{
  x[0] -= dx[0];
  x[1] -= dx[1];
  return x;
}

INLINE real *rv2_sinc(real *x, const real *dx, real s)
{
  x[0] += s*dx[0];
  x[1] += s*dx[1];
  return x;
}

INLINE real *rv2_smul(real *x, real s)
{
  x[0] *= s;
  x[1] *= s;
  return x;
}

INLINE real *rv2_smul2(real *y, const real *x, real s)
{
  y[0] = x[0]*s;
  y[1] = x[1]*s;
  return y;
}

INLINE real *rv2_normalize(real *x)
{
  real r = rv2_norm(x);
  if (r > 0.f) rv2_smul(x, 1.f/r);
  return x;
}

/* for in-place difference use rv2_dec */
INLINE real *rv2_diff(real *diff, const real *a, const real *b)
{
  diff[0] = a[0]-b[0];
  diff[1] = a[1]-b[1];
  return diff;
}

/* distance^2 between a and b */
INLINE real rv2_dist2(const real *a, const real *b)
{
  real d[2];
  return rv2_sqr(rv2_diff(d, a, b));
}

/* distance between a and b */
INLINE real rv2_dist(const real *a, const real *b)
{
  return (real) sqrt(rv2_dist2(a, b));
}

/* sum = a+b, for in-place addition use rv2_inc */
INLINE real *rv2_add(real *sum, const real *a, const real *b)
{
  sum[0] = a[0]+b[0];
  sum[1] = a[1]+b[1];
  return sum;
}

/* sum = -a-b */
INLINE real *rv2_nadd(real *sum, const real *a, const real *b)
{
  sum[0] = -a[0]-b[0];
  sum[1] = -a[1]-b[1];
  return sum;
}

INLINE real *rv2_lincomb2(real *sum, const real *a, const real *b, real s1, real s2)
{
  sum[0] = a[0]*s1+b[0]*s2;
  sum[1] = a[1]*s1+b[1]*s2;
  return sum;
}

/* consine of the angle of x1-x2-x3 */
INLINE real rv2_cosang(const real *x1, const real *x2, const real *x3,
    real *g1, real *g2, real *g3)
{
  real a[2], b[2], ra, rb, dot;

  ra = rv2_norm(rv2_diff(a, x1, x2));
  rv2_smul(a, 1.f/ra);
  rb = rv2_norm(rv2_diff(b, x3, x2));
  rv2_smul(b, 1.f/rb);
  dot = rv2_dot(a, b);
  if (dot > 1) dot = 1; else if (dot < -1) dot = -1;
  if (g1) {
    rv2_lincomb2(g1, b, a, 1.f/ra, -dot/ra);
    rv2_lincomb2(g3, a, b, 1.f/rb, -dot/rb);
    rv2_nadd(g2, g1, g3);
  }
  return dot;
}

/* angle and gradients of x1-x2-x3 */
INLINE real rv2_ang(const real *x1, const real *x2, const real *x3,
    real *g1, real *g2, real *g3)
{
  real dot, sn;
  dot = rv2_cosang(x1, x2, x3, g1, g2, g3);
  sn = (real) sqrt(1 - dot*dot);
  if (sn > 1e-7) sn = -1/sn; else sn = 0.;
  if (g1) {
    rv2_smul(g1, sn);
    rv2_smul(g2, sn);
    rv2_smul(g3, sn);
  }
  return (real) acos(dot);
}

/* vertical distance from x to line a-b */
INLINE real rv2_vdist(const real *x, const real *a, const real *b)
{
  real nm[2], d[2], dot;

  rv2_diff(d, x, a);
  rv2_normalize(rv2_diff(nm, a, b));
  dot = rv2_dot(d, nm);
  return rv2_norm(rv2_sinc(d, nm, -dot));
}

/* determinant of a 2x2 matrix */
INLINE real rm2_det(real a[2][2])
{
  return a[0][0]*a[1][1] - a[0][1]*a[1][0];
}

/* inverse matrix b = a^(-1) */
INLINE void rm2_inv(real b[2][2], real a[2][2])
{
  real det = rm2_det(a);
  if (fabs(det) < 1e-30) det = (det < 0) ? -1e-30f: 1e-30f;
  b[0][0] =  a[1][1]/det;
  b[0][1] = -a[0][1]/det;
  b[1][0] = -a[1][0]/det;
  b[1][1] =  a[0][0]/det;
}

#define rv2_print(r, nm, fmt, nl) rv2_fprint(stdout, r, nm, fmt, nl)
INLINE void rv2_fprint(FILE *fp, const real *r, const char *nm,
    const char *fmt, int nl)
{
  int i;
  if (nm) fprintf(fp, "%s: ", nm);
  for (i = 0; i < 2; i++)
    fprintf(fp, fmt, r[i], nl);
  fprintf(fp, "%c", (nl ? '\n' : ';'));
}

#define rm2_print(r, nm, fmt, nl) rm2_fprint(stdout, r, nm, fmt, nl)
INLINE void rm2_fprint(FILE *fp, real r[2][2], const char *nm,
    const char *fmt, int nl)
{
  int i, j;
  if (nm) fprintf(fp, "%s:%c", nm, (nl ? '\n' : ' '));
  for (i = 0; i < 2; i++) {
    for (j = 0; j < 2; j++) {
      fprintf(fp, fmt, r[i][j], nl);
    }
    fprintf(fp, "%s", (nl ? "\n" : "; "));
  }
}

#endif /* ZCOM_RV2__ */
#endif /* ZCOM_RV2 */

#ifdef  ZCOM_RV3
#ifndef ZCOM_RV3__
#define ZCOM_RV3__

#ifndef RV3_T
#define RV3_T rv3_t
  typedef real rv3_t[3];
  typedef const real crv3_t[3];
  typedef real rm3_t[3][3];
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

/* due to that pointer may overlap with each other,
 * be careful when using the const modifier */

INLINE real *rv3_make(real *x, real a, real b, real c)
  { x[0] = a; x[1] = b; x[2] = c; return x; }
INLINE real *rv3_zero(real *x) { return rv3_make(x, 0, 0, 0); }
INLINE real *rv3_copy(real *x, const real *src)
  { x[0] = src[0]; x[1] = src[1]; x[2] = src[2]; return x; }
/* use macro to avoid const qualifier of src */
#define rv3_ncopy(x, src, n) memcpy(x, src, n*sizeof(x[0]))

INLINE real rv3_sqr (const real *x) { return x[0]*x[0]+x[1]*x[1]+x[2]*x[2]; }
INLINE real rv3_norm(const real *x) { return (real)sqrt(x[0]*x[0]+x[1]*x[1]+x[2]*x[2]); }

/* if x == y, try to use sqr */
INLINE real rv3_dot(const real *x, const real *y)
{
  return x[0]*y[0]+x[1]*y[1]+x[2]*y[2];
}

INLINE real *rv3_cross(real *RESTRICT z, const real *x, const real *y)
{
  z[0] = x[1]*y[2]-x[2]*y[1];
  z[1] = x[2]*y[0]-x[0]*y[2];
  z[2] = x[0]*y[1]-x[1]*y[0];
  return z;
}

INLINE real *rv3_neg(real *x)
{
  x[0] = -x[0];
  x[1] = -x[1];
  x[2] = -x[2];
  return x;
}

INLINE real *rv3_neg2(real *nx, const real *x)
{
  nx[0] = -x[0];
  nx[1] = -x[1];
  nx[2] = -x[2];
  return nx;
}

INLINE real *rv3_inc(real * RESTRICT x, const real *dx)
{
  x[0] += dx[0];
  x[1] += dx[1];
  x[2] += dx[2];
  return x;
}

INLINE real *rv3_dec(real *x, const real *dx)
{
  x[0] -= dx[0];
  x[1] -= dx[1];
  x[2] -= dx[2];
  return x;
}

INLINE real *rv3_sinc(real * RESTRICT x, const real *dx, real s)
{
  x[0] += s*dx[0];
  x[1] += s*dx[1];
  x[2] += s*dx[2];
  return x;
}

INLINE real *rv3_smul(real *x, real s)
{
  x[0] *= s;
  x[1] *= s;
  x[2] *= s;
  return x;
}

/* if y == x, just use smul */
INLINE real *rv3_smul2(real * RESTRICT y, const real *x, real s)
{
  y[0] = x[0]*s;
  y[1] = x[1]*s;
  y[2] = x[2]*s;
  return y;
}

INLINE real *rv3_normalize(real *x)
{
  real r = rv3_norm(x);
  if (r > 0.0) rv3_smul(x, 1.f/r);
  return x;
}

/* for in-place difference use rv3_dec */
INLINE real *rv3_diff(real * RESTRICT diff, const real *a, const real *b)
{
  diff[0] = a[0]-b[0];
  diff[1] = a[1]-b[1];
  diff[2] = a[2]-b[2];
  return diff;
}

/* distance^2 between a and b */
INLINE real rv3_dist2(const real *a, const real *b)
{
  real d[3];
  return rv3_sqr(rv3_diff(d, a, b));
}

/* distance between a and b */
INLINE real rv3_dist(const real *a, const real *b)
{
  return (real) sqrt(rv3_dist2(a, b));
}

/* sum = a+b, for in-place addition use rv3_inc */
INLINE real *rv3_add(real * RESTRICT sum, const real *a, const real *b)
{
  sum[0] = a[0]+b[0];
  sum[1] = a[1]+b[1];
  sum[2] = a[2]+b[2];
  return sum;
}

/* sum = -a-b */
INLINE real *rv3_nadd(real *sum, const real *a, const real *b)
{
  sum[0] = -a[0]-b[0];
  sum[1] = -a[1]-b[1];
  sum[2] = -a[2]-b[2];
  return sum;
}

INLINE real *rv3_lincomb2(real * RESTRICT sum, const real *a, const real *b, real s1, real s2)
{
  sum[0] = a[0]*s1+b[0]*s2;
  sum[1] = a[1]*s1+b[1]*s2;
  sum[2] = a[2]*s1+b[2]*s2;
  return sum;
}

/* angle and gradients of cos(x1-x2-x3) */
INLINE real rv3_cosang(const real *x1, const real *x2, const real *x3,
    real *g1, real *g2, real *g3)
{
  real a[3], b[3], ra, rb, dot;

  ra = rv3_norm(rv3_diff(a, x1, x2));
  rv3_smul(a, 1.f/ra);
  rb = rv3_norm(rv3_diff(b, x3, x2));
  rv3_smul(b, 1.f/rb);
  dot = rv3_dot(a, b);
  if (dot > 1) dot = 1; else if (dot < -1) dot = -1;
  if (g1) {
    rv3_lincomb2(g1, b, a, 1.f/ra, -dot/ra);
    rv3_lincomb2(g3, a, b, 1.f/rb, -dot/rb);
    rv3_nadd(g2, g1, g3);
  }
  return dot;
}

/* angle and gradients of x1-x2-x3 */
INLINE real rv3_ang(const real *x1, const real *x2, const real *x3,
    real *g1, real *g2, real *g3)
{
  real dot, sn;

  dot = rv3_cosang(x1, x2, x3, g1, g2, g3);
  sn = (real) sqrt(1 - dot*dot);
  if (sn < 1e-7) sn = 1; else sn = -1.f/sn;
  if (g1) {
    rv3_smul(g1, sn);
    rv3_smul(g2, sn);
    rv3_smul(g3, sn);
  }
  return (real) acos(dot);
}

/* vertical distance from x to line a-b */
INLINE real rv3_vdist(const real *x, const real *a, const real *b)
{
  real nm[3], d[3], dot;

  rv3_diff(d, x, a);
  rv3_normalize(rv3_diff(nm, a, b));
  dot = rv3_dot(d, nm);
  return rv3_norm(rv3_sinc(d, nm, -dot));
}

/* signed distance from x to the plane extended by a, b, c */
INLINE real rv3_vpdist(const real *x, const real *a, const real *b, const real *c)
{
  real u[3], v[3], m[3];

  rv3_diff(u, b, a);
  rv3_diff(v, c, b);
  rv3_normalize(rv3_cross(m, u, v));
  rv3_diff(u, x, a);
  return rv3_dot(u, m);
}

/* structure for dihedral calculation */
typedef struct {
  int  szreal; /* sizeof real */
  int  pad0;   /* padding */
  real phi; /* cis is zero, clockwise positive */
  real cos; /* cos(m, n) */
  real sgn; /* (0, pi) is 1.0, otherwise -1.0 */

  real g2;
  real g[4][3]; /* gradient for each particle */

  real div; /* the divengence */
  real d4ij, d4ik, d4jj, d4jk, d4jl, d4kk, d4kl;

  unsigned int flags; /* a copy of flags used */
  int t1, t2, t3; /* gromacs shift indices */
  const void *pbcdata; /* periodic boundary condition descriptor */
  int (*pbcdiff)(real *xij, const real *xi, const real *xj, const void *);
    /* function to handle pbc, use GROMACS convention: the last is the difference */
} dihcalc_t;

#define DIH_GRAD  0x0001
#define DIH_DIV   0x0002
/*#define DIH_CONJ  0x0004 */
/*#define DIH_PROJ  0x0008 */
#define DIH_I     0x0010
#define DIH_J     0x0020
#define DIH_K     0x0040
#define DIH_L     0x0080
#define DIH_FOUR  (DIH_I|DIH_J|DIH_K|DIH_L)
/* the four atoms involved */
#define DIH_ALL   (DIH_FOUR|DIH_GRAD|DIH_DIV)
/* only I and L, so no divergence */
#define DIH_ENDS  (DIH_GRAD|DIH_I|DIH_L)

/* compute the dihedral angle, gradient g and divegence
 * of the field v conjugate to gradient (v.g = 1)
 *
 * if dih is NULL and flags is 0, only the dihedral angle is computed
 * optionally, the gradient and divergent are computed with flags
 * DIH_GRAD and DIH_DIV respectively (the latter requires the former)
 * routine for treating periodic boundary condition can specified by
 * assigning a function pointer to dih->pbcdiff with additional information
 * to dih->pbcdata entry, otherwise, dih->pbcdiff *must* be set to NULL
 * the procedure of computing similar to that in GROMACS
 *
 * the conjugate field v = g / [g.g], such that v.g = 1, where g = grad(phi)
 * it is not explicitly computed (since it lies along the gradient)
 * however, the denominator is saved to dih->g2
 * the calculation of the divergence of v is simplified by the fact that
 * the divergence of the gradient is always zero, i.e., div.g = 0, thus
 *     div.v = -2 [ g.(gg).g ] /[g.g]^2,
 * where gg = grad grad(phi) involves: d4ij, d4ik, ..., d4kl.
 *
 * both g and div can be computed for a subset of the four involving atoms
 * by passing `flags' a combination of DIH_I, DIH_J, DIH_K and DIH_L
 * however, *all* moments d4ij, d4ik, ... , d4kl are always calculated
 * though only the involved ones are used to produce the divergence. */
#define rv3_calcdihv(dih, x, idx, flags) \
  rv3_calcdih(dih, x[*(idx)], x[*(idx+1)], x[*(idx+2)], x[*(idx+3)], flags)

INLINE real rv3_calcdih(dihcalc_t *dih,
    const real *xi, const real *xj, const real *xk, const real *xl,
    unsigned int flags)
{
  real dot, scl, tol, vol, phi, sgn, cosphi;
  real nxkj, nxkj2, m2, n2;
  real xij[3], xkj[3], xkl[3];
  real m[3], n[3]; /* the planar vector of xij x xkj,  and xkj x xkj */

  if (dih != NULL && sizeof(real) != dih->szreal) {
    fprintf(stderr, "size real should be %d instead of %d\n",
        (int) sizeof(real), (int) dih->szreal);
    exit(1);
  }
  if (dih != NULL && dih->pbcdiff != NULL) { /* handle pbc */
    dih->t1 = (*dih->pbcdiff)(xij, xi, xj, dih->pbcdata);
    dih->t2 = (*dih->pbcdiff)(xkj, xk, xj, dih->pbcdata);
    dih->t3 = (*dih->pbcdiff)(xkl, xk, xl, dih->pbcdata);
  } else {
    rv3_diff(xij, xi, xj);
    rv3_diff(xkj, xk, xj);
    rv3_diff(xkl, xk, xl);
  }
  nxkj2 = rv3_sqr(xkj);
  nxkj = (real) sqrt(nxkj2);
  if (sizeof(real) <= 4)
    tol = nxkj2 * 6e-8f;
  else
    tol = nxkj2 * 1e-16f;

  rv3_cross(m, xij, xkj);
  m2 = rv3_sqr(m);
  rv3_cross(n, xkj, xkl);
  n2 = rv3_sqr(n);
  if (m2 > tol && n2 > tol) {
    scl = (real) sqrt(m2*n2);
    dot = rv3_dot(m, n);
    cosphi = dot/scl;
    if (cosphi >= (real) 1.) cosphi = 1.f;
    else if (cosphi < (real)(-1.)) cosphi = -1.f;
  } else {
    cosphi = 1.f;
  }
  phi = (real) acos(cosphi);
  vol = rv3_dot(n, xij);
  sgn = ((vol > 0.0f) ? 1.0f : (-1.0f));
  phi *= sgn;
  if (dih != NULL) {
    dih->phi = phi;
    dih->sgn = sgn;
    dih->cos = cosphi;
    dih->flags = flags;
  }

  /* optionally calculate the gradient */
  if (dih != NULL && (flags & (DIH_GRAD|DIH_DIV))) { /* divergence implies gradient */
    /* clear divergence */
    dih->div = dih->d4ij = dih->d4ik = dih->d4jj = dih->d4jk = dih->d4jl = dih->d4kk = dih->d4kl = 0.0f;

    /* calculate the gradient of the dihedral */
    if (m2 > tol && n2 > tol) {
      real vgi[3], vgj[3], vgk[3], vgl[3];
      real uvec[3], vvec[3], svec[3], p, q;
      real gi2, gj2, gk2, gl2, g2all, invg2;
      unsigned doi, doj, dok, dol;

      doi = (flags & DIH_I);
      doj = (flags & DIH_J);
      dok = (flags & DIH_K);
      dol = (flags & DIH_L);

      scl = nxkj/m2;
      rv3_smul2(vgi, m, scl);
      scl = -nxkj/n2;
      rv3_smul2(vgl, n, scl);

      p = rv3_dot(xij, xkj);
      p /= nxkj2;
      rv3_smul2(uvec, vgi, p);
      q = rv3_dot(xkl, xkj);
      q /= nxkj2;
      rv3_smul2(vvec, vgl, q);
      rv3_diff(svec, uvec, vvec);

      rv3_diff(vgj, svec, vgi);
      rv3_nadd(vgk, vgl, svec);

      rv3_copy(dih->g[0], vgi);
      rv3_copy(dih->g[1], vgj);
      rv3_copy(dih->g[2], vgk);
      rv3_copy(dih->g[3], vgl);

      gi2 = rv3_sqr(vgi);
      gj2 = rv3_sqr(vgj);
      gk2 = rv3_sqr(vgk);
      gl2 = rv3_sqr(vgl);
      g2all = 0.0f;
      if (doi) g2all += gi2;
      if (doj) g2all += gj2;
      if (dok) g2all += gk2;
      if (dol) g2all += gl2;
      dih->g2 = g2all;
      invg2 = 1.0f/g2all;

      if (flags & DIH_DIV) {
        real xkjv[3], nvv[3], mvv[3];
        real gjxij, gjmvv, gjxkl, gjnvv;
        real gkmvv, gknvv, gkxkl, gkxij;
        real kivkj, klvkj, ljvkj, ijvkj;
        real kikl, ijlj;
        real tmp1, tmp2;
        real sinmn;

        rv3_smul2(mvv, m, 1.0f/m2);
        rv3_smul2(nvv, n, 1.0f/n2);
        rv3_smul2(xkjv, xkj, 1.0f/nxkj);

        sinmn = vol*nxkj/(m2*n2);

        ijvkj = rv3_dot(xij, xkjv);
        kivkj = nxkj-ijvkj;
        klvkj = rv3_dot(xkl, xkjv);
        ljvkj = nxkj-klvkj;

        ijlj = ijvkj*ljvkj;
        kikl = kivkj*klvkj;

        gjxij = rv3_dot(vgj, xij);
        gjxkl = rv3_dot(vgj, xkl);
        gjmvv = rv3_dot(vgj, mvv);
        gjnvv = rv3_dot(vgj, nvv);
        gkxij = rv3_dot(vgk, xij);
        gkxkl = rv3_dot(vgk, xkl);
        gkmvv = rv3_dot(vgk, mvv);
        gknvv = rv3_dot(vgk, nvv);

        tmp1 = nxkj2*sinmn;
        tmp2 = tmp1/m2;
        dih->d4ij = kikl*tmp2;
        dih->d4ik = ijlj*tmp2;
        tmp2 = tmp1/n2;
        dih->d4jl = kikl*tmp2;
        dih->d4kl = ijlj*tmp2;

        dih->d4jj = -(gjxij*gjmvv+gjxkl*gjnvv)/nxkj
                +2.0f*(kivkj*gjmvv-klvkj*gjnvv)*(-kikl*sinmn);

        dih->d4jk = (gjxij*gkmvv+gjxkl*gknvv)/nxkj
              +(-(gjmvv*ljvkj+gkmvv*klvkj)*(ijvkj*kivkj)
                +(gjnvv*ijvkj+gknvv*kivkj)*(ljvkj*klvkj) )*sinmn;

        dih->d4kk = -(gkxkl*gknvv+gkxij*gkmvv)/nxkj
                +2.0f*(ljvkj*gknvv-ijvkj*gkmvv)*(ijlj*sinmn);

        /* summarize */
        if ((flags & DIH_FOUR) == DIH_FOUR) {
          tmp1 = dih->d4jj + dih->d4kk;
          tmp2 = dih->d4ij + dih->d4ik+dih->d4jk+dih->d4jl+dih->d4kl;
        } else {
          tmp1 = tmp2 = 0.0f;
          if (doj) { tmp1 += dih->d4jj; }
          if (dok) { tmp1 += dih->d4kk; }
          if (doi && doj) tmp2 += dih->d4ij;
          if (doi && dok) tmp2 += dih->d4ik;
          if (doj && dok) tmp2 += dih->d4jk;
          if (doj && dol) tmp2 += dih->d4jl;
          if (dok && dol) tmp2 += dih->d4kl;
        }
        dih->div = -2.0f*(tmp1+2.0f*tmp2)*(invg2*invg2);
      } /* do divengence */

    } else { /* clear the gradients */
      int j;
      for (j = 0; j < 4; j++)
        rv3_zero(dih->g[j]);
    }
  }
  return phi;
}


/* transpose */
INLINE rv3_t *rm3_trans(real a[3][3])
{
  real x;
  x = a[0][1], a[0][1] = a[1][0], a[1][0] = x;
  x = a[0][2], a[0][2] = a[2][0], a[2][0] = x;
  x = a[2][1], a[2][1] = a[1][2], a[1][2] = x;
  return a;
}

/* a = u^T v */
INLINE rv3_t *rm3_vtv(real a[3][3], const real *u, const real *v)
{
  a[0][0] = u[0]*v[0];
  a[0][1] = u[0]*v[1];
  a[0][2] = u[0]*v[2];
  a[1][0] = u[1]*v[0];
  a[1][1] = u[1]*v[1];
  a[1][2] = u[1]*v[2];
  a[2][0] = u[2]*v[0];
  a[2][1] = u[2]*v[1];
  a[2][2] = u[2]*v[2];
  return a;
}

/* a += b */
INLINE rv3_t *rm3_inc(real a[3][3], real b[3][3])
{
  a[0][0] += b[0][0];
  a[0][1] += b[0][1];
  a[0][2] += b[0][2];
  a[1][0] += b[1][0];
  a[1][1] += b[1][1];
  a[1][2] += b[1][2];
  a[2][0] += b[2][0];
  a[2][1] += b[2][1];
  a[2][2] += b[2][2];
  return a;
}

/* a += b*s */
INLINE rv3_t *rm3_sinc(real a[3][3], real b[3][3], real s)
{
  a[0][0] += b[0][0]*s;
  a[0][1] += b[0][1]*s;
  a[0][2] += b[0][2]*s;
  a[1][0] += b[1][0]*s;
  a[1][1] += b[1][1]*s;
  a[1][2] += b[1][2]*s;
  a[2][0] += b[2][0]*s;
  a[2][1] += b[2][1]*s;
  a[2][2] += b[2][2]*s;
  return a;
}

/* c = a b */
INLINE rv3_t *rm3_mul(real c[3][3], real a[3][3], real b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = a[i][0]*b[0][j]+a[i][1]*b[1][j]+a[i][2]*b[2][j];
  return c;
}

/* c = a b^T */
INLINE rv3_t *rm3_mult(real c[3][3], real a[3][3], real b[3][3])
{
  int i, j;
  for (i = 0; i < 3; i++)
    for (j = 0; j < 3; j++)
      c[i][j] = rv3_dot(a[i], b[j]);
  return c;
}

/* c = a v */
INLINE real *rm3_mulvec(real *c, real a[3][3], const real *v)
{
  c[0] = a[0][0]*v[0] + a[0][1]*v[1] + a[0][2]*v[2];
  c[1] = a[1][0]*v[0] + a[1][1]*v[1] + a[1][2]*v[2];
  c[2] = a[2][0]*v[0] + a[2][1]*v[1] + a[2][2]*v[2];
  return c;
}

/* c = a^T v */
INLINE real *rm3_multvec(real *c, real a[3][3], const real *v)
{
  c[0] = a[0][0]*v[0] + a[1][0]*v[1] + a[2][0]*v[2];
  c[1] = a[0][1]*v[0] + a[1][1]*v[1] + a[2][1]*v[2];
  c[2] = a[0][2]*v[0] + a[1][2]*v[1] + a[2][2]*v[2];
  return c;
}

/* determinant of a 3x3 matrix */
INLINE real rm3_det(real a[3][3])
{
  return a[0][0] * (a[1][1]*a[2][2] - a[1][2]*a[2][1])
      +  a[0][1] * (a[1][2]*a[2][0] - a[1][0]*a[2][2])
      +  a[0][2] * (a[1][0]*a[2][1] - a[1][1]*a[2][0]);
}

/* inverse matrix b = a^(-1) */
INLINE rv3_t *rm3_inv(real b[3][3], real a[3][3])
{
  real d00, d01, d02, detm;
  d00 = a[1][1]*a[2][2] - a[1][2]*a[2][1];
  d01 = a[1][2]*a[2][0] - a[1][0]*a[2][2];
  d02 = a[1][0]*a[2][1] - a[1][1]*a[2][0];
  detm = a[0][0]*d00 + a[0][1]*d01 + a[0][2]*d02;
  if (fabs(detm) < 1e-30) detm = (detm < 0) ? -1e-30f: 1e-30f;
  b[0][0] = d00/detm;
  b[0][1] = (a[2][1]*a[0][2] - a[0][1]*a[2][2])/detm;
  b[0][2] = (a[0][1]*a[1][2] - a[0][2]*a[1][1])/detm;
  b[1][0] = d01/detm;
  b[1][1] = (a[2][2]*a[0][0] - a[2][0]*a[0][2])/detm;
  b[1][2] = (a[0][2]*a[1][0] - a[1][2]*a[0][0])/detm;
  b[2][0] = d02/detm;
  b[2][1] = (a[2][0]*a[0][1] - a[2][1]*a[0][0])/detm;
  b[2][2] = (a[0][0]*a[1][1] - a[0][1]*a[1][0])/detm;
  return b;
}

/* eigenvalues of a 3x3 matrix */
INLINE real *rm3_eigval(real v[3], real a[3][3])
{
  real m, p, q, cphi, sphi, pr, pr3;

  m = (a[0][0]+a[1][1]+a[2][2])/3.f;
  a[0][0] -= m;
  a[1][1] -= m;
  a[2][2] -= m;
  q = .5f * rm3_det(a);
  p = ((a[0][0]*a[0][0] + a[1][1]*a[1][1] + a[2][2]*a[2][2]) +
   2.f*(a[0][1]*a[1][0] + a[1][2]*a[2][1] + a[2][0]*a[0][2]))/6.f;
  pr = (real) sqrt(p);
  pr3 = p*pr;
  if (pr3 <= fabs(q)) {
    if (q < 0.) { /* choose phi = pi/3 */
      v[1] = v[0] = m + pr;
      v[2] = m - 2.f*pr;
    } else { /* phi = 0 */
      v[0] = m + 2.f*pr;
      v[2] = v[1] = m - pr;
    }
  } else {
    double phi = acos(q/pr3)/3.f; /* 0 < phi < pi/3 */
    cphi = (real)cos(phi);
    sphi = (real)(sin(phi)*1.7320508075688772);
    v[0] = m + 2.f*pr*cphi;  /* cos(phi), largest */
    v[1] = m - pr*(cphi-sphi); /* cos(phi-2*pi/3), second largest */
    v[2] = m - pr*(cphi+sphi); /* cos(phi+2*pi/3), smallest */
  }
  a[0][0] += m;
  a[1][1] += m;
  a[2][2] += m;
  return v;
}

/* given matrix a and eigenvalue lm, return eigenvector */
INLINE real *rm3_eigvec(real vec[3], real m[3][3], real val)
{
  double a = m[0][0]-val, b = m[1][1]-val, c, d = m[0][1], e = m[0][2], f = m[1][2];
  double detm, tol = 1e-12;

  vec[2] = 1.f;
  if (fabs(detm = a*b - d*d) > tol) { /* use row 0 and 1 */
    rv3_make(vec, (real)((d*f-b*e)/detm), (real)((e*d-a*f)/detm), 1);
    return rv3_normalize(vec);
  }
  c = m[2][2] - val;
  if (fabs(detm = a*f - e*d) > tol) { /* row 1 and 2 */
    rv3_make(vec, (real)((d*c-e*f)/detm), (real)((e*e-a*c)/detm), 1);
    return rv3_normalize(vec);
  }
  if ((detm = sqrt(a*a+d*d)) > tol) { /* three-row-degenerate */
    rv3_make(vec, (real)(d/detm), (real)(-a/detm), 0);
  } else {
    rv3_make(vec, 1, 0, 0);
  }
  return vec;
}

/* compute eigenvectors for the eigenvalues */
INLINE rv3_t *rm3_eigvecs(real vecs[3][3], real mat[3][3], real v[3], int t)
{
  const double tol = 1e-12;
  double v0 = fabs(v[0]), v1 = fabs(v[1]), v2 = fabs(v[2]);

  rm3_eigvec(vecs[0], mat, v[0]);
  if ( fabs(v[0] - v[1]) > tol*(v0 + v1) ) {
    rm3_eigvec(vecs[1], mat, v[1]);
    rv3_cross(vecs[2], vecs[0], vecs[1]);
  } else if ( fabs(v[2] - v[1]) > tol*(v1 + v2) ) {
    rm3_eigvec(vecs[2], mat, v[2]);
    rv3_cross(vecs[1], vecs[2], vecs[0]);
  } else {
    rv3_make(vecs[1], 0, 1, 0);
    rv3_make(vecs[2], 0, 0, 1);
  }
  /* transpose the matrix */
  if (t) return vecs;
  else return rm3_trans(vecs);
}

/* SVD decomposition of a 3x3 matrix a = u s v */
INLINE int rm3_svd(real a[3][3], real u[3][3], real s[3], real v[3][3])
{
  int i, j;
  real ata[3][3], z[3];
  const double tol = 1e-12;

  /* 1. compute A^T A and its eigenvectors */
  for (i = 0; i < 3; i++)
    for (j = i; j < 3; j++) {
      ata[i][j] = a[0][i]*a[0][j] + a[1][i]*a[1][j] + a[2][i]*a[2][j];
      if (i != j) ata[j][i] = ata[i][j];
    }
  rm3_eigval(s, ata);
  for (i = 0; i < 3; i++) if (s[i] < 0) s[i] = 0;
  rm3_eigvecs(v, ata, s, 1); /* get V^T */

  /* 2. U = A V S^-1, or U^T = S^{-1}T V^T A^T */
  j = (s[0] > tol) + (s[1] > tol) + (s[2] > tol);
  if (j >= 2) {
    rm3_mult(u, v, a);
    if (j == 2) rv3_cross(u[2], u[0], u[1]); /* fix the last */
  } else if (j == 1) {
    rm3_multvec(u[0], a, v[0]);
    rv3_zero(z);
    /* choose z[i] such that z X u[0] != 0 */
    i = (u[0][0]*u[0][0] < u[0][1]*u[0][1]) ? 0 : 1;
    rv3_cross(u[1], z, u[0]);
    rv3_cross(u[2], u[0], u[1]);
  } else { /* use u */
    for (i = 0; i < 3; i++) rv3_copy(u[i], v[i]);
  }
  for (i = 0; i < 3; i++) rv3_normalize(u[i]);
  for (i = 0; i < 3; i++) s[i] = (real) sqrt(s[i]);
  rm3_trans(u);
  rm3_trans(v);
  return 0;
}

/* return 0 rotation matrix around v for ang */
INLINE rv3_t *rm3_mkrot(real m[3][3], const real *v, real ang)
{
  real c = (real) cos(ang), s = (real) sin(ang), nc, n[3];

  rv3_copy(n, v);
  rv3_normalize(n);
  nc = 1 - c;
  m[0][0] = n[0]*n[0]*nc + c;
  m[0][1] = n[0]*n[1]*nc - n[2]*s;
  m[0][2] = n[0]*n[2]*nc + n[1]*s;
  m[1][0] = n[1]*n[0]*nc + n[2]*s;
  m[1][1] = n[1]*n[1]*nc + c;
  m[1][2] = n[1]*n[2]*nc - n[0]*s;
  m[2][0] = n[2]*n[0]*nc - n[1]*s;
  m[2][1] = n[2]*n[1]*nc + n[0]*s;
  m[2][2] = n[2]*n[2]*nc + c;
  return m;
}

/* rotate v0 around u by ang, save result to v1 */
INLINE real *rv3_rot(real *v1, const real *v0, const real *u, real ang)
{
  real m[3][3];

  rm3_mkrot(m, u, ang);
  rm3_mulvec(v1, m, v0);
  return v1;
}

#define rv3_print(r, nm, fmt, nl) rv3_fprint(stdout, r, nm, fmt, nl)
INLINE void rv3_fprint(FILE *fp, const real *r, const char *nm,
    const char *fmt, int nl)
{
  int i;
  if (nm) fprintf(fp, "%s: ", nm);
  for (i = 0; i < 3; i++)
    fprintf(fp, fmt, r[i], nl);
  fprintf(fp, "%c", (nl ? '\n' : ';'));
}

#define rm3_print(r, nm, fmt, nl) rm3_fprint(stdout, r, nm, fmt, nl)
INLINE void rm3_fprint(FILE *fp, real r[3][3], const char *nm,
    const char *fmt, int nl)
{
  int i, j;
  if (nm) fprintf(fp, "%s:%c", nm, (nl ? '\n' : ' '));
  for (i = 0; i < 3; i++) {
    for (j = 0; j < 3; j++) {
      fprintf(fp, fmt, r[i][j], nl);
    }
    fprintf(fp, "%s", (nl ? "\n" : "; "));
  }
}

#endif /* ZCOM_RV3__ */
#endif /* ZCOM_RV3 */

#ifdef  ZCOM_EIG
#ifndef ZCOM_EIG__
#define ZCOM_EIG__
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

STRCLS int eigsym(real *mat, real *eigval, real *eigvec, int n);

/* To reduce a real symmetric matrix 'm' to tridiagonal by Householder transformations.
 * The diagonal elements are saved in vector 'd' and off-diagonal elements 'e'.  */
static void tridiag(real *m, real d[], real e[], int n)
{
  int i, j, k;
  real H, sigma, p, K, *x;

  /* use d[i] to indicate if the i'th Householder transformation is performed */
  for (i = 0; i < n; i++) d[i] = 0;

  /* n-2 Householder transformations */
  for (i = 0; i < n-2; i++) {
    x = m+i*n; /* alias x[k] == m[i*n+k] */

    for (H = 0, k = i+1; k < n; k++) H += x[k]*x[k];
    sigma = (real)(x[i+1] > 0 ? sqrt(H) : -sqrt(H)); /* sigma = sgn(x1) |x| */
    e[i] = -sigma; /* P x = - sigma e1 */
    H += sigma*x[i+1]; /* H= (1/2) |u|^2 = |x|^2 + sigma x1 */

    /* To avoid singularity due to (partially) diagonal matrix as input */
    if (sigma + m[i*n+i] == m[i*n+i]) {
      e[i] = m[i*n+i+1];
      continue;
    }

    x[i+1] += sigma;  /* u = x + sigma e1, we now switch to 'u' */
    for (j = i+1; j < n; j++) m[j*n+i] = x[j]/H; /* save u/H in column i */

    /*  CALCULATE P A P */
    K = 0;
    for (j = i+1; j < n; j++) {
      /* calculate p=A u /H, we only use the up triangle */
      for (p = 0, k = i+1; k <= j; k++) p += m[k*n+j]*x[k];
      for (k = j+1; k < n; k++) p += m[j*n+k]*x[k];
      e[j] = (p /= H); /* save p temporarily to e[j], notice e[i+1..n-1] are not used yet.*/
      K += x[j]*p; /* K = u' p / (2H) */
    }
    K /= (2*H); /* K = u' p / (2H) */
    for (j = i+1; j < n; j++) e[j] -= K*x[j];  /* form  q = p - K u */
    for (j = i+1; j < n; j++) /* calculate A' = A - q u' - u q' (only right-top triangle) */
      for (k = j; k < n; k++)
        m[j*n+k] -= e[j]*x[k]+x[j]*e[k];

    d[i] = 1; /* indicate that the transformation is performed */
  }
  e[n-2] = m[(n-2)*n + n-1]; /* for i == n-2 */
  e[n-1] = 0;

  /* if only eigenvalues are required, enable the above line and ignore the rest */

  /* To form Q = P1 ... Pn-2 */
  d[n-2] = m[(n-2)*n + n-2]; d[n-1] = m[(n-1)*n + n-1]; /* copy last two eigenvalues */
  m[(n-2)*n + n-2] = 1; m[(n-2)*n + n-1] = 0; /* initialize the right-bottom corner */
  m[(n-1)*n + n-2] = 0; m[(n-1)*n + n-1] = 1;

  /* P Q = (1 - u u'/H) Q = Q - (u/H) (u' Q) */
  for (i = n-3; i >= 0; i--) { /* for each P */
    x = m + i*n; /* alias x[k] == m[i*n+k] */

    /* Form eigenvector, ONLY if i'th transformation is performed */
    if (d[i] != 0) {
      for (j = i+1; j < n; j++) {
        /* form K = u'Q */
        for (K = 0, k = i+1; k < n; k++) K += x[k]*m[k*n + j];
        /* Q = Q - K (u/H)  */
        for (k = i+1; k < n; k++) m[k*n + j] -= K*m[k*n + i];
      }
    }
    /* copy the diagonal element and proceed */
    d[i] = m[i*n + i];
    m[i*n + i] = 1;
    for (j = i+1; j < n; j++) m[i*n + j] = m[j*n + i] = 0.;
  }
}


/* diagonize the tridiagonal matrix by QR algorithm,
   whose diagonal is d[0..n-1], off-diagonal is e[0..n-2];
 * reduce from the left-top to right-left */
static void eigtriqr(real d[], real e[], int n, real *mat)
{
  const int itermax = 1000;
  int i, j, k, m, iter, sgn;
  real ks = 0, r, c, s, delta, f, t1, t2;

  e[n-1] = 0;

  for (i = 0; i < n; i++) {
    /* for each eigenvalue */
    for (iter = 0; iter < itermax; iter++) {
      /* Look for a single small subdiagonal element to split the matrix */
      for (m = i; m < n-1; m++) {
        if (fabs(e[m]) < (fabs(d[m+1])+fabs(d[m]))*1e-12)
          break;
      }

      /* I have isolated d[i] from other matrix elements
       * so that d[i] is the eigenvalue.
       * stop iteration and look for next(i+1) eigenvalue  */
      if (m == i) break;

      /* form shift ks */
      delta = d[m]-d[m-1];
      sgn = ((delta > 0) ? 1: -1);
      delta /= e[m-1];
      r = (real) dblhypot(delta, 1);
      ks = d[m] + sgn*e[m-1]/(r + (real) fabs(delta));

      /* Rotations */
      for (j = i; j <= m-1; j++) {
       /* calculate c and s */
       if (j == i) {
         /* First rotation */
         r = (real) dblhypot(d[i]-ks, e[i]);
         c = (d[i]-ks)/r;
         s = e[i]/r;
       } else {
         /* Givens rotations */
         r = (real) dblhypot(e[j-1], f);
         c = e[j-1]/r;
         s = f/r;
         e[j-1] = r;
       }

       /* update the diagonal and off-diagonal elements */
       r = s*(d[j+1]-d[j]) + 2*c*e[j];
       d[j]   += s*r;
       d[j+1] -= s*r;
       e[j]    = c*r - e[j];
       f       = s*e[j+1];
       e[j+1] *= c;

       /* update eigenvectors */
       for (k = 0; k < n; k++) {
         t1 = mat[k*n + j];
         t2 = mat[k*n + j+1];
         mat[k*n + j]   = c*t1+s*t2;
         mat[k*n + j+1] = -s*t1+c*t2;
       }
      } /* end of rotations */
    } /* end for iteration */
    /*printf("Iterate %d times for %d'th eigenvalue.\n", iter, i);*/
  }/* end for each eigenvalue */
}

/* sort eigen values and vectors into ascending order */
static void eigsort(real *d, real *v, int n)
{
  int i, j, im;
  real max, tmp;

  for (i = 0; i < n - 1; i++) {
    /* search the maximal eigenvalue */
    for (max = d[i], im = i, j = i+1; j < n; j++) {
      if (d[j] > max) max = d[im = j];
    }
    if (im != i) { /* change column im and i */
      tmp = d[i], d[i] = d[im], d[im] = tmp;
      for (j = 0; j < n; j++)
        tmp = v[j*n+i], v[j*n+i] = v[j*n+im], v[j*n+im] = tmp;
    }
  }
}

/* solve eigensystem of a real symmetric matrix `mat',
 * eigenvalues saved to `d', eigenvectors to v */
int eigsym(real *mat, real *d, real *v, int n)
{
  real *e;
  int i;

  xnew(e, n);
  for (i = 0; i < n*n; i++) v[i] = mat[i];
  tridiag(v, d, e, n);
  eigtriqr(d, e, n, v);
  eigsort(d, v, n);
  free(e);
  return 0;
}

#endif /* ZCOM_EIG__ */
#endif /* ZCOM_EIG */

#ifdef  ZCOM_LU
#ifndef ZCOM_LU__
#define ZCOM_LU__

/* LU decomposition part  */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

STRCLS int lusolve(real *a, real *b, int n);


/* solve A x = b by L U decomposition */
int lusolve(real *a, real *b, int n)
{
  int i, j, k, imax = 0;
  real x, max;
  const real mintol = 1e-16; /* absolute minimal value for a pivot */

  for (i = 0; i < n; i++) {  /* normalize each equation */
    for (max = 0.0, j = 0; j < n; j++)
      if ((x = fabs(a[i*n+j])) > max)
        max = x;
    if (max < mintol) {
      return 1;
    }
    for (x = 1.0/max, j = 0; j < n; j++)
      a[i*n+j] *= x;
    b[i] *= x;
  }

  /* step 1: A = L U, column by column */
  for (j = 0; j < n; j++) {
    /* matrix U */
    for (i = 0; i < j; i++) {
      for (x = a[i*n+j], k = 0; k < i; k++)
        x -= a[i*n+k]*a[k*n+j];
      a[i*n+j] = x;
    }

    /* matrix L, diagonal of L are 1 */
    max = 0.0;
    for (i = j; i < n; i++) {
      for (x = a[i*n+j], k = 0; k < j; k++)
        x -= a[i*n+k]*a[k*n+j];
      a[i*n+j] = x;
      if (fabs(x) >= max) {
        max = fabs(x);
        imax = i;
      }
    }

    if (j != imax) { /* swap the pivot row with the jth row */
      for (k = 0; k < n; k++)
        x = a[imax*n+k], a[imax*n+k] = a[j*n+k], a[j*n+k] = x;
      x = b[imax]; b[imax] = b[j]; b[j] = x;
    }
    if (fabs(a[j*n+j]) < mintol)
      return 2;
    /* divide by the pivot element, for the L matrix */
    if (j != n-1)
      for (x = 1.0/a[j*n+j], i = j+1; i < n; i++)
        a[i*n+j] *= x;
  }

  /* step2: solve the equation L U x = b */
  for (i = 0; i < n; i++) { /* L y = b */
    x = b[i];
    for (j = 0; j < i; j++) x -= a[i*n+j]*b[j];
    b[i] = x;
  }
  for (i = n-1; i >= 0; i--) { /* U x = y. */
    x = b[i];
    for (j = i+1; j < n; j++) x -= a[i*n+j]*b[j];
    b[i] = x/a[i*n+i];
  }
  return 0;
}

#endif /* ZCOM_LU__ */
#endif /* ZCOM_LU */

#ifdef  ZCOM_SVD
#ifndef ZCOM_SVD__
#define ZCOM_SVD__


#include <stdio.h>
#include <stdlib.h>
#include <math.h>

STRCLS int svd(real *a, real *s, real *v, int m, int n);
STRCLS int svdback(real *u, real *w, real *v, int m, int n, real *x, real *b);
STRCLS int svdsolve(real *a, real *x, real *b, int n, real tol);


/* singular value decomposition of mxn matrix `a'
 * a[m*n] (or u[m*n] on return), w[n], v[n*n] */
int svd(real *a, real *w, real *v, int m, int n)
{
  int flag, i, it, j, jj, k, l, nm;
  real c, f, h, s, x, y, z;
  real anorm = 0.0, g, scl;
  real *rv1;

  die_if (m < n, "ERROR: m %d < n %d\n", m, n);
  xnew(rv1, n);

  /* Householder reduction to bidiagonal form */
  for (g = s = scl = 0., i = 0; i < n; i++) {
    /* left-hand reduction */
    l = i + 1;
    rv1[i] = scl * g;
    g = s = scl = 0.0;
    if (i < m) {
      for (k = i; k < m; k++)
        scl += fabs(a[k*n+i]);
      if (scl > 0.) {
        for (k = i; k < m; k++) {
          a[k*n+i] = x = a[k*n+i]/scl;
          s += x*x;
        }
        f = a[i*n+i];
        g = (f > 0.) ? -sqrt(s) : sqrt(s);
        h = f * g - s;
        a[i*n+i] = f - g;
        if (i != n - 1) {
          for (j = l; j < n; j++) {
            for (s = 0.0, k = i; k < m; k++)
              s += a[k*n+i] * a[k*n+j];
            f = s / h;
            for (k = i; k < m; k++)
              a[k*n+j] += f * a[k*n+i];
          }
        }
        for (k = i; k < m; k++)
          a[k*n+i] = a[k*n+i]*scl;
      }
    }
    w[i] = scl*g;

    /* right-hand reduction */
    g = s = scl = 0.0;
    if (i < m && i != n - 1) {
      for (k = l; k < n; k++)
        scl += fabs(a[i*n+k]);
      if (scl > 0.) {
        for (k = l; k < n; k++) {
          a[i*n+k] = x = a[i*n+k]/scl;
          s += x*x;
        }
        f = a[i*n+l];
        g = (f > 0.) ? -sqrt(s) : sqrt(s);
        h = f * g - s;
        a[i*n+l] = f - g;
        for (k = l; k < n; k++)
          rv1[k] = a[i*n+k] / h;
        if (i != m - 1) {
          for (j = l; j < m; j++) {
            for (s = 0.0, k = l; k < n; k++)
              s += a[j*n+k] * a[i*n+k];
            for (k = l; k < n; k++)
              a[j*n+k] += s * rv1[k];
          }
        }
        for (k = l; k < n; k++)
          a[i*n+k] *= scl;
      }
    }
    x = fabs(w[i]) + fabs(rv1[i]);
    if (x > anorm) anorm = x;
  }

  /* accumulate the right-hand transformation */
  for (i = n - 1; i >= 0; i--) {
    if (i < n - 1) {
        if (g != 0.) {
            for (j = l; j < n; j++)
                v[j*n+i] = ((a[i*n+j] / a[i*n+l]) / g);
                /* real division to avoid underflow */
            for (j = l; j < n; j++) {
                for (s = 0.0, k = l; k < n; k++)
                    s += (a[i*n+k] * v[k*n+j]);
                for (k = l; k < n; k++)
                    v[k*n+j] += (s * v[k*n+i]);
            }
        }
        for (j = l; j < n; j++)
            v[i*n+j] = v[j*n+i] = 0.0;
    }
    v[i*n+i] = 1.0;
    g = rv1[i];
    l = i;
  }

  /* accumulate the left-hand transformation */
  for (i = n - 1; i >= 0; i--) {
    l = i + 1;
    g = w[i];
    if (i < n - 1)
      for (j = l; j < n; j++) a[i*n+j] = 0.0;
    if (g != 0.) {
      g = 1.0 / g;
      if (i != n - 1) {
        for (j = l; j < n; j++) {
          for (s = 0.0, k = l; k < m; k++)
            s += (a[k*n+i] * a[k*n+j]);
          f = s/a[i*n+i]*g;
          for (k = i; k < m; k++)
            a[k*n+j] += f*a[k*n+i];
        }
      }
      for (j = i; j < m; j++)
        a[j*n+i] = a[j*n+i]*g;
    } else {
      for (j = i; j < m; j++) a[j*n+i] = 0.0;
    }
    a[i*n+i] += 1.;
  }

  /* diagonalize the bidiagonal form */
  for (k = n - 1; k >= 0; k--) { /* loop over singular values */
    for (it = 0; it < 200; it++) { /* loop over allowed iterations */
      flag = 1;
      for (l = k; l >= 0; l--) { /* test for splitting */
        nm = l - 1;
        if (fabs(rv1[l]) + anorm == anorm) {
          flag = 0;
          break;
        }
        if (fabs(w[nm]) + anorm == anorm)
          break;
      }
      if (flag) {
        c = 0.0;
        s = 1.0;
        for (i = l; i <= k; i++) {
          f = s * rv1[i];
          if (fabs(f) + anorm == anorm) continue;
          g = w[i];
          h = dblhypot(f, g);
          w[i] = h;
          h = 1.0 / h;
          c = g * h;
          s = (- f * h);
          for (j = 0; j < m; j++) {
            y = a[j*n+nm];
            z = a[j*n+i];
            a[j*n+nm] = y * c + z * s;
            a[j*n+i] = z * c - y * s;
          }
        }
      }
      z = w[k];
      if (l == k) { /* convergence */
        if (z < 0.0) { /* flip sign of w */
          w[k] = -z;
          for (j = 0; j < n; j++)
            v[j*n+k] = -v[j*n+k];
        }
        break;
      }
      if (it >= 200) {
        free(rv1);
        fprintf(stderr, "svd: failed to converge\n");
        return -1;
      }

      /* shift from bottom 2 x 2 minor */
      x = w[l];
      nm = k - 1;
      y = w[nm];
      g = rv1[nm];
      h = rv1[k];
      f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
      g = dblhypot(f, 1.0);
      if (f < 0.) g = -g;
      f = ((x - z) * (x + z) + h * (y/(f + g) - h)) / x;

      /* next QR transformation */
      c = s = 1.0;
      for (j = l; j <= nm; j++) {
        i = j + 1;
        g = rv1[i];
        y = w[i];
        h = s * g;
        g = c * g;
        z = dblhypot(f, h);
        rv1[j] = z;
        c = f / z;
        s = h / z;
        f = x * c + g * s;
        g = g * c - x * s;
        h = y * s;
        y = y * c;
        for (jj = 0; jj < n; jj++) {
          x = v[jj*n+j];
          z = v[jj*n+i];
          v[jj*n+j] = x * c + z * s;
          v[jj*n+i] = z * c - x * s;
        }
        w[j] = z = dblhypot(f, h);
        if (z > 0.) { c = f/z; s = h/z; }
        f = c * g + s * y;
        x = c * y - s * g;
        for (jj = 0; jj < m; jj++) {
          y = a[jj*n+j];
          z = a[jj*n+i];
          a[jj*n+j] = y * c + z * s;
          a[jj*n+i] = z * c - y * s;
        }
      }
      rv1[l] = 0.0;
      rv1[k] = f;
      w[k] = x;
    }
  }
  free(rv1);
  return 0;
}

int svdback(real *u, real *w, real *v, int m, int n, real *x, real *b)
{
  int i, j;
  real *b1, y;

  xnew(b1, n);
  for (i = 0; i < n; i++) {
    if (w[i] <= 0.) { b1[i] = 0.; continue; }
    for (y = 0, j = 0; j < m; j++)
      y += u[j*n+i]*b[j];
    b1[i] = y/w[i];
  }
  for (i = 0; i < n; i++) {
    for (y = 0., j = 0; j < n; j++)
      y += v[i*n+j]*b1[j];
    x[i] = y;
  }
  free(b1);
  return 0;
}

int svdsolve(real *a, real *x, real *b, int n, real rerr)
{
  int i;
  real *u, *v, *w, wmax, wmin;

  xnew(w, n); xnew(u, n*n); xnew(v, n*n);
  for (i = 0; i < n*n; i++) u[i] = a[i];
  svd(u, w, v, n, n);
  for (wmax = 0., i = 0; i < n; i++)
    if (w[i] > wmax) wmax = w[i];
  for (wmin = wmax*rerr, i = 0; i < n; i++)
    if (w[i] < wmin) w[i] = wmin;
  for (i = 0; i < n; i++) printf("%g  ", w[i]);
  printf("\n");
  svdback(u, w, v, n, n, x, b);
  free(u); free(v); free(w);
  return 0;
}

#endif /* ZCOM_SVD__ */
#endif /* ZCOM_SVD */

#ifdef  ZCOM_ROTFIT
#ifndef ZCOM_ROTFIT__
#define ZCOM_ROTFIT__

STRCLS real rotfit3(rv3_t *x, rv3_t *xf, rv3_t *y, const real *w, int n,
    real (*r)[3], real *t);


/* least square fit x to y after rotation/translation */
real rotfit3(rv3_t *x, rv3_t *xf, rv3_t *y, const real *w, int n,
    real (*r)[3], real *t)
{
  int i;
  real wtot = 0, sq, dev = 0, detm;
  rv3_t xc, yc, xs, ys, sig, t_;
  real u[3][3], v[3][3], s[3][3] = {{0,0,0},{0,0,0},{0,0,0}}, xy[3][3], r_[3][3];

  if (r == NULL) r = r_;
  if (t == NULL) t = t_;

  /* 1. compute the centers */
  rv3_zero(xc);
  rv3_zero(yc);
  if (w == NULL) {
    for (i = 0; i < n; i++) {
      rv3_inc(xc, x[i]);
      rv3_inc(yc, y[i]);
    }
    wtot = (real) n;
  } else {
    for (wtot = 0., i = 0; i < n; i++) {
      rv3_sinc(xc, x[i], w[i]);
      rv3_sinc(yc, y[i], w[i]);
      wtot += w[i];
    }
  }
  rv3_smul(xc, 1.f/wtot);
  rv3_smul(yc, 1.f/wtot);

  /* 2. compute 3x3 asymmetric covarience matrix S = (x-xc) (y-yc)^T */
  for (i = 0; i < n; i++) {
    rv3_diff(xs, x[i], xc); /* shift to the center avoid the translation */
    rv3_diff(ys, y[i], yc);
    rm3_vtv(xy, xs, ys);
    sq  = rv3_sqr(xs);
    sq += rv3_sqr(ys);
    if (w) {
      rm3_sinc(s, xy, w[i]);
      dev += w[i]*sq;
    } else {
      rm3_inc(s, xy);
      dev += sq; /* Tr(x^T x + y^T y) */
    }
  }

  /* 3. SVD decompose S = u sig v^T */
  rm3_svd(s, u, sig, v);

  /* 4. compute R = v u^T */
  rm3_mult(r, v, u);
  detm = rm3_det(r);

#define rotfit3_dump_() { \
    printf("fatal error!, detm = %g\n", detm); \
    rv3_print(sig, "sig", "%8.3f", 1); \
    rm3_print(r, "r", "%8.3f", 1); \
    printf("det(r) = %g\n", rm3_det(r)); \
    rm3_mult(r, u, v); rm3_print(r, "rx", "%8.3f", 1); \
    printf("det(rx) = %g\n", rm3_det(r)); \
    rm3_print(u, "u", "%8.3f", 1); \
    printf("det(u) = %g\n", rm3_det(u)); \
    rm3_print(v, "v", "%8.3f", 1); \
    printf("det(v) = %g\n", rm3_det(v)); \
    rm3_print(s, "s", "%12.3f", 1); \
    printf("det(s) = %g\n", rm3_det(s)); \
    exit(1); }
  if (fabs(fabs(detm) - 1) > 0.1) rotfit3_dump_();
  if (detm < 0) { /* to avoid a reflection */
    rm3_trans(u);
    rv3_neg(u[2]); /* flip the last eigenvector */
    rm3_mul(r, v, u);
    dev -= 2*(sig[0]+sig[1]-sig[2]);
    detm = rm3_det(r);
    if (fabs(fabs(detm) - 1) > 0.1) rotfit3_dump_();
#undef rotfit3_dump_
  } else {
    dev -= 2*(sig[0]+sig[1]+sig[2]); /* -2 Tr(R x y^T) */
  }
  if (dev < 0) dev = 0;
  rv3_diff(t, yc, rm3_mulvec(xs, r, xc)); /* t = yc - R xc */

  /* 5. compute the rotated structure */
  if (xf) {
    for (dev = 0, i = 0; i < n; i++) {
      rv3_add(xf[i], rm3_mulvec(xs, r, x[i]), t); /* xf = R x + t */
      sq = rv3_dist2(y[i], xf[i]);
      dev +=  (w ? w[i]*sq : sq); /* recompute the deviation */
    }
  }
  return (real) sqrt(dev/wtot);
}

#endif /* ZCOM_ROTFIT__ */
#endif /* ZCOM_ROTFIT */

#ifdef  ZCOM_SAVGOL
#ifndef ZCOM_SAVGOL__
#define ZCOM_SAVGOL__


#include <stdio.h>
#include <stdlib.h>
#include <math.h>

STRCLS double *savgol(int w, int ord, int der, int h, int verbose);
STRCLS double *savgol2d(int iw, int jw, int ord, int h, int verbose);


/* compute 1d Savitzky-Golay coefficients
 * der == 0 for function itself, 1 for first-order derivative */
double *savgol(int w, int ord, int der, int h, int verbose)
{
  int i, i0, i1, ox, oy, nop, orm, npt;
  double x, xk, y;
  double *mmt, *b, *mat, *x2m, *c;

  nop = ord+1;
  orm = 2*ord+1;
  npt = h ? (2*w) : (2*w+1);
  i0 = -w;
  i1 = h ? w : (w+1);
  if ((c = calloc(npt, sizeof(double))) == NULL)
    return NULL;
  if ((b = calloc(nop, sizeof(double))) == NULL) {
    free(c);
    return NULL;
  }
  if ((mat = calloc(nop*nop, sizeof(double))) == NULL) {
    free(b); free(c);
    return NULL;
  }
  if ((mmt = calloc(orm, sizeof(double))) == NULL) {
    free(mat); free(b); free(c);
    return NULL;
  }
  if ((x2m = calloc(nop*npt, sizeof(double))) == NULL) {
    free(mmt); free(mat); free(b); free(c);
    return NULL;
  }
  if (der > ord) {
    fprintf(stderr, "no %dth order of derivative, order = %d\n", der, ord);
    return NULL;
  }

  for (i = 0; i < orm; i++) mmt[i] = 0.;
  for (i = i0; i < i1; i++) {
    x = h ? (i + .5) : i;
    /* mmt[k] = < x^k > */
    for (xk = 1., ox = 0; ox < orm; ox++, xk *= x)
      mmt[ox] += xk;
    /* x2m[k*npt + x] = x^k */
    for (xk = 1., ox = 0; ox <= ord; ox++, xk *= x)
      x2m[ox*npt + (i - i0)] = xk;
  }

  /* install matrix from moments */
  for (ox = 0; ox < nop; ox++)
    for (oy = 0; oy < nop; oy++)
      mat[ox*nop + oy] = mmt[ox+oy];

  /* we approximate y(x) = a0 + a1 x + a2 x^2 + ...
   * mat.a = b = x2m.y, or a = mat^(-1).b
   * since mat is symmetrical, rows == columns,
   * we thus extract the first row by solving b = {1, 0, 0, ...} */
  for (i = 0; i < nop; i++) b[i] = 0;
  b[der] = 1.;
  i = lusolve(mat, b, nop);
  if (i != 0) {
    fprintf(stderr, "unable to inverse matrix!\n");
    return NULL;
  }
  /* c = mat^(-1).x2m */
  for (i = 0; i < npt; i++) {
    for (y = 0, ox = 0; ox < nop; ox++)
      y += b[ox]*x2m[ox*npt + i];
    c[i] = y;
  }
  free(x2m);
  free(mmt);
  free(mat);
  free(b);
  if (verbose) {
    for (i = 0; i < npt; i++)
      printf("%g\t", c[i]);
    printf("\n");
  }
  return c;
}

/* compute 2d Savitzky-Golay coefficients
 * h means if it is a histogram */
double *savgol2d(int iw, int jw, int ord, int h, int verbose)
{
  int i, j, i0, i1, j0, j1, id, nop, orm, npt;
  int io, iq, ox, oy, o1, o2, o3, o4;
  double w, x, y, xk, xyk;
  double *mmt, *b, *mat, *x2m, *c;

  nop = (ord+1)*(ord+2)/2;
  orm = 2*ord+1;
  i0 = -iw;
  j0 = -jw;
  if (h) { /* for histogram */
    npt = (2*iw)*(2*jw);
    i1 = iw;
    j1 = jw;
  } else {
    npt = (2*iw+1)*(2*jw+1);
    i1 = iw + 1;
    j1 = jw + 1;
  }
  if ((c = calloc(npt, sizeof(double))) == NULL)
    return NULL;
  if ((b = calloc(nop, sizeof(double))) == NULL) {
    free(c);
    return NULL;
  }
  if ((mat = calloc(nop*nop, sizeof(double))) == NULL) {
    free(b); free(c);
    return NULL;
  }
  if ((mmt = calloc(orm*orm, sizeof(double))) == NULL) {
    free(mat); free(b); free(c);
    return NULL;
  }
  if ((x2m = calloc(nop*npt, sizeof(double))) == NULL) {
    free(mmt); free(mat); free(b); free(c);
    return NULL;
  }

  for (i = 0; i < orm*orm; i++) mmt[i] = 0.;
  for (i = i0; i < i1; i++) {
    x = h ? (i + .5) : i;
    for (j = j0; j < j1; j++) {
      y = h ? (j + .5) : j;
      w = 1.;
      /* moment matrix */
      xk = w;
      for (ox = 0; ox < orm; ox++) {
        xyk = xk;
        for (oy = 0; oy < orm-ox; oy++) {
          mmt[ox*orm + oy] += xyk;
          xyk *= y;
        }
        xk *= x;
      }
      /* position to z-moment matrix */
      id = (i - i0)*(j1 - j0) + (j - j0);
      for (io = 0, o1 = 0; o1 <= ord; o1++) {
        for (o2 = 0; o2 <= o1; o2++, io++) {
          xyk = w;
          for (ox = 0; ox < o1 - o2; ox++) xyk *= x;
          for (oy = 0; oy < o2; oy++) xyk *= y;
          x2m[io*npt + id] = xyk;
        }
      }
    }
  }

  /* install matrix from moments */
  for (io = 0, o1 = 0; o1 <= ord; o1++)
  for (o2 = 0; o2 <= o1; o2++, io++) {
    /* x^(o1-o2) y^o2 */
    for (iq = 0, o3 = 0; o3 <= ord; o3++)
    for (o4 = 0; o4 <= o3; o4++, iq++) {
      /* x^(o3-o4) y^o4 */
      ox = o3 - o4 + o1 - o2;
      oy = o4 + o2;
      mat[io*nop + iq] = mmt[ox*orm + oy];
    }
  }

  for (i = 0; i < nop; i++) b[i] = 0.;
  b[0] = 1.;
  i = lusolve(mat, b, nop);
  if (i != 0) {
    fprintf(stderr, "unable to inverse matrix!\n");
    return NULL;
  }
  for (i = 0; i < npt; i++) {
    for (y = 0, io = 0; io < nop; io++)
      y += b[io]*x2m[io*npt + i];
    c[i] = y;
  }
  free(x2m);
  free(mmt);
  free(mat);
  free(b);
  if (verbose) {
    for (i = i0; i < i1; i++) {
      for (j = j0; j < j1; j++) {
        printf("%7.4f ", c[(i-i0)*(j1 - j0)+(j-j0)]);
      }
      printf("\n");
    }
  }
  return c;
}

#endif /* ZCOM_SAVGOL__ */
#endif /* ZCOM_SAVGOL */

#ifdef  ZCOM_SPECFUNC
#ifndef ZCOM_SPECFUNC__
#define ZCOM_SPECFUNC__

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

STRCLS double lngam(double z);
STRCLS double ksq(double x);
STRCLS double plegendre(double x, int l, int m);


/* returns log(Gamma(z)) */
double lngam(double z)
{
  int i;
  double xp, zhg;
  static const double gh = 671./128, sqrt2pi = 2.506628274631000242,
    c[15] = {0.999999999999997092, 57.1562356658629235,-59.5979603554754912,
    14.1360979747417471,-0.491913816097620199,.339946499848118887e-4,
    .465236289270485756e-4,-.983744753048795646e-4,.158088703224912494e-3,
    -.210264441724104883e-3,.217439618115212643e-3,-.164318106536763890e-3,
    .844182239838527433e-4,-.261908384015814087e-4,.368991826595316234e-5};

  if (z <= 0.) {
    fprintf(stderr, "neg. arg. for gamma(x)\n");
    exit(1);
  }
  for (xp = c[0], i = 1; i < 15; i++)
    xp += c[i]/(z + i);
  zhg = z + gh;
  return (z+.5)*log(zhg) - zhg + log(sqrt2pi*xp/z); /* gamma(z) = gamma(z+1)/z */
}

/* return the p-value, or 1 - cdf(x), for KS distribution */
double ksq(double x)
{
  double y;
  if (x < 0) {
    fprintf(stderr, "neg. arg. for ksq(x)\n");
    exit(1);
  }
  if (x < 1e-15) {
    return 1.;
  } else if (x < 1.18) {
    x = 1.110720734539591525/x;
    y = exp(-x*x);
    return 1. - 2.25675833419102515*x*(y+pow(y,9)+pow(y,25)+pow(y,49));
  } else {
    y = exp(-x*x*2.);
    return 2.*(y - pow(y, 4) + pow(y, 9));
  }
}

/* normalized associated legendre polynomial
 * nplm(x) = sqrt( (2l+1)/(4 pi) (l-m)!/(l+m)!) P_l^m(x)
 * real solution of m <= l, l, m >= 0
 * (1 - x^2) y'' - 2 x y' + [l(l+1) - m^2/(1-x^2)] y = 0 */
double plegendre(double x, int l, int m)
{
  int i;
  double y, yp, ypp, f, fp, s = 1 - x*x;

  if (m < 0 || m > l || s < 0) return 0;
  for (yp = 1, i = 1; i <= m; i++) yp *= (1 + .5/i)*s;
  yp = sqrt(yp/(4*M_PI)) * (m % 2 ? -1: 1); /* P(m, m) */
  /* (l-m) P_l^m = x (2l-1) P_{l-1}^m - (l+m-1)*P_{l-2}^m */
  for (fp = 1, ypp = 0, i = m + 1; i <= l; i++, fp = f, ypp = yp, yp = y) {
    f = sqrt((4.*i*i-1)/((i-m)*(i+m)));
    y = f*(x*yp - ypp/fp);
  }
  return yp;
}

#endif /* ZCOM_SPECFUNC__ */
#endif /* ZCOM_SPECFUNC */

#ifdef  ZCOM_ARGOPT
#ifndef ZCOM_ARGOPT__
#define ZCOM_ARGOPT__
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

typedef struct {
  int isopt; /* is option or argument */
  char ch; /* single letter option flag */
  const char *sflag; /* long string flag */

  const char *val; /* raw string from command line */
  const char *desc; /* description */
  const char *fmt; /* sscanf format */
  const char *pfmt; /* printf format, NULL: to guess */
  void *ptr; /* address to the target variable */
  unsigned flags;
} opt_t;

typedef struct {
  int narg, nopt;
  opt_t *args;
  opt_t *opts;
  const char *prog;
  const char *desc;
  const char *author;
  const struct tm *tm; /* compilation time */
  int version;
  unsigned flags;
  int dum_[4]; /* space holder */
} argopt_t;

#define ARGOPT_MUST     0x0001  /* a mandatory argument or option */
#define ARGOPT_SWITCH   0x0002  /* an option is a switch, fmt "%b" */
#define ARGOPT_SET      0x0004  /* an argument/option is set, fmt starts with "!" */
#define ARGOPT_LONGOPT  0x0010  /* always assume long format, e.g., -maxh */

STRCLS argopt_t *argopt_open(unsigned flags);
STRCLS void argopt_close(argopt_t *ao);
#define argopt_regarg(ao, fmt, ptr, desc) argopt_regopt(ao, NULL, fmt, ptr, desc)
STRCLS int argopt_regopt(argopt_t *ao, const char *sflag,
    const char *fmt, void *ptr, const char *desc);
STRCLS void argopt_parse(argopt_t *ao, int argc, char **argv);

#define argopt_reghelp(ao, sflag) argopt_regopt(ao, sflag, "%b", ao->dum_, "$HELP")
#define argopt_regversion(ao, sflag) argopt_regopt(ao, sflag, "%b", ao->dum_, "$VERSION")

#define argopt_getopt(ao, p) argopt_searchls_(ao->opts, ao->nopt, p)
#define argopt_getarg(ao, p) argopt_searchls_(ao->args, ao->narg, p)
INLINE opt_t *argopt_searchls_(opt_t *ls, int n, const void *p)
 { int i; for (i = 0; i < n; i++) if (ls[i].ptr == p) return ls+i; return NULL; }

#define argopt_set(ao, var) argopt_set_(ao, &var, #var)
INLINE int argopt_set_(argopt_t *ao, const void *p, const char *var)
{
   opt_t *a = argopt_getarg(ao, p), *o = argopt_getopt(ao, p);
   die_if(!a && !o, "cannot locate var %s, %p\n", var, p);
   return a ? (a->flags & ARGOPT_SET ? 1 : 0) : (o->flags & ARGOPT_SET ? 1 : 0);
}


/* initialize the argument structure */
argopt_t *argopt_open(unsigned flags)
{
  argopt_t *ao;
  time_t tmcmpl;

  xnew(ao, 1);
  ao->flags = flags;
  ao->narg = ao->nopt = 0;
  ao->args = ao->opts = NULL;
  tmcmpl = time(NULL);
  ao->tm = localtime( &tmcmpl );
  memset(ao->dum_, '\0', sizeof(ao->dum_));
  return ao;
}

void argopt_close(argopt_t *ao)
{
  if (ao->args) { free(ao->args); ao->args = NULL; }
  if (ao->opts) { free(ao->opts); ao->opts = NULL; }
  free(ao);
}

/* print version and die */
static void argopt_version(argopt_t *ao)
{
  printf("%s: %s, version %d\n",
      ao->prog, ao->desc ? ao->desc : "", ao->version);
  if (ao->author && ao->tm)
    printf("Copyright (c) %s %d\n", ao->author, ao->tm->tm_year + 1900);
  argopt_close(ao);
  exit(1);
}

/* print help message and die */
static void argopt_help(argopt_t *ao)
{
  int i, len, maxlen;
  opt_t *lo;
  const char *sysopt[2] = {"print help message", "print version"}, *desc, *fmt;

  printf("%s, version %d",
      ao->desc ? ao->desc : ao->prog, ao->version);
  if (ao->author && ao->tm)
    printf(", Copyright (c) %s %d", ao->author, ao->tm->tm_year + 1900);
  printf("\nUSAGE\n  %s [OPTIONS]", ao->prog);
  for (i = 0; i < ao->narg; i++) {
    const char *bra = "", *ket = "";
    lo = ao->args + i;
    if (lo->flags & ARGOPT_MUST) {
      if (strchr(lo->desc, ' '))
        bra = "{", ket = "}";
    } else
      bra = "[", ket = "]";
    printf(" %s%s%s", bra, lo->desc, ket);
  }
  printf("\n");

  printf("OPTIONS:\n") ;
  for (maxlen = 0, i = 0; i < ao->nopt; i++) {
    len =  strlen(ao->opts[i].sflag);
    if (len > maxlen) maxlen = len;
  }
  for (i = 0; i < ao->nopt; i++) {
    lo = ao->opts + i;
    desc = lo->desc;
    if (strcmp(desc, "$HELP") == 0)
      desc = sysopt[0];
    else if (strcmp(desc, "$VERSION") == 0)
      desc = sysopt[1];
    printf("  %-*s : %s%s", maxlen, lo->sflag,
        (!(lo->flags & ARGOPT_SWITCH) ? "followed by " : ""), desc);
    if (lo->ptr && lo->ptr != &ao->dum_) { /* print default values */
      printf(", default: ");
      for (fmt = lo->fmt; *fmt && *fmt != '%'; fmt++) ;
#define ELIF_PF_(fm, fmp, type) else if (strcmp(fmt, fm) == 0) printf((lo->pfmt ? lo->pfmt : fmp), *(type *)lo->ptr)
      if (fmt == NULL || *fmt == '\0') printf("%s", (*(char **)lo->ptr) ? (*(char **)lo->ptr) : "NULL");
      ELIF_PF_("%b", "%d", int); /* switch */
      ELIF_PF_("%d", "%d", int);
      ELIF_PF_("%u", "%u", unsigned);
      ELIF_PF_("%x", "0x%x", unsigned);
      ELIF_PF_("%ld", "%ld", long);
      ELIF_PF_("%lu", "%lu", unsigned long);
      ELIF_PF_("%lx", "0x%lx", unsigned long);
#if 0  /* C99 only */
      ELIF_PF_("%lld", "%lld", long long);
      ELIF_PF_("%llu", "%llu", unsigned long long);
      ELIF_PF_("%llx", "0x%llx", unsigned long long);
#endif
      ELIF_PF_("%f", "%g", float);
      ELIF_PF_("%lf", "%g", double);
      ELIF_PF_("%r", "%g", real);
      else printf("unknown %s-->%%d: %d\n", fmt, *(int *)lo->ptr);
#undef ELIF_PF_
    }
    printf("\n");
  }
  argopt_close(ao);
  exit(1);
}

/* register option: fmt = "%b" for a switch */
int argopt_regopt(argopt_t *ao, const char *sflag,
    const char *fmt, void *ptr, const char *desc)
{
  opt_t *ol;
  int n;

  if (sflag) { /* option */
    n = ao->nopt++;
    xrenew(ao->opts, ao->nopt);
    ol = ao->opts + n;
    ol->isopt = 1;
    ol->ch = (char) ( sflag[2] ? '\0' : sflag[1] ); /* no ch for a long flag */
  } else { /* argument */
    n = ao->narg++;
    xrenew(ao->args, ao->narg);
    ol = ao->args + n;
    ol->isopt = 0;
    ol->ch = '\0';
  }
  ol->sflag = sflag;
  ol->flags = 0;
  die_if (ptr == NULL, "null pass to argopt with %s: %s\n", sflag, desc);
  ol->ptr = ptr;
  if (fmt == NULL) fmt = "";
  if (fmt[0] == '!') {
    fmt++;
    ol->flags |= ARGOPT_MUST;
  }
  if (strcmp(fmt, "%b") == 0) {
    fmt = "%d";
    ol->flags |= ARGOPT_SWITCH;
  }
  ol->fmt = fmt;
  ol->pfmt = NULL;
  ol->desc = desc;
  return n;
}

/* translate string values to actual ones through sscanf() */
static int opt_getval_(opt_t *o)
{
  const char *fmt = o->fmt;

  if (fmt == NULL || fmt[0] == '\0') { /* raw string assignment */
    *(const char **)o->ptr = o->val;
  } else { /* call sscanf */
    int ret, ch;
    ch = fmt[strlen(fmt)-1];
    if (ch == 'r') /* real */
      fmt = (sizeof(real) == 4) ? "%f" : "%lf";
    ret = sscanf(o->val, fmt, o->ptr);
    if (ret != 1) {
      fprintf(stderr, "Error: unable to convert a value for [%s] as fmt [%s], raw string: [%s]\n",
          o->desc, fmt, o->val);
      return 1;
    }
  }
  return 0;
}

/* main parser of arguments */
void argopt_parse(argopt_t *ao, int argc, char **argv)
{
  int i, j, k, ch, acnt = 0;
  opt_t *al = ao->args;
  opt_t *ol = ao->opts;

  ao->prog = argv[0];
  for (i = 1; i < argc; i++) {
    if (argv[i][0] != '-') { /* it's an argument */
      if (acnt >= ao->narg) argopt_help(ao);
      al[acnt].val = argv[i];
      al[acnt].flags |= ARGOPT_SET;
      if (0 != opt_getval_(al+acnt))
        argopt_help(ao);
      ++acnt;
      continue;
    }

    /* it's an option, loop for abbreviated form "-abc" == "-a -b -c" */
    for (j = 1; (ch = argv[i][j]) != '\0'; j++) {
      int islong = (j == 1 && argv[i][1] == '-') | (ao->flags & ARGOPT_LONGOPT);

      if (islong) { /* compare against long options */
        for (k = 0; k < ao->nopt; k++)
          if (strncmp(argv[i], ol[k].sflag, strlen(ol[k].sflag)) == 0)
            break;
      } else { /* compare against short options */
        for (k = 0; k < ao->nopt; k++)
          if (ch == ol[k].ch)
            break;
      }
      if (k >= ao->nopt) {
        fprintf(stderr, "cannot handle option [%s]\n", argv[i]);
        argopt_help(ao);
      }

      if (ol[k].desc[0] == '$') { /* system commands */
        if (strcmp(ol[k].desc, "$HELP") == 0)
          argopt_help(ao);
        else if (strcmp(ol[k].desc, "$VERSION") == 0)
          argopt_version(ao);
      }

      if (ol[k].flags & ARGOPT_SWITCH) {
        ol[k].flags |= ARGOPT_SET;
        *(int *)ol[k].ptr = 1;
        if (islong) break; /* go to the next argument argv[i+1] */
      } else { /* look for the additional argument for this */
        int hasv = 0;
        if (islong) { /* e.g., --version=11 */
          j = strlen(ol[k].sflag);
          if (argv[i][ j ] == '=') {
            ol[k].val = argv[i] + j + 1;
            hasv = 1;
          }
        } else { /* e.g., -n8 */
          if (argv[i][++j]) {
            ol[k].val = argv[i] + j;
            hasv = 1;
          }
        }

        if (!hasv) { /* --version 11 or -n 8 */
          if (++i >= argc) {
            printf("%s(%s) requires an argument!\n", ol[k].sflag, argv[i-1]);
            argopt_help(ao);
          }
          ol[k].val = argv[i];
        }
        ol[k].flags |= ARGOPT_SET;
        if (0 != opt_getval_(ol+k)) argopt_help(ao);
        break; /* go to the next argument argv[i+1] */
      }
    } /* end of short option loop */
  }
  /* check if we have all mandatory arguments and options */
  for (i = 0; i < ao->narg; i++) {
    if ((al[i].flags & ARGOPT_MUST) && !(al[i].flags & ARGOPT_SET)) {
      printf("Error: missing argument %d: %s\n\n", i, al[i].desc);
      argopt_help(ao);
    }
  }
  for (i = 0; i < ao->nopt; i++) {
    if ((ol[i].flags & ARGOPT_MUST) && !(ol[i].flags & ARGOPT_SET)) {
      printf("Error: missing option %s: %s\n\n", ol[i].sflag, ol[i].desc);
      argopt_help(ao);
    }
  }
}

#endif /* ZCOM_ARGOPT__ */
#endif /* ZCOM_ARGOPT */

#ifdef  ZCOM_CFG
#ifndef ZCOM_CFG__
#define ZCOM_CFG__

/*
 * =====================================================================
 *
 * Configuration file
 *
 * =====================================================================
 *
 * to open a configuration file:
 *    cfg = cfgopen("your.cfg");
 * to load a parameter,
 *    cfgget(cfg, &var, "var_name", scanf_fmt);
 * to finish up:
 *    cfgclose(cfg);
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

typedef struct {
  size_t n;           /* number of lines */
  int    canfree;     /* whether the struct is dynamically allocated */
  char **key,**value; /* key[i] = value[i] */
  char  *buf;         /* the whole configuration file */
} cfgdata_t;

STRCLS cfgdata_t *cfgopen(const char *filenm);
STRCLS void cfgclose(cfgdata_t *cfg);
STRCLS int cfgget(cfgdata_t *cfg, void *var, const char *key, const char* fmt);


#define isspace_(c) isspace((unsigned char)(c))

/* load the whole configuration file into memory (return value);
 * parse it to entries, return 0 if successful */
static int cfgload_(cfgdata_t *cfg, const char *filenm)
{
  FILE *fp;
  size_t i, j, size = 0;
  char *p, *q, *lin;

  if ((fp = fopen(filenm, "r")) == NULL) {
    fprintf(stderr,"cannot open the configuration file [%s]\n", filenm);
    return 1;
  }

  if (ssfgetall(cfg->buf, &size, fp) == NULL) {
    fprintf(stderr, "error reading file %s\n", filenm);
    return 4;
  }
  sscat(cfg->buf, "\n"); /* in case the file is not ended by a new line, we add one */
  fclose(fp);


  /* count the number of lines for allocating the key-table */
  for (i = 0, cfg->n = 0; i < size; i++) {
    if (cfg->buf[i] == '\n' || cfg->buf[i] == '\r') {
      if (i > 0 && cfg->buf[i-1] == '\\') {
        /* allow multiple-line splicing by replacing cr, lf with spaces
           parse should be aware of these additional spaces */
        cfg->buf[i-1] = ' ';
        cfg->buf[i] = ' ';
      } else {
        cfg->buf[i] = '\0';
        (cfg->n)++;
      }

      for (j = i+1; j < size; j++) {
        /* we replace immediately followed cr & lf by spaces for
           efficiency (to avoid a large key table for blank lines) */
        if ( isspace_(cfg->buf[j]) ) {
          cfg->buf[j] = ' ';
        } else {
          break;
        }
        /* note: parser should be insensitive to leading spaces */
      }
    }
  }

  cfg->key   = calloc(cfg->n, sizeof(char*));
  cfg->value = calloc(cfg->n, sizeof(char*));

  /* load lines into the keytable, not parsed yet */
  for (p = q = cfg->buf, j = 0, i = 0; i < size; i++) {
    if (cfg->buf[i] == '\0') {
      cfg->key[j] = p;
      j++;
      p = cfg->buf+i+1;
      /* we may still have spaces left over, but no need to continue */
      if (j > cfg->n) break;
    }
  }
  cfg->n = j;

  /* now parse lines: separate values from keys */
  for (j = 0; j < cfg->n; j++) {
    lin = cfg->key[j];

    /* remove the the leading spaces */
    for (; *lin && isspace_(*lin); lin++) ;
    cfg->key[j] = lin;
    /* skip a blank or comment line */
    if (lin[0] == '\0' || strchr("#%!;", lin[0]) != NULL) {
      cfg->key[j] = NULL;
      continue;
    }

    /* remove trailing space and ';' */
    for (q = lin+strlen(lin)-1;
         q >= lin && (isspace_(*q)||*q == ';'); q--) *q = '\0';

    if ((q = strchr(lin, '=')) == NULL) { /* skip a line without '=' */
      cfg->key[j] = NULL;
      continue;
    }

    /* find the end of key --> 'q' */
    *q = '\0';
    p  = q + 1;
    for (--q; isspace_(*q); q--) *q = '\0';
    for (; (*p) && isspace_(*p); p++) ; /* skip leading space, 'p' -> value */
    cfg->value[j] = p;
  }

  return 0;
}

#undef isspace_

/* a wrapper of cfgload_ to make it more like fopen */
cfgdata_t *cfgopen(const char *filenm)
{
  cfgdata_t *cfg;

  if ((cfg = calloc(1, sizeof(*cfg))) == NULL) {
    fprintf(stderr, "cannot allocate space for cfgdata_t.\n");
    return NULL;
  }
  if (cfgload_(cfg, filenm) != 0) {
    free(cfg);
    return NULL;
  }
  cfg->canfree = 1; /* so it can be safely freed */
  return cfg;
}


void cfgclose(cfgdata_t *cfg)
{
  int canfree = cfg->canfree; /* save the value before memset */

  free(cfg->value);
  free(cfg->key);
  ssdelete(cfg->buf);
  memset(cfg, 0, sizeof(*cfg));
  if (canfree) /* free cfg if it is created by cfgopen */
    free(cfg);
}


/* Read the value of a given variable from the current configuration file,
 * the name of variable is given by `key',
 * If the key is matched, its value is saved to `*var' through sscanf,
 *   otherwise, the content in *var is not modified.
 * If the function succeeds, it returns 0.
 * A comment line in the configuration file starts with '#', '%' or '!'.
 * In case fmt is "%s", (*var) is a string, or a pointer to char.
 *   The space for (*var) will be managed through sscpy, which should *not*
 *   to be free'd.  Instead, ssdel should be called if really necessary.
 * */
int cfgget(cfgdata_t *cfg, void *var, const char *key, const char* fmt)
{
  size_t j;

  if (cfg == NULL) return 1;

  if (cfg->key == NULL || var == NULL || key == NULL || fmt == NULL) {
    fprintf(stderr, "cfgget: NULL pointer.\n");
    return 1;
  }

  for (j = 0; j < cfg->n; j++)
    if (cfg->key[j] != NULL && strcmp(cfg->key[j], key) == 0) {
      if (strcmp(fmt, "%s") == 0) { /* string case */
        sscpy( *(char **)var, cfg->value[j]); /* make a copy and return */
        return 0;
      } else { /* use sscanf for other cases, like int, float,... */
        if (EOF == sscanf(cfg->value[j], fmt, var))
          return 2; /* input error */
        else
          return 0;
      }
    }
  return 1; /* no match */
}


#endif /* ZCOM_CFG__ */
#endif /* ZCOM_CFG */

#ifdef  ZCOM_TRACE
#ifndef ZCOM_TRACE__
#define ZCOM_TRACE__


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>

#ifdef HAVEVAM  /* we use variadic macros if possible */

#define wtrace(fmt, ...)     wtrace_x(0, fmt, ## __VA_ARGS__)
#define wtrace_buf(fmt, ...) wtrace_x(1, fmt, ## __VA_ARGS__)
STRCLS int wtrace_x(int, const char*, ...);

#else /* otherwise default to the buffered version */

#define wtrace wtrace_buf
STRCLS int wtrace_buf(const char *, ...);

#endif /* HAVEVAM */


/* determine if vsnprintf is available */
#ifndef HAVE_VSNPRINTF
#if ( defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__xlC__) || defined(_MSC_VER) )
#define HAVE_VSNPRINTF 1
#ifdef _MSC_VER
#ifndef vsnprintf
#define vsnprintf _vsnprintf
#endif /* VC define vsnprintf */
#endif /* _MSC_VER */
#endif /* good compilers */
#endif /* HAS_VSNPRINTF */

/* handle a trace command inside a format string
 * return nonzero to quit and enter the normal mode */
static int tracecmd_(const char *cmd, va_list args,
    char **pfname, int *freq, int *verbose)
{
  const char *p;

  if ((p = strchr(cmd, '=')) == NULL)
    return 1;
  if (strncmp(cmd, "filename", 8) == 0) {
    p = va_arg(args, const char *);
    if (p != NULL) {
      sscpy(*pfname, p);
      if (*verbose)
        fprintf(stderr, "The trace file is now %s\n", *pfname);
    }
  } else if (strncmp(cmd, "freq", 4) == 0) {
    *freq = va_arg(args, int);
    if (*verbose)
      fprintf(stderr, "change frequency of writing trace to %d\n", *freq);
  } else if (strncmp(cmd, "verbose", 7) == 0) {
    *verbose = va_arg(args, int);
  } else {
    fprintf(stderr, "unknown command: %s\n", cmd);
    return 1;
  }
  return 0;
}

/* write trace in unbuffered mode
 * note: system may have provided certain buffer */
static int wtrace_unbuf_low_(int cnt, int freq,
    const char *fname, const char *fmt, va_list args)
{
  static FILE *fp = NULL;
  static const char *mode = NULL;

  if (cnt == 0) {
    if ((fp = fopen(fname, mode = (mode ? "a" : "w"))) == NULL) {
      fprintf(stderr, "cannot write file %s, mode %s\n", fname, mode);
      return 1;
    }
  }
  if (fmt != NULL)
    vfprintf(fp, fmt, args);
  if ((cnt + 1) == freq || fmt == NULL)
    fclose(fp);
  return 0;
}

/* write trace in memory-buffered mode */
static int wtrace_buf_low_(int cnt, int freq,
    const char *fname, const char *fmt, va_list args)
{
  const  int   maxmsg = 1024;
  static char *msg = NULL, *buf = NULL;
  int i;

  /* buffered mode */
  if (cnt == 0) { /* allocate msg and buf first */
    msg = ssnew(maxmsg * 2); /* leave some margin */
    buf = ssnew(maxmsg * freq);
  }
  if (msg == NULL || buf == NULL) {
    fprintf(stderr, "no buffer found.\n");
    return 1;
  }
  if (fmt != NULL) {
    if (strlen(fmt) >= (size_t) maxmsg) {
      fprintf(stderr, "the format string is too long.\n");
      return 1;
    }
#ifdef HAVE_VSNPRINTF
    i = vsnprintf(msg, maxmsg, fmt, args);
#else /* we use vsprintf if vsnprintf is unavailable */
    i = vsprintf(msg, fmt, args);
#endif
    if (i >= maxmsg) {
      fprintf(stderr, "the message is too long.\n");
      return 1;
    }
    sscat(buf, msg);
  }

  /* flush buffered content to file, and possibly finish up */
  if ((cnt + 1) % freq == 0 || fmt == NULL) {
    FILE *fp;
    static const char *mode = NULL;

    if (buf && buf[0] != '\0') { /* in case nothing was written */
      if ((fp = fopen(fname, mode = (mode ? "a" : "w"))) == NULL) {
        fprintf(stderr, "cannot write file %s, mode %s\n", fname, mode);
        return 1;
      }
      fputs(buf, fp);
      buf[0] = '\0';
      fclose(fp);
    }
    if (fmt == NULL) { /* finishing up */
      if (msg != NULL) ssdelete(msg);
      if (buf != NULL) ssdelete(buf);
    }
  }
  return 0;
}

/*
 * write trace using
 * unbuffered (flags = 0) or buffered (flags = 1) output
 * but mixing the two modes is a major mistake!
 *
 * pass NULL to `fmt' to finish up
 * In addition, we support simple commands in `fmt'
 * If `fmt' starts with %@, we start the command mode
 * if fmt="%@filename=%s", the next argument is "tr.dat",
 *    then the trace file becomes tr.dat
 * if fmt="%@freq=%d", the next argument is 100,
 *    then we refresh every 100 calls.
 * */
#ifdef HAVEVAM
int wtrace_x(int wtrace_flags_, const char *fmt, ...)
#else
static int wtrace_flags_ = 1;
int wtrace_buf(const char *fmt, ...)
#endif
{
  static int   verbose = 1;
  static int   freq = 1000;
  static int   cnt = 0;
  static char *fname = NULL;
  va_list args;
  int i;

  if (fname == NULL) /* set the default file name */
    fname = ssdup("TRACE");

  /* start the command mode if the format string start with "%@"
   * the command mode allows setting parameters */
  if (fmt != NULL && fmt[0] == '%' && fmt[1] == '@') {
    if (cnt > 0) {
      fprintf(stderr, "trace: changing setting after something\n");
      return -1;
    }
    va_start(args, fmt);
    i = tracecmd_(fmt + 2, args, &fname, &freq, &verbose);
    va_end(args);
    if (i == 0) return 0;
  }

  va_start(args, fmt);
  if (wtrace_flags_ == 0) { /* unbuffered version */
    i = wtrace_unbuf_low_(cnt, freq, fname, fmt, args);
  } else { /* buffered version */
    i = wtrace_buf_low_(cnt, freq, fname, fmt, args);
  }
  va_end(args);

  /* fmt == NULL means finishing up, once = 0 for a fresh start */
  if (fmt == NULL) { /* finishing up */
    cnt  = 0;
    if (fname) ssdelete(fname);
  } else {
    cnt++;
  }
  return i;
}

#endif /* ZCOM_TRACE__ */
#endif /* ZCOM_TRACE */

#ifdef  ZCOM_LOG
#ifndef ZCOM_LOG__
#define ZCOM_LOG__

/*
 * =======================================================================
 *
 * LOG file routines
 *
 * ========================================================================
 */

#include <stdio.h>
#include <stdarg.h>

typedef struct {
  FILE *fp;
  char *fname;
  int flag;
} logfile_t;

#define LOG_WRITESCREEN  0x01
#define LOG_FLUSHAFTER   0x02
#define LOG_NOWRITEFILE  0x10

STRCLS logfile_t *logopen(char *filenm);
STRCLS int logprintf(logfile_t *log, char *fmt, ...);
STRCLS int loghardflush(logfile_t *log);
STRCLS void logclose(logfile_t *log);


logfile_t *logopen(char *filenm)
{
  logfile_t *log;

  if (filenm == NULL) /* assign a default name */
    filenm = "LOG";
  if ((log = calloc(1, sizeof(*log))) == NULL) {
    fprintf(stderr, "cannot allocate memory for log file %s\n", filenm);
    return NULL;
  }
  /* We merely copy the name of the file,
   * the file is not opened until the first logprintf call */
  log->fname = ssdup(filenm);
  log->flag = 0;
  return log;
}

int logprintf(logfile_t *log, char *fmt, ...)
{
  va_list args;

  if (log == NULL) return 1;

  if (log->fp == NULL)
    log->fp = fopen(log->fname, "w");
  if (log->fp == NULL) {
    fprintf(stderr, "log [%s] cannot be opened.\n", log->fname);
    return 1;
  }
  if ((log->flag & LOG_NOWRITEFILE) == 0) {
    va_start(args, fmt);
    vfprintf(log->fp, fmt, args);
    va_end(args);
  }
  if (log->flag & LOG_WRITESCREEN) {
    va_start(args, fmt);
    vprintf(fmt, args);
    va_end(args);
  }
  if (log->flag & LOG_FLUSHAFTER)
    fflush(log->fp);
  return 0;
}

/* close & reopen log file to make sure that stuff is written to disk */
int loghardflush(logfile_t *log)
{
  if (log->fp == NULL || log->fname == NULL)
    return 1;
  fclose(log->fp);
  if ((log->fp = fopen(log->fname, "a")) == NULL) {
    fprintf(stderr, "cannot reopen the log file [%s].\n",
        log->fname);
    return 1;
  }
  return 0;
}

void logclose(logfile_t *log)
{
  if (log == NULL)
    return;
  if (log->fp != NULL) {
    fclose(log->fp);
    log->fp = NULL;
  }
  if (log->fname != NULL) {
    ssdelete(log->fname);
  }
  free(log);
}

#endif /* ZCOM_LOG__ */
#endif /* ZCOM_LOG */

#ifdef  ZCOM_AV
#ifndef ZCOM_AV__
#define ZCOM_AV__
#include <stdio.h>
#include <math.h>

typedef struct {
  double s, sx, sx2; /* sum, sum x, variance */
} av_t;

STRCLS void av_add(av_t *av, double w, double x);
INLINE void av_clear(av_t *av) { av->s = av->sx = av->sx2 = 0; }
INLINE double av_getave(const av_t *av) { return (av && av->s > 0) ? av->sx/av->s : 0; }
INLINE double av_getvar(const av_t *av) { return (av && av->s > 0) ? av->sx2/av->s : 0; }
INLINE double av_getdev(const av_t *av) { return (av && av->s > 0) ? sqrt(av_getvar(av)) : 0; }

/* add a new value to av_t with a weight `w' */
INLINE void av_addw(av_t *av, double x, double w)
{
  double s, sx;

  av->s = (s = av->s) + w;
  av->sx = (sx = av->sx) + x*w;
  if (s <= 0.0) return;
  av->sx2 += (x - sx/s)*(x - av->sx/av->s)*w;
}
#define av_add(av, x) av_addw(av, x, 1)


#endif /* ZCOM_AV__ */
#endif /* ZCOM_AV */

#ifdef  ZCOM_HIST
#ifndef ZCOM_HIST__
#define ZCOM_HIST__


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>

#define HIST_VERBOSE    0x0001

#define HIST_ADDAHALF   0x0010
#define HIST_NOZEROES   0x0020
#define HIST_KEEPLEFT   0x0040
#define HIST_KEEPRIGHT  0x0080
#define HIST_KEEPLEFT2  0x0040
#define HIST_KEEPRIGHT2 0x0080
#define HIST_KEEPEDGE   (HIST_KEEPLEFT|HIST_KEEPRIGHT|HIST_KEEPLEFT2|HIST_KEEPRIGHT2)
#define HIST_KEEPHIST   0x0100
#define HIST_OVERALL    0x0200

#define HIST_ADDITION   0x1000

/* old names */
#define wdist(h,m,n,x0,dx,fn) wdistex(h,m,n,x0,dx,HIST_ADDAHALF|HIST_VERBOSE,fn)
#define wdistex histsave

#define histsave(h,rows,n,xmin,dx,flags,fname) \
  histsavex((const double *)h,rows,n,xmin,dx,flags,NULL,NULL,NULL,fname)

STRCLS int histsavex(const double *h, int rows, int n, double xmin, double dx,
    unsigned flags, int (*fwheader)(FILE *, void *),
    double (*fnorm)(int, int, double, double, void *),
    void *pdata, const char *fname);

#define histload(h,rows,n,xmin,dx,flags,fname) \
  histloadx((double *)h,rows,n,xmin,dx,flags,NULL,NULL,NULL,fname)

STRCLS int histloadx(double *hist, int rows, int n, double xmin, double dx,
    unsigned flags, int (*frheader)(const char *, void *),
    double (*fnorm)(int, int, double, double, void *),
    void *pdata, const char *fn);

STRCLS int histadd(const double *x, double w, double *h, int rows,
    int n, double xmin, double dx, unsigned flags);

/* object oriented wrapper functions */
typedef struct {
  double *arr;
  int rows;
  int n;
  double xmin;
  double dx;
  int (*fwheader)(FILE *, void *);
  int (*frheader)(const char *, void *);
  double (*fnorm)(int, int, double, double, void *);
} hist_t;

#define hs_init(m,x0,x1,dx) hs_initx(m,x0,x1,dx,NULL,NULL,NULL)
#define hs_save(hs,fn,flags) hs_savex(hs,fn,NULL,flags)
#define hs_load(hs,fn,flags) hs_loadx(hs,fn,NULL,flags)

STRCLS hist_t *hs_initx(int rows, double xmin, double xmax, double dx,
    int (*fwh)(FILE *, void *), int (*frh)(const char*, void *),
    double (*fnorm)(int, int, double, double, void *));
STRCLS void hs_free(hist_t *hs);
STRCLS int hs_savex(const hist_t *hs, const char *fname, void *pdata, unsigned flags);
STRCLS int hs_loadx(hist_t *hs, const char *fname, void *pdata, unsigned flags);
STRCLS int hs_add(hist_t *hs, const double *x, double w, unsigned flags);
STRCLS int hs_add1(hist_t *hs, int r, double x, double w, unsigned flags);


/* two dimensional version */
STRCLS int hist2save(const double *h, int rows, int n, double xmin, double dx,
    unsigned flags, const char *fname);
STRCLS int hist2load(double *hist, int rows, int n, double xmin, double dx,
    unsigned flags, const char *fname);
STRCLS int hist2add(const double *xarr, const double *yarr, int skip,
    double w, double *h, int rows,
    int n, double xmin, double dx, unsigned flags);

typedef struct {
  double *arr;
  int rows;
  int n;
  double xmin;
  double dx;
} hist2_t;

STRCLS hist2_t *hs2_init(int rows, double xmin, double xmax, double dx);
STRCLS void hs2_free(hist2_t *hs);
STRCLS int hs2_save(const hist2_t *hs, const char *fname, unsigned flags);
STRCLS int hs2_load(hist2_t *hs, const char *fname, unsigned flags);
STRCLS int hs2_add(hist2_t *hs, const double *x, const double *y, int skip, double w, unsigned flags);
STRCLS int hs2_add1(hist2_t *hs, int r, double x, double y, double w, unsigned flags);

#ifndef xnew
#define xnew(x, n) \
  if ((n) <= 0) { \
    fprintf(stderr, "cannot allocate %d objects for %s\n", (int) (n), #x); \
    exit(1); \
  } else if ((x = calloc(n, sizeof(*(x)))) == NULL) { \
    fprintf(stderr, "no memory for %s x %u\n", #x, (unsigned) (n)); \
    exit(1); }
#endif

/* compute sum, average and standard deviation*/
static double *gethistsums_(const double *h, int rows, int n,
    double xmin, double dx)
{
  double *sums, *xav, *xdv, x, w;
  int i, r;

  xnew(sums, 3*rows);
  xav = sums + rows;
  xdv = xav  + rows;
  for (r = 0; r < rows; r++) {
    sums[r] = xav[r] = xdv[r] = 0.;
    for (i = 0; i < n; i++) {
      x = xmin + (i+.5)*dx;
      w = h[r*n + i];
      sums[r] += w;
      xav[r]  += w*x;
      xdv[r]  += w*x*x;
    }
    if (sums[r] > 1e-5) {
      xav[r] /= sums[r];
      xdv[r] = sqrt(xdv[r]/sums[r] - xav[r]*xav[r]);
    }
  }
  return sums;
}

static double *gethist2sums_(const double *h, int rows, int n,
    double xmin, double dx)
{
  double *sums, *xav, *yav, *xdv, *ydv, x, y, w;
  int i, j, r;

  xnew(sums, 5*rows);
  xav = sums + rows;
  xdv = sums + rows*2;
  yav = sums + rows*3;
  ydv = sums + rows*4;
  for (r = 0; r < rows; r++) {
    sums[r] = xav[r] = xdv[r] = yav[r] = ydv[r] = 0.;
    for (i = 0; i < n; i++) {
      x = xmin + (i+.5)*dx;
      for (j = 0; j < n; j++) {
        y = xmin + (j+.5)*dx;
        w = h[r*n*n + i*n + j];
        sums[r] += w;
        xav[r]  += w*x;
        xdv[r]  += w*x*x;
        yav[r]  += w*y;
        ydv[r]  += w*y*y;
      }
    }
    if (sums[r] > 1e-5) {
      xav[r] /= sums[r];
      xdv[r] = sqrt(xdv[r]/sums[r] - xav[r]*xav[r]);
      yav[r] /= sums[r];
      ydv[r] = sqrt(ydv[r]/sums[r] - yav[r]*yav[r]);
    }
  }
  return sums;
}


/* write histograms to file
 * histogram 'h' contains 'rows' histograms,
 * each contains 'n' entries, from 'xmin' to 'xmin+dx*n'
 * (*fwheader) is function to print additional information
 * (*fnorm) is advanced normalization function */
int histsavex(const double *h, int rows, int n, double xmin, double dx,
    unsigned flags,
    int (*fwheader)(FILE *fp, void *data),
    double (*fnorm)(int r, int ix, double xmin, double dx, void *data),
    void *pdata,
    const char *fn)
{
  const int version = 0;
  const char *filename;
  FILE *fp;
  int i, r, rp, rowp, imax, imin;
  const double *p;
  double sm, *sums, fac, delta;
  double *smtot, *htot = NULL;

  filename = (fn != NULL) ? fn : "HIST";

  if ((fp = fopen(filename, "w")) == NULL) {
    printf("cannot write history file [%s].\n", filename);
    return 1;
  }

  sums = gethistsums_(h, rows, n, xmin, dx);

  /* compute the overall histogram */
  if (flags & HIST_OVERALL) {
    xnew(htot, n);
    for (i = 0; i < n; i++) htot[i] = 0.;

    for (r = 0; r < rows; r++)
      for (i = 0; i < n; i++)
        htot[i] += h[r*n + i];
    smtot = gethistsums_(htot, 1, n, xmin, dx);
    rowp = rows + 1;
  } else {
    rowp = rows;
  }

  /* print basic information */
  fprintf(fp, "# %d 0x%X | %d %d %g %g | ",
      version, flags, rows, n, xmin, dx);
  for (r = 0; r < rows; r++) /* number of visits */
    fprintf(fp, "%g ", sums[r]);
  fprintf(fp, "| ");
  for (r = 0; r < rows; r++) /* average, standard deviation */
    fprintf(fp, "%g %g ", sums[r+rows], sums[r+rows*2]);
  fprintf(fp, "| ");
  if (fwheader != NULL) (*fwheader)(fp, pdata);
  fprintf(fp, "\n");

  delta = (flags & HIST_ADDAHALF) ? 0.5 : 0;

  for (r = 0; r < rowp; r++) {
    p = (r == rows) ? htot : (h+r*n);

    if (flags & HIST_KEEPRIGHT) {
      imax = n;
    } else { /* trim the right edge */
      for (i = n-1; i >= 0; i--)
        if (p[i] > 0)
          break;
      imax = i+1;
      if (imax == 0)
        continue;
    }

    if (flags & HIST_KEEPLEFT) {
      imin = 0;
    } else { /* trim the left edge */
      for (i = 0; i < imax; i++)
        if (p[i] > 0)
          break;
      imin = i;
    }

    sm = (r == rows) ? smtot[0] : sums[r];
    if (fabs(sm) < 1e-6) fac = 1.;
    else fac = 1.0/(sm*dx);

    for (i = imin; i < imax; i++) {
      if ((flags & HIST_NOZEROES) && p[i] < 1e-6)
        continue;
      fprintf(fp,"%g ", xmin+(i+delta)*dx);
      if (flags & HIST_KEEPHIST)
        fprintf(fp, "%20.14E ", p[i]);
      rp = (r == rows) ? (-1) : r;
      if (fnorm != NULL) /* advanced normalization, note the r = -1 case */
        fac = (*fnorm)(rp, i, xmin, dx, pdata);
      fprintf(fp,"%20.14E %d\n", p[i]*fac, rp);
    }
    fprintf(fp,"\n");
  }
  fclose(fp);
  if (flags & HIST_VERBOSE) {
    fprintf(stderr, "successful wrote %s\n", fn);
    for (r = 0; r < rows; r++)
      fprintf(stderr, "%2d cnt: %20.4f av: %10.4f(%10.4f)\n",
          r, sums[r], sums[r+rows], sums[r+rows*2]);
  }
  free(sums);
  if (flags & HIST_OVERALL) {
    free(htot);
    free(smtot);
  }
  return 0;
}

/* skip a | */
static char *skipabar_(char *p)
{
  int next = -1;
  sscanf(p, " | %n", &next);
  return (next < 0) ? NULL : (p + next);
}

/* load a previous histogram
 * (*frheader) function to read additional header info.
 * (*fnorm) normalization factor */
int histloadx(double *hist, int rows, int n, double xmin, double dx,
    unsigned flags,
    int (*frheader)(const char *s, void *data),
    double (*fnorm)(int r, int ix, double xmin, double dx, void *data),
    void *pdata,
    const char *fn)
{
  FILE *fp;
  static char s[40960] = "", *p;
  int verbose = (flags & HIST_VERBOSE);
  int add = (flags & HIST_ADDITION);
  int ver, next, hashist;
  int i, i1, r, r1, nlin = 0;
  unsigned fflags;
  double x, y, y2, fac, delta, *arr, *sums = NULL;

  if ((fp = fopen(fn, "r")) == NULL) {
    fprintf(stderr, "cannot read %s\n", fn);
    return -1;
  }

  /* check the first line */
  if (fgets(s, sizeof s, fp) == NULL || s[0] != '#') {
    fprintf(stderr, "%s: missing the first line\n", fn);
    fclose(fp);
    return -1;
  }
  nlin++;
  if (6 != sscanf(s, " # %d 0x %X | %d%d%lf%lf | %n", &ver, &fflags, &r, &i, &y, &x, &next)
      || i < n || r != rows || fabs(x - dx) > 1e-5) {
    fprintf(stderr, "Error: bins = %d, %d, ng = %d, %d; dx = %g, %g\n",
        i, n, r, rows, x, dx);
    fclose(fp);
    return -1;
  }
  delta   = ((fflags & HIST_ADDAHALF) ? .5 : 0.);
  hashist =  (fflags & HIST_KEEPHIST);
  /* scan sums */
  xnew(sums, rows);
  for (p = s+next, r = 0; r < rows; r++) {
    if (1 != sscanf(p, "%lf%n", sums + r, &next)) {
      fprintf(stderr, "cannot read sums from at %d/%d, s:\n%s\np:\n%s\n", r, rows, s, p);
      goto EXIT;
    }
    p += next;
  }
  if ((p = skipabar_(p)) == NULL) goto EXIT;
  for (r = 0; r < rows; r++) {
    if (2 != sscanf(p, "%lf%lf%n", &y, &y2, &next)) {
      fprintf(stderr, "cannot read average/stddev from at %d/%d, s:\n%s\np:\n%s\n", r, rows, s, p);
      goto EXIT;
    }
    p += next;
  }
  if ((p = skipabar_(p)) == NULL) goto EXIT;
  if (frheader != NULL) {
    if (0 != frheader(p, pdata))
      goto EXIT;
  }


  if (!add) { /* clear histogram */
    for (i = 0; i < rows*n; i++) hist[i] = 0.;
  }

  /* loop over r = 0..rows-1 */
  for (r = 0; r < rows; r++) {
    arr = hist + r*n;
    fac = sums[r]*dx;
    while (fgets(s, sizeof s, fp)) {
      nlin++;
      for (p = s+strlen(s)-1; isspace((unsigned char)(*p)) && p >= s; p--)
        *p = '\0'; /* trim ending */
      if (s[0] == '#' || s[0] == '\0') break;
      if (hashist) {
        if (4 != sscanf(s, "%lf%lf%lf%d", &x, &y, &y2, &r1)) {
          fprintf(stderr, "error on line %d\n", nlin);
          goto EXIT;
        }
      } else { /* simple */
        if (3 != sscanf(s, "%lf%lf%d", &x, &y2, &r1)) {
          fprintf(stderr, "error on line %d\n", nlin);
          goto EXIT;
        }
      }
      if (r1 < 0) break; /* overall histogram */

      if (r1 < r) {
        fprintf(stderr, "wrong column index %d vs. %d on line %d, s=[%s]\n",
            r1, r, nlin, s);
        goto EXIT;
      } else if (r1 > r) {
        r = r1;
        arr = hist + r*n;
        fac = sums[r]*dx;
      }
      i1 = (int)((x - xmin)/dx - delta + .5);
      if (i1 < 0 || i1 >= n) {
        fprintf(stderr, "cannot find index for x = %g, delta = %g, i = %d/%d, on line %d\n",
            x, delta, i1, n, nlin);
        goto EXIT;
      }
      if (!hashist) {
        if (fnorm != NULL) {
          fac = (*fnorm)(r, i1, xmin, dx, pdata);
          fac = ((fabs(fac) < 1e-8) ? 1. : 1./fac);
        }
        y = y2*fac;
      }
      if (add) arr[i1] += y;
      else arr[i1] = y;
    }
  }
  if (verbose)
    fprintf(stderr, "histogram loaded successfully from %s\n", fn);

  if (sums) free(sums);
  return 0;
EXIT:
  fprintf(stderr, "error occurs at file %s, line %d, s:%s\n", fn, nlin, s);
  if (sums) free(sums);
  /* we always clear histogram on error */
  for (i = 0; i < rows*n; i++) hist[i] = 0.;
  return -1;
}

/* add x of weight w, into histogram h
 * return number of success */
int histadd(const double *xarr, double w, double *h, int rows,
    int n, double xmin, double dx, unsigned flags)
{
  int r, ix, good = 0, verbose = flags & HIST_VERBOSE;
  double x;

  for (r = 0; r < rows; r++) {
    x = xarr[r];
    if (x < xmin) {
      if (verbose)
       fprintf(stderr, "histadd underflows %d: %g < %g\n", r, x, xmin);
      continue;
    }
    ix = (int)((x - xmin)/dx);
    if (ix >= n) {
      if (verbose)
        fprintf(stderr, "histadd overflows %d: %g > %g\n", r, x, xmin+dx*n);
      continue;
    }
    h[r*n + ix] += w;
    good++;
  }
  return good;
}

/* OO wrappers */
static void hs_check(const hist_t *hs)
{
  if (hs == NULL) {
    fprintf(stderr, "hist is NULL\n");
    exit(1);
  }
  if (hs->arr == NULL || hs->rows == 0 || hs->n == 0) {
    fprintf(stderr, "hist: arr %p rows %d n %d\n", (void *)(hs->arr), hs->rows, hs->n);
    exit(1);
  }
}

hist_t *hs_initx(int rows, double xmin, double xmax, double dx,
    int (*fwh)(FILE *, void *), int (*frh)(const char*, void *),
    double (*fnorm)(int, int, double, double, void *))
{
  hist_t *hs;

  xnew(hs, 1);
  hs->rows = rows;
  hs->xmin = xmin;
  hs->dx   = dx;
  hs->n = (int)((xmax - xmin)/dx + 0.99999999);
  xnew(hs->arr, hs->n*hs->rows);
  hs->fwheader = fwh;
  hs->frheader = frh;
  hs->fnorm = fnorm;
  return hs;
}

void hs_free(hist_t *hs)
{
  if (hs) {
    if (hs->arr) free(hs->arr);
    memset(hs, 0, sizeof(*hs));
    free(hs);
  }
}

int hs_savex(const hist_t *hs, const char *fn, void *pdata, unsigned flags)
{
  hs_check(hs);
  return histsavex(hs->arr, hs->rows, hs->n, hs->xmin, hs->dx, flags,
      hs->fwheader, hs->fnorm, pdata, fn);
}

int hs_loadx(hist_t *hs, const char *fn, void *pdata, unsigned flags)
{
  hs_check(hs);
  return histloadx(hs->arr, hs->rows, hs->n, hs->xmin, hs->dx, flags,
      hs->frheader, hs->fnorm, pdata, fn);
}

int hs_add(hist_t *hs, const double *x, double w, unsigned flags)
{
  hs_check(hs);
  return histadd(x, w, hs->arr, hs->rows, hs->n, hs->xmin, hs->dx, flags);
}

int hs_add1(hist_t *hs, int r, double x, double w, unsigned flags)
{
  hs_check(hs);
  if (r >= hs->rows || r < 0) {
    fprintf(stderr, "bad row index %d\n", r);
    exit(1);
  }
  return histadd(&x, w, hs->arr + r*hs->n, 1, hs->n, hs->xmin, hs->dx, flags);
}


/* write 'rows' 2d n^2 histograms to file */
int hist2save(const double *h, int rows, int n, double xmin, double dx,
    unsigned flags, const char *fn)
{
  const int version = 0;
  const char *filename;
  FILE *fp;
  int i, j, r, imax, imin, jmax, jmin, n2;
  const double *p;
  double *sums, fac, delta;

  filename = (fn != NULL) ? fn : "HIST2";

  if ((fp = fopen(filename, "w")) == NULL) {
    printf("cannot write history file [%s].\n", filename);
    return 1;
  }

  n2 = n*n;
  sums = gethist2sums_(h, rows, n, xmin, dx);
  /* print basic information */
  fprintf(fp, "# %d 0x%X | %d %d %g %g | ",
      version, flags, rows, n, xmin, dx);
  for (r = 0; r < rows; r++) /* number of visits */
    fprintf(fp, "%g ", sums[r]);
  fprintf(fp, " | ");
  for (r = 0; r < rows; r++) /* averages and standard deviations */
    fprintf(fp, "%g %g %g %g ", sums[r+rows], sums[r+rows*2],
        sums[r+rows*3], sums[r+rows*4]);
  fprintf(fp, "| \n");

  delta = (flags & HIST_ADDAHALF) ? 0.5 : 0;

  for (r = 0; r < rows; r++) {
    p = h+r*n2;

    if (flags & HIST_KEEPRIGHT) {
      imax = n;
    } else { /* trim the right edge of i */
      for (i = n-1; i >= 0; i--) {
        for (j = 0; j < n; j++)
          if (p[i*n + j] > 0) break;
        if (j < n) break;
      }
      imax = i+1;
      if (imax == 0)
        continue;
    }

    if (flags & HIST_KEEPLEFT) {
      imin = 0;
    } else { /* trim the left edge of i */
      for (i = 0; i < imax; i++) {
        for (j = 0; j < n; j++)
          if (p[i*n + j] > 0) break;
        if (j < n) break;
      }
      imin = i;
    }

    if (flags & HIST_KEEPRIGHT2) {
      jmax = n;
    } else { /* trim the right edge of j */
      for (j = n-1; j >= 0; j--) {
        for (i = imin; i < imax; i++)
          if (p[i*n + j] > 0) break;
        if (i < imax) break;
      }
      jmax = j+1;
    }

    if (flags & HIST_KEEPLEFT2) {
      jmin = 0;
    } else { /* trim the left edge of j */
      for (j = 0; j < jmax; j++) {
        for (i = imin; i < imax; i++)
          if (p[i*n + j] > 0) break;
        if (i < imax) break;
      }
      jmin = j;
    }

    if (fabs(sums[r]) < 1e-6) fac = 1.;
    else fac = 1.0/(sums[r]*dx*dx);

    for (i = imin; i < imax; i++) {
      for (j = jmin; j < jmax; j++) {
        double x, y;
        if ((flags & HIST_NOZEROES) && p[i] < 1e-6)
          continue;
        x = xmin + (i+delta)*dx;
        y = xmin + (j+delta)*dx;
        fprintf(fp,"%g %g ", x, y);
        if (flags & HIST_KEEPHIST)
          fprintf(fp, "%20.14E ", p[i*n+j]);
        fprintf(fp,"%20.14E %d\n", p[i*n+j]*fac, r);
      }
      fprintf(fp,"\n");
    }
    fprintf(fp, "\n#\n");
  }
  fclose(fp);
  if (flags & HIST_VERBOSE) {
    fprintf(stderr, "successful wrote %s\n", fn);
    for (r = 0; r < rows; r++)
      fprintf(stderr, "%2d cnt: %20.4f xav: %10.4f(%10.4f) yav: %10.4f(%10.4f)\n",
          r, sums[r], sums[r+rows], sums[r+rows*2], sums[r+rows*3], sums[r+rows*4]);
  }
  free(sums);
  return 0;
}

int hist2load(double *hist, int rows, int n, double xmin, double dx,
    unsigned flags, const char *fn)
{
  FILE *fp;
  static char s[40960] = "", *p;
  int verbose = (flags & HIST_VERBOSE);
  int add = (flags & HIST_ADDITION);
  int ver, next, hashist;
  int i, j, r, r1, n2, nlin = 0;
  unsigned fflags;
  double x, y, g, g2, fac, delta, *arr, *sums = NULL;

  if ((fp = fopen(fn, "r")) == NULL) {
    fprintf(stderr, "cannot read %s\n", fn);
    return -1;
  }

  n2 = n*n;
  /* check the first line */
  if (fgets(s, sizeof s, fp) == NULL || s[0] != '#') {
    fprintf(stderr, "%s: missing the first line\n", fn);
    fclose(fp);
    return -1;
  }
  nlin++;
  if (6 != sscanf(s, " # %d 0x %X | %d%d%lf%lf | %n", &ver, &fflags, &r, &i, &y, &x, &next)
      || i < n || r != rows || fabs(x - dx) > 1e-5) {
    fprintf(stderr, "Error: bins = %d, %d, ng = %d, %d; dx = %g, %g\n",
        i, n, r, rows, x, dx);
    fclose(fp);
    return -1;
  }
  delta   = ((fflags & HIST_ADDAHALF) ? .5 : 0.);
  hashist =  (fflags & HIST_KEEPHIST);
  /* scan sums */
  xnew(sums, rows);
  for (p = s+next, r = 0; r < rows; r++) {
    if (1 != sscanf(p, "%lf%n", sums + r, &next)) {
      fprintf(stderr, "cannot read sums from at %d/%d, s:\n%s\np:\n%s\n", r, rows, s, p);
      goto EXIT;
    }
    p += next;
  }
  if ((p = skipabar_(p)) == NULL) goto EXIT;
  for (r = 0; r < rows; r++) {
    if (4 != sscanf(p, "%lf%lf%lf%lf%n", &x, &y, &g, &g2, &next)) {
      fprintf(stderr, "cannot read average/stddev from at %d/%d, s:\n%s\np:\n%s\n", r, rows, s, p);
      goto EXIT;
    }
    p += next;
  }
  if ((p = skipabar_(p)) == NULL) goto EXIT;

  if (!add) { /* clear histogram */
    for (i = 0; i < rows*n2; i++) hist[i] = 0.;
  }

  /* loop over r = 0..rows-1 */
  for (r = 0; r < rows; r++) {
    arr = hist + r*n2;
    fac = sums[r]*(dx*dx);
    while (fgets(s, sizeof s, fp)) {
      nlin++;
      for (p = s+strlen(s)-1; isspace((unsigned char)(*p)) && p >= s; p--)
        *p = '\0'; /* trim ending */
      if (s[0] == '#') break;
      if (s[0] == '\0') continue;

      if (hashist) {
        if (5 != sscanf(s, "%lf%lf%lf%lf%d", &x, &y, &g, &g2, &r1)) {
          fprintf(stderr, "error on line %d\n", nlin);
          goto EXIT;
        }
      } else {
        if (4 != sscanf(s, "%lf%lf%lf%d", &x, &y, &g2, &r1)) {
          fprintf(stderr, "error on line %d\n", nlin);
          goto EXIT;
        }
      }
      if (r1 != r) {
        fprintf(stderr, "wrong column index %d vs. %d on line %d\n",
          r1, r, nlin);
        goto EXIT;
      }
      i = (int)((x - xmin)/dx - delta + .5);
      if (i < 0 || i >= n) {
        fprintf(stderr, "cannot find index for x = %g\n", x);
        goto EXIT;
      }
      j = (int)((y - xmin)/dx - delta + .5);
      if (j < 0 || j >= n) {
        fprintf(stderr, "cannot find index for y = %g\n", y);
        return -1;
      }
      if (!hashist) {
        g = g2*fac;
      }
      if (add) arr[i*n+j] += g;
      else arr[i*n+j] = g;
    }
  }
  if (verbose) fprintf(stderr, "%s loaded successfully\n", fn);
  return 0;
EXIT:
  fprintf(stderr, "error occurs at file %s, line %d, s:%s\n", fn, nlin, s);
  if (sums) free(sums);
  for (i = 0; i < rows*n2; i++) hist[i] = 0.;
  return -1;
}

/* add (xarr[skip*r], yarr[skip*r]) of weight w, into histogram h
 * return number of success */
int hist2add(const double *xarr, const double *yarr, int skip,
    double w, double *h, int rows,
    int n, double xmin, double dx, unsigned flags)
{
  int r, ix, iy, good = 0, verbose = flags & HIST_VERBOSE;
  double x, y;

  for (r = 0; r < rows; r++) {
    x = xarr[skip*r];
    y = yarr[skip*r];
    if (x < xmin || y < xmin) {
      if (verbose)
       fprintf(stderr, "histadd underflows %d: %g or %g < %g\n", r, x, y, xmin);
      continue;
    }
    ix = (int)((x - xmin)/dx);
    iy = (int)((y - xmin)/dx);
    if (ix >= n || iy >= n) {
      if (verbose)
        fprintf(stderr, "histadd overflows %d: %g or %g > %g\n", r, x, y, xmin+dx*n);
      continue;
    }
    h[r*n*n + ix*n+iy] += w;
    good++;
  }
  return good;
}

static void hs2_check(const hist2_t *hs)
{
  if (hs == NULL) {
    fprintf(stderr, "hist2 is NULL\n");
    exit(1);
  }
  if (hs->arr == NULL || hs->rows == 0 || hs->n == 0) {
    fprintf(stderr, "hist2: arr %p rows %d n %d\n", (void *)(hs->arr), hs->rows, hs->n);
    exit(1);
  }
}

hist2_t *hs2_init(int rows, double xmin, double xmax, double dx)
{
  hist2_t *hs2;

  xnew(hs2, 1);
  hs2->rows = rows;
  hs2->xmin = xmin;
  hs2->dx   = dx;
  hs2->n = (int)((xmax - xmin)/dx + 0.99999999);
  xnew(hs2->arr, hs2->n*hs2->n*hs2->rows);
  return hs2;
}

void hs2_free(hist2_t *hs2)
{
  if (hs2) {
    if (hs2->arr) free(hs2->arr);
    memset(hs2, 0, sizeof(*hs2));
    free(hs2);
  }
}

int hs2_save(const hist2_t *hs, const char *fn, unsigned flags)
{
  hs2_check(hs);
  return hist2save(hs->arr, hs->rows, hs->n, hs->xmin, hs->dx,
      flags, fn);
}

int hs2_load(hist2_t *hs, const char *fn, unsigned flags)
{
  hs2_check(hs);
  return hist2load(hs->arr, hs->rows, hs->n, hs->xmin, hs->dx,
      flags, fn);
}

int hs2_add(hist2_t *hs, const double *x, const double *y, int skip, double w, unsigned flags)
{
  hs2_check(hs);
  return hist2add(x, y, skip, w, hs->arr, hs->rows, hs->n, hs->xmin, hs->dx, flags);
}

int hs2_add1(hist2_t *hs, int r, double x, double y, double w, unsigned flags)
{
  hs2_check(hs);
  return hist2add(&x, &y, 1, w, hs->arr+r*hs->n*hs->n, 1, hs->n, hs->xmin, hs->dx, flags);
}

#endif /* ZCOM_HIST__ */
#endif /* ZCOM_HIST */

#ifdef  ZCOM_MDS
#ifndef ZCOM_MDS__
#define ZCOM_MDS__

/* multidimensional scaling */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

STRCLS real mds_min0(real *x, real *dm, int n, int dim, double tol);


/* compute force and energy */
static real mds_force(real *f, real *x, real *dm, int n, int dim)
{
  const real dmin = 1e-6f;
  int i, j, k;
  real ene = 0., dij, dref, dsc;
  real *dx, *xi, *xj, *fi, *fj;

  xnew(dx, dim);
  for (i = 0; i < n*dim; i++) f[i] = 0.;
  for (i = 0; i < n; i++) {
    xi = x + i*dim;
    fi = f + i*dim;
    for (j = i+1; j < n; j++) {
      xj = x + j*dim;
      fj = f + j*dim;
      for (dij = 0, k = 0; k < dim; k++) {
        dx[k] = xi[k] - xj[k];
        dij += dx[k]*dx[k];
      }
      dij = (real) sqrt(dij);
      dref = dm[i*n+j];
      if (dref < dmin) dref = dmin;
      dsc = dij/dref - 1;
      ene += dsc*dsc;
      /* dij is to be used in denominator in the loop, ensure its > 0 */
      if (dij < dmin) dij = dmin;
      for (k = 0; k < dim; k++) {
        dx[k] *= -(dij - dref)/(dref*dref*dij);
        fi[k] += dx[k];
        fj[k] -= dx[k];
      }
    }
  }
  free(dx);
  return ene;
}

/* make coordinates neat
 * center coordinates
 * rotate to principle coordinates */
static void mds_trim(real *x, int n, int dim)
{
  real *av, *mmt, *eig, *vec, *xi, *b;
  int i, d, d2;

  /* center the graph */
  xnew(av, dim);
  for (i = 0; i < n; i++)
    for (d = 0; d < dim; d++)
      av[d] += x[i*dim + d];
  for (d = 0; d < dim; d++) av[d] /= n;
  for (i = 0; i < n; i++)
    for (d = 0; d < dim; d++)
      x[i*dim + d] -= av[d];
  free(av);

  /* compute moments */
  xnew(mmt, dim*dim);
  xnew(eig, dim);
  xnew(vec, dim*dim);
  xnew(b, dim);
  for (i = 0; i < n; i++) {
    for (d = 0; d < dim; d++)
      for (d2 = 0; d2 < dim; d2++)
        mmt[d*dim+d2] += x[i*dim+d]*x[i*dim+d2];
  }
  eigsym(mmt, eig, vec, dim);
  for (i = 0; i < n; i++) {
    /* rotate x[i*dim .. i*dim+d-1] --> b[0 .. d-1] */
    xi = x + i*dim;
    for (d = 0; d < dim; d++) /* b = xi.vec */
      for (b[d] = 0., d2 = 0; d2 < dim; d2++)
        b[d] += xi[d2]*vec[d2*dim+d];
    for (d = 0; d < dim; d++) xi[d] = b[d];
  }
  free(b);
  free(eig);
  free(vec);
  free(mmt);
}

/* multidimensional scaling - steepest descend
 * given a distance matrix dm[n x n],
 * return best mds position x[n x dim];
 * dim is the target dimensional, e.g. 2
 * return the total discrepancy */
real mds_min0(real *x, real *dm, int n, int dim, double tol)
{
  int i, j, it, itermax = 100000, npr;
  real *f, *fp, *xp, ene, enep;
  real dt = 1e-1f;

  if (n == 1) {
    for (j = 0; j < dim; j++) x[j] = 0.;
    return 0.0;
  }
  npr = n*(n-1)/2;
  xnew(f, n*dim);
  xnew(xp, n*dim);
  xnew(fp, n*dim);
  for (i = 0; i < n*dim; i++)
    x[i] = 1.f*rand()/RAND_MAX;
  ene = mds_force(f, x, dm, n, dim);
  for (it = 0; it < itermax; it++) {
    enep = ene;
    for (i = 0; i < n*dim; i++) { /* backup */
      xp[i] = x[i];
      fp[i] = f[i];
    }
    for (i = 0; i < n*dim; i++) x[i] += f[i]*dt;
    ene = mds_force(f, x, dm, n, dim);
    if (ene > enep) {
      dt *= 0.7f;
      for (i = 0; i < n*dim; i++) { /* recover */
        x[i] = xp[i];
        f[i] = fp[i];
      }
    } else {
      if (fabs(ene-enep) < tol*npr*dt) break;
      dt *= 1.03f; /* attempt to increase time step */
    }
  }
  if (it >= itermax) {
    fprintf(stderr, "mds: failed to converge after %d iterations, %g\n",
        it, fabs(ene-enep));
  }
  mds_trim(x, n, dim);
  free(xp);
  free(f);
  free(fp);
  return ene;
}

#endif /* ZCOM_MDS__ */
#endif /* ZCOM_MDS */

#ifdef  ZCOM_PDB
#ifndef ZCOM_PDB__
#define ZCOM_PDB__
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>

typedef struct {
  int aid; /* atom index */
  int rid; /* residue index */
  int insert;
  char atnm[8];
  char resnm[8];
  char elem[4];
  real *x; /* pointer to pdbmodel_t.x */
} pdbatom_t; /* a single atom entry */

typedef struct {
  int natm; /* # of lines == # of atoms */
  int nalloc;
  int nres;
  pdbatom_t *atm; /* array of n or nalloc */
  real (*x)[3];  /* coordinate array of n or nalloc */
  const char *file;
} pdbmodel_t; /* raw data in a pdb model */


typedef struct {
  int aa;  /* index of amino acid [0, 20) */
  int nat; /* number of atoms */
  int id[32]; /* indices to the coordinate array */
  unsigned long flags;
  real *xca, *xn, *xc;
} pdbaar_t; /* amino-acid residues */

typedef struct {
  int nres;
  int natm;
  pdbaar_t *res; /* array of nres */
  real (*x)[3]; /* array of natom */
  const char *file; /* input file */
} pdbaac_t; /* amino-acid chain */

/* generic pdb model */
STRCLS pdbmodel_t *pdbm_read(const char *fname, int verbose);
STRCLS int pdbm_write(pdbmodel_t *m, const char *fn);
#define pdbm_free(m) { free(m->atm); free(m->x); free(m); }

enum { PDB_CONTACT_CA, PDB_CONTACT_HEAVY, PDB_CONTACT_ALL }; /* ways of searching contacts */
STRCLS int *pdbm_contact(pdbmodel_t *pm, double rc, int level, int nearby, int dbg);

/* protein pdb */
STRCLS pdbaac_t *pdbaac_parse(pdbmodel_t *m, int verbose);
#define pdbaac_free(c) { free(c->res); free(c->x); free(c); }
#define pdbaac_x(c, i, nm) pdbaac_getx(c, i, #nm)

#define AA_BACKBONE 0x7
#define AA_H2   29
#define AA_H3   30
#define AA_OXT  31

/* don't edit data in the structure, written by mkdb.py */
struct tag_pdb_aadb {
  const char *resnm;      /* residue name */
  const char *atom[25];   /* atoms */
  const char *sub[11];    /* substitutions */
  unsigned long hvflags;  /* backbone and heavy atom flags */
} pdb_aadb[20] = {
{"GLY", {"CA", "N", "C", "O", "HA1", "HA2", "H", NULL}, {"HA3", "HA1", NULL}, 0xful},
{"ALA", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "HB3", "HA", "H", NULL}, {NULL}, 0x1ful},
{"VAL", {"CA", "N", "C", "O", "CB", "HB", "CG1", "HG11", "HG12", "HG13", "CG2", "HG21", "HG22", "HG23", "HA", "H", NULL}, {NULL}, 0x45ful},
{"LEU", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "HG", "CD1", "HD11", "HD12", "HD13", "CD2", "HD21", "HD22", "HD23", "HA", "H", NULL}, {"HB3", "HB1", NULL}, 0x229ful},
{"ILE", {"CA", "N", "C", "O", "CB", "HB", "CG2", "HG21", "HG22", "HG23", "CG1", "HG11", "HG12", "CD", "HD1", "HD2", "HD3", "HA", "H", NULL}, {"HG13", "HG11", "CD1", "CD", "HD11", "HD1", "HD12", "HD2", "HD13", "HD3", NULL}, 0x245ful},
{"PRO", {"CA", "N", "C", "O", "CD", "HD1", "HD2", "CG", "HG1", "HG2", "CB", "HB1", "HB2", "HA", NULL}, {"HB3", "HB1", "HG3", "HG1", "HD3", "HD1", NULL}, 0x49ful},
{"SER", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "OG", "HG", "HA", "H", NULL}, {"HB3", "HB1", NULL}, 0x9ful},
{"THR", {"CA", "N", "C", "O", "CB", "HB", "CG2", "HG21", "HG22", "HG23", "OG1", "HG1", "HA", "H", NULL}, {NULL}, 0x45ful},
{"CYS", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "SG", "HG", "HA", "H", NULL}, {"HB3", "HB1", NULL}, 0x9ful},
{"MET", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "HG1", "HG2", "SD", "CE", "HE1", "HE2", "HE3", "HA", "H", NULL}, {"HB3", "HB1", "HG3", "HG1", NULL}, 0xc9ful},
{"ASN", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "OD1", "ND2", "HD21", "HD22", "HA", "H", NULL}, {"HB3", "HB1", NULL}, 0x39ful},
{"GLN", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "HG1", "HG2", "CD", "OE1", "NE2", "HE21", "HE22", "HA", "H", NULL}, {"HB3", "HB1", "HG3", "HG1", NULL}, 0x1c9ful},
{"ASP", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "OD1", "OD2", "HA", "H", NULL}, {"HB3", "HB1", NULL}, 0x39ful},
{"GLU", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "HG1", "HG2", "CD", "OE1", "OE2", "HA", "H", NULL}, {"HB3", "HB1", "HG3", "HG1", NULL}, 0x1c9ful},
{"LYS", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "HG1", "HG2", "CD", "HD1", "HD2", "CE", "HE1", "HE2", "NZ", "HZ1", "HZ2", "HZ3", "HA", "H", NULL}, {"HB3", "HB1", "HG3", "HG1", "HD3", "HD1", "HE3", "HE1", NULL}, 0x1249ful},
{"ARG", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "HG1", "HG2", "CD", "HD1", "HD2", "NE", "HE", "CZ", "NH1", "HH11", "HH12", "NH2", "HH21", "HH22", "HA", "H", NULL}, {"HB3", "HB1", "HG3", "HG1", "HD3", "HD1", NULL}, 0x9a49ful},
{"HIS", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "ND1", "HD1", "CE1", "HE1", "NE2", "HE2", "CD2", "HD2", "HA", "H", NULL}, {"HB3", "HB1", NULL}, 0x559ful},
{"PHE", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "CD1", "HD1", "CE1", "HE1", "CZ", "HZ", "CE2", "HE2", "CD2", "HD2", "HA", "H", NULL}, {"HB3", "HB1", NULL}, 0x1559ful},
{"TYR", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "CD1", "HD1", "CE1", "HE1", "CZ", "OH", "HH", "CE2", "HE2", "CD2", "HD2", "HA", "H", NULL}, {"HB3", "HB1", NULL}, 0x2b59ful},
{"TRP", {"CA", "N", "C", "O", "CB", "HB1", "HB2", "CG", "CD1", "HD1", "NE1", "HE1", "CE2", "CZ2", "HZ2", "CH2", "HH2", "CZ3", "HZ3", "CE3", "HE3", "CD2", "HA", "H", NULL}, {"HB3", "HB1", NULL}, 0x2ab59ful}};


INLINE int pdbaaidx(const char *res)
{
  int i;
  for (i = 0; i < 20; i++)
    if (strcmp(res, pdb_aadb[i].resnm) == 0)
      return i;
  return -1;
}

INLINE const char *pdbaaname(int i)
{
  die_if (i < 0 || i >= 20, "invalid amino acid id %d\n", i);
  return pdb_aadb[i].resnm;
}

/* return the index of an atom from */
INLINE int pdbaagetaid(int aa, const char *atnm)
{
  int k;
  for (k = 0; pdb_aadb[aa].atom[k]; k++)
    if (strcmp(pdb_aadb[aa].atom[k], atnm) == 0) return k;
  return -1;
}

/* return the global atom index */
INLINE int pdbaar_getaid(pdbaar_t *r, const char *atnm)
  { int topid = pdbaagetaid(r->aa, atnm); return (topid < 0) ? -1 : r->id[topid]; }
INLINE int pdbaac_getaid(pdbaac_t *c, int i, const char *atnm)
  { return pdbaar_getaid(c->res + i, atnm); }
/* return coordinates */
INLINE real *pdbaac_getx(pdbaac_t *c, int i, const char *atnm)
  { int id = pdbaac_getaid(c, i, atnm); return (id < 0) ? NULL : c->x[id]; }

/* format atom name, out could be equal to atnm
 * style 0: atom name first, e.g., "HE21", "CB", or style 1: "1HE2" or " CB " */
INLINE char *pdbm_fmtatom(char *out, const char *inp, int style)
{
  size_t n, i;
  char c, cn, atnm[5];
  const char *p;

  die_if (style > 2 || style < 0, "bad format style %d\n", style);

  /* copy inp to a buffer without space */
  for (p = inp; *p == ' '; p++) ;
  for (n = 0; n < 4 && p[n] && p[n] != ' '; n++) atnm[n] = p[n];
  atnm[n] = '\0';
  die_if (n == 4 && p[n] && p[n] != ' ', "bad input atom name [%s]\n", atnm);

  if (style <= 1) { /* style 0: "H", "CA", "HE21", style 1: " H", " CA", "HE21" */
    if (n == 4) {
      c = atnm[0];
      if (isdigit(c)) { /* rotate the string, such that 1HE2 --> HE21; */
        for (i = 1; i < n; i++) out[i-1] = atnm[i];
        out[n-1] = c;
        out[n] = '\0';
      } else strcpy(out, atnm);
    } else { /* n <= 3 */
      if (style == 0) strcpy(out, atnm);
      else { out[0] = ' '; strcpy(out+1, atnm); }
    }
  } else if (style == 2) { /* style 2: " H", " CA", "1HE2" */
    if (n == 4) {
      c = atnm[0];
      cn = atnm[n - 1];
      if (isalpha(c) && isdigit(cn)) { /* HE21 --> 1HE2 */
        for (i = n-1; i > 0; i--) out[i] = atnm[i-1];
        out[0] = cn;
        out[n] = '\0';
      } else strcpy(out, atnm);
    } else { /* n <= 3 */
      out[0] = ' '; strcpy(out+1, atnm);
    }
  }
  return out;
}


/* read raw atom data from pdb */
pdbmodel_t *pdbm_read(const char *fname, int verbose)
{
  const int BSIZ = 256;
  FILE *fp;
  pdbmodel_t *m;
  pdbatom_t *atm;
  int i, j, ir, iro;
  char s[256], resnm[8] = "";
  float x[3];

  xfopen(fp, fname, "r", return NULL);
  xnew(m, 1);
  m->natm = 0;
  m->nalloc = BSIZ;
  m->file = fname;
  xnew(m->atm, m->nalloc);
  xnew(m->x,   m->nalloc);
  /* read through pdb */
  ir = -1;
  while (fgets(s, sizeof s, fp)) {
    if (strncmp(s, "TER", 3) == 0 ||
        strncmp(s, "ENDMDL", 6) == 0 ||
        strncmp(s, "END", 3) == 0)
      break;
    if (strncmp(s, "ATOM ", 5) != 0)
      continue;
    if (s[16] != ' ' && s[16] != 'A') /* discard alternative position */
      continue;
    i = m->natm;
    if (i >= m->nalloc) {
      m->nalloc += BSIZ;
      xrenew(m->atm, m->nalloc);
      xrenew(m->x,   m->nalloc);
    }
    atm = m->atm + i;
    atm->aid = i;
    atm->insert = s[26];

    /* atom name */
    strcnv(atm->atnm, s+12, 4, ZSTR_COPY|ZSTR_XSPACE);
    pdbm_fmtatom(atm->atnm, atm->atnm, 0);
    /* residue name */
    strcnv(atm->resnm, s+17, 3, ZSTR_COPY|ZSTR_XSPACE);
    /* residue number */
    sscanf(s+22, "%d", &(atm->rid));
    if (ir == atm->rid && resnm[0] && strcmp(atm->resnm, resnm) != 0) {
      fprintf(stderr, "atom %d, %s, residue %d conflicts %s --> %s, file %s\n",
          i, atm->atnm, ir, resnm, atm->resnm, fname);
    }
    strcpy(resnm, atm->resnm);
    ir = atm->rid;
    /* coordinates */
    sscanf(s+30, "%f%f%f", x, x+1, x+2);
    rv3_make(m->x[i], x[0], x[1], x[2]);
    /* element name */
    if (strlen(s) >= 78) {
      strcnv(atm->elem, s+76, 2, ZSTR_COPY|ZSTR_XSPACE);
      if (atm->elem[0] == '\0') { /* guess */
        atm->elem[0] = atm->atnm[0];
        atm->elem[1] = '\0';
      }
    }
    m->natm++;
  }
  for (i = 0; i < m->natm; i++) /* set atom x */
    m->atm[i].x = m->x[i];
  if (verbose)
    printf("%s has %d residues\n", fname, m->atm[m->natm-1].rid);
  /* offset the residue id */
  for (ir = 0, i = 0; i < m->natm; ir++) {
    iro = m->atm[i].rid;
    for (j = i; j < m->natm && m->atm[j].rid == iro &&
        m->atm[j].insert == m->atm[i].insert &&
        strcmp(m->atm[j].resnm, m->atm[i].resnm) == 0; j++) {
      m->atm[j].rid = ir;
    }
    if (verbose >= 2)
      printf("atoms %d to %d --> residue %s, %d (%d)\n",
        i+1, j, m->atm[i].resnm, iro, ir+1);
    i = j;
  }
  m->nres = ir;
  if (verbose >= 3) {
    for (i = 0; i < m->natm; i++) {
      atm = m->atm + i;
      printf("%4d %4s %4d %4s %8.3f %8.3f %8.3f\n",
          atm->aid+1, atm->atnm, atm->rid+1, atm->resnm,
          atm->x[0], atm->x[1], atm->x[2]);
    }
  }
  fclose(fp);
  return m;
}

/* write data to file fn */
int pdbm_write(pdbmodel_t *m, const char *fn)
{
  int i, aid, ATMFMT = 1;
  char atnm[8] = "";
  FILE *fp;
  pdbatom_t *atm;
  real *x;

  if (m->natm <= 0) return 1;
  xfopen(fp, fn, "w", return 1);
  for (aid = 1, i = 0; i < m->natm; i++) {
    atm = m->atm + i;
    pdbm_fmtatom(atnm, atm->atnm, ATMFMT);
    x = m->x[i];
    fprintf(fp, "ATOM  %5d %-4s %-4sA%4d    %8.3f%8.3f%8.3f  1.00  0.00          %2s  \n",
        aid++, atnm, atm->resnm, atm->rid+1, x[0], x[1], x[2], atm->elem);
  }
  fprintf(fp, "TER   %5d      %-4sA%4d%54s\n", m->natm+1, atm->resnm, atm->rid+1, "");
  fprintf(fp, "END%77s\n", " ");
  fclose(fp);
  return 0;
}

INLINE int iscontactatom(int level, const char *atnm)
{
  if (level == PDB_CONTACT_ALL) return 1;
  else if (level == PDB_CONTACT_HEAVY) return atnm[0] != 'H';
  else return strcmp(atnm, "CA") == 0; /* PDB_CONTACT_CA */
}

/* return a nres x nres array to indicate if two residues form a contact
 * it is a contact only if the distance of any two atoms from the two residues
 * is less than rc,
 * 'level' : one of the PDB_CONTACT_XX values above.
 * 'nearby': # of adjacent resdiues to be excluded from the list
 * */
int *pdbm_contact(pdbmodel_t *pm, double rc, int level, int nearby, int dbg)
{
  int ir, jr, i, j, im, jm, ica, jca, n = pm->natm, nres = pm->nres, ct;
  pdbatom_t *atm = pm->atm;
  real d, dmin, dca;
  int *ds;

  xnew(ds, nres*nres);
  for (ir = 0; ir < nres; ir++) {
    for (jr = ir+1; jr < nres; jr++) {
      /* compute the minimal distance between ir and jr */
      dmin = 1e9; dca = 0; im = jm = -1;
      for (i = 0; i < n; i++) {
        if (atm[i].rid != ir) continue;
        if (!iscontactatom(level, atm[i].atnm)) continue;
        ica = iscontactatom(PDB_CONTACT_CA, atm[i].atnm);
        for (j = 0; j < n; j++) {
          if (atm[j].rid != jr) continue;
          if (!iscontactatom(level, atm[j].atnm)) continue;
          jca = iscontactatom(PDB_CONTACT_CA, atm[j].atnm);
          d = rv3_dist(atm[i].x, atm[j].x);
          if (d < dmin) { dmin = d; im = i; jm = j; }
          if (ica && jca) dca = d; /* CA distance */
        }
      }
      ds[ir*nres+jr] = ds[jr*nres+ir] = ct = (dmin < rc) ? 1 : 0;
      if (dbg > 1 || (dbg && ct))/* print decision */
        printf("[%d] %s%-3d and %s%-3d: dca %6.3fA dmin %6.3fA (%s:%d, %s:%d)\n",
          ct, atm[im].resnm, ir+1, atm[jm].resnm, jr+1, dca, dmin,
          atm[im].atnm, im+1, atm[jm].atnm, jm+1);
    }
  }

  /* exclude nearby residues */
  for (ir = 0; ir < nres; ir++)
    for (jr = ir+1; jr <= ir+nearby && jr < nres; jr++)
      ds[ir*nres + jr] = ds[jr*nres + ir] = 0;
  return ds;
}

/* get amino acid chain by parsing pdbmodel_t m */
pdbaac_t *pdbaac_parse(pdbmodel_t *m, int verbose)
{
  pdbatom_t *atm;
  pdbaac_t *c;
  pdbaar_t *r;
  int i, k, match;
  unsigned long hvflags;

  xnew(c, 1);
  c->nres = m->nres;
  c->natm = m->natm;
  xnew(c->res, c->nres);
  xnew(c->x, m->natm);
  memcpy(c->x, m->x, sizeof(*(c->x))*m->natm); /* copy coordinates */
  c->file = m->file;

  for (i = 0; i < m->natm; i++) {
    atm = m->atm + i;
    r = c->res + atm->rid;
    r->aa = pdbaaidx(atm->resnm);
    if (r->aa < 0) {
      fprintf(stderr, "unknown amino acid residue %d/%d[%s]\n",
          atm->rid, m->nres, atm->resnm);
      goto ERR;
    }

    /* match index */
    match = 0;
    for (k = 0; pdb_aadb[r->aa].atom[k] != NULL; k++)
      if (strcmp(atm->atnm, pdb_aadb[r->aa].atom[k]) == 0) {
        r->id[k] = i;
        r->flags |= 1ul << k;
        match = 1;
        break;
      }
    if (!match) { /* check terminals */
#define AAMAPIT_(lead, str, nm) lead (strcmp(atm->atnm, str) == 0) \
    { int aid = pdbaagetaid(r->aa, #nm); r->id[aid] = i; r->flags |= 1ul << (unsigned long) aid; match = 1; }
#define AAMATCH_(lead, str, nm) lead (strcmp(atm->atnm, str) == 0) \
    { r->id[AA_ ## nm] = i; r->flags |= 1ul << AA_##nm; match = 1; }
      AAMAPIT_(if, "H1", H)
      AAMATCH_(else if, "H2",   H2)
      AAMATCH_(else if, "H3",   H3)
      AAMATCH_(else if, "OXT",  OXT)
      AAMAPIT_(else if, "OC1",  O)
      AAMATCH_(else if, "OC2",  OXT)
      AAMAPIT_(else if, "O1",   O)
      AAMATCH_(else if, "O2",   OXT)
      else { /* check substitutions */
        for (k = 0; pdb_aadb[r->aa].sub[k] != NULL; k += 2)
          if (strcmp(atm->atnm, pdb_aadb[r->aa].sub[k]) == 0) {
            int aid = pdbaagetaid(r->aa, pdb_aadb[r->aa].sub[k+1]);
            r->id[aid] = i;
            r->flags |= 1ul << (unsigned) aid;
            match = 1;
            break;
          }
      }
    }
    if (!match)
      printf("unknown atom %s:%d res %s%d\n", atm->atnm, i+1, atm->resnm, atm->rid+1);
  }

#define pdbaac_pmiss_(xflags) { \
  unsigned long miss = (r->flags ^ xflags) & xflags; \
  fprintf(stderr, "file %s, residue %s%d misses atom(s): ", c->file, pdbaaname(r->aa), i+1); \
  for (k = 0; pdb_aadb[r->aa].atom[k] != NULL; k++) { \
    if (miss & (1ul << k)) fprintf(stderr, "%s ", pdb_aadb[r->aa].atom[k]); } \
  fprintf(stderr, "\n"); }

  /* checking integrity */
  for (i = 0; i < c->nres; i++) {
    r = c->res + i;
    hvflags = pdb_aadb[r->aa].hvflags;
    if ((r->flags & AA_BACKBONE) != AA_BACKBONE) {
      pdbaac_pmiss_(AA_BACKBONE);
      goto ERR;
    } else if ((r->flags & hvflags) != hvflags) {
      pdbaac_pmiss_(hvflags);
    }
    r->xn = pdbaac_x(c, i, N);
    r->xca = pdbaac_x(c, i, CA);
    r->xc = pdbaac_x(c, i, C);
  }
  /* check bond-length */
  for (i = 0; i < c->nres; i++) {
    real x;
    r = c->res + i;
    if (i > 0) {
      x = rv3_dist(pdbaac_x(c, i-1, C), r->xn);
      if (x < .3 || x > 2.3) {
        if (verbose) {
          const char *aap = pdbaaname(c->res[i-1].aa), *aa = pdbaaname(r->aa);
          fprintf(stderr, "%s: C-N bond between %d (%s) and %d (%s) is broken %g, insert break\n",
            c->file, i, aap, i+1, aa, x);
          goto ERR;
        }
      }
    }
    x = rv3_dist(r->xn, r->xca);
    if (x < .4 || x > 3.0) {
      if (verbose) {
        fprintf(stderr, "%s: N-CA bond of residue %d (%s) is broken %g\n",
          c->file, i+1, pdbaaname(r->aa), x);
        fprintf(stderr, "N : %8.3f %8.3f %8.3f\n", r->xn[0], r->xn[1], r->xn[2]);
        fprintf(stderr, "CA: %8.3f %8.3f %8.3f\n", r->xca[0], r->xca[1], r->xca[2]);
      }
      goto ERR;
    }
    x = rv3_dist(r->xca, r->xc);
    if (x < .4 || x > 3.0) {
      if (verbose) {
        fprintf(stderr, "%s: CA-C bond of residue %d (%s) is broken %g\n",
          c->file, i+1, pdbaaname(r->aa), x);
        fprintf(stderr, "CA: %8.3f %8.3f %8.3f\n", r->xca[0], r->xca[1], r->xca[2]);
        fprintf(stderr, "C : %8.3f %8.3f %8.3f\n", r->xc[0], r->xc[1], r->xc[2]);
      }
      goto ERR;
    }
  }
  if (verbose >= 3) {
    for (i = 0; i < c->nres; i++)
      printf("%4d: %s\n", i+1, pdbaaname(c->res[i].aa));
  }
  return c;
ERR:
  if (c->res) free(c->res);
  if (c->x) free(c->x);
  free(c);
  return NULL;
}

#endif /* ZCOM_PDB__ */
#endif /* ZCOM_PDB */

#ifdef  ZCOM_CLUS
#ifndef ZCOM_CLUS__
#define ZCOM_CLUS__
/* abstract function for cluster analysis using Monte Carlo sampling
 * given the distance matrix, we give out a cluster*/
#include <stdio.h>
#include <stdlib.h>

typedef struct { /* structure of a single cluster */
  int *idx;
  int cnt; /* number of points in this cluster, size of idx */
  int cap; /* capacity for holding */
  int centroid; /* centroid */
  double smdis; /* sum of pair distances, wi*wj*dij */
  double smwt;  /* sum of weights */
  double x, y;  /* multidimensional scaling */
} clus_t;

typedef struct { /* clsys: array of all clusters */
  int np;       /* # of points */
  int nc;       /* # of clusters */
  float **mat;  /* distance matrix i < j */
  double *wt;   /* weight */
  double wtot;  /* total weight */
  clus_t *c;    /* cluster array */
  double ene;   /* energy of all clusters */
  /* auxiliary variables */
  double mu0;   /* input mu */
  double muw;   /* penalty of adding a cluster the actual one
                   the weighted version, 0.5 * mu0 * wtot  */
  double bet;   /* inverse temperature */
  int acc; /* accepted metropolis moves */
} clsys_t;

STRCLS clsys_t *cls_init(float **mat, double *wt, int n, double mu);
STRCLS void cls_free(clsys_t *cls, int matwt);
STRCLS void cls_changemu(clsys_t *cls, double mu);
STRCLS clsys_t *cls_read(const char *fn, void (*rhead)(FILE *, clsys_t *, void *data), void *data);
STRCLS int cls_write(clsys_t *cls, const char *fn,
    void (*whead)(FILE *, const clsys_t *cls, const void *data), const void *data, int ver);
STRCLS clsys_t *cls_anneal(clsys_t *cls, int itmax, int method, double bet0, double bet1);
STRCLS clsys_t *cls_zalgo(clsys_t *cls, int itmax, int method,
    double bet0, double bet1, int nbet, int verbose);


/* allocate space for the distance matrix */
static float **dismat_alloc(int n)
{
  float **mat, *larr;
  int npr, offset, i;

  xnew(mat, n);
  npr = (n+1)*n/2;
  xnew(larr, npr);
  for (offset = 0, i = 0; i < n; i++) {
    mat[i] = larr + offset - i;
    mat[i][i] = 0.;
    offset += n - i;
  }
  return mat;
}

/* free the distance matrix */
static void dismat_free(float **mat)
{
  free(mat[0]);
  free(mat);
}

int CLIDBLOCKSIZ = 16; /* block size for allocating clus.idx */

/* initialize a cluster of a single point i */
static void cl_init(clus_t *cl, int i, double wt)
{
  cl->cnt = 1;
  cl->cap = CLIDBLOCKSIZ;
  xnew(cl->idx, cl->cap);
  cl->idx[0] = i;
  cl->smdis = 0.;
  cl->smwt  = wt;
}

static void cl_free(clus_t *cl)
{
  if (cl->cnt > 0) {
    free(cl->idx);
    memset(cl, 0, sizeof(*cl));
  }
}

/* duplicate information in cluster b to cluster a */
static void cl_copy(clus_t *a, const clus_t *b)
{
  int k;

  a->cnt = b->cnt;
  a->cap = a->cnt;
  xrenew(a->idx, a->cap);
  for (k = 0; k < a->cnt; k++)
    a->idx[k] = b->idx[k];
  a->smdis = b->smdis;
  a->smwt  = b->smwt;
  a->centroid = b->centroid;
}

/* add a new point `i' into the cluster, update energy by `de' */
static void cl_padd(clus_t *cl, int i, double deldis, double delwt)
{
  cl->cnt++;
  if (cl->cnt > cl->cap) {
    cl->cap += CLIDBLOCKSIZ;
    xrenew(cl->idx, cl->cap);
  }
  cl->idx[cl->cnt - 1] = i;
  cl->smdis += deldis;
  cl->smwt  += delwt;
}

/* remove the kth item in cluster 'cl', the energy change de  */
static void cl_premove(clus_t *cl, int k, double deldis, double delwt)
{
  die_if (k >= cl->cnt, "index error %d >= %d\n", k, cl->cnt);
  cl->cnt--;
  if (k < cl->cnt)
    cl->idx[k] = cl->idx[cl->cnt];
  cl->smdis += deldis;
  cl->smwt  += delwt;
}

#define cls_getwt(cls, i)  (cls->wt ? cls->wt[i] : 1.)

/* init. an cluster configuration from an nxn distance matrix */
clsys_t *cls_init(float **mat, double *wt, int n, double mu)
{
  clsys_t *cls;
  int ic;
  double wi;

  xnew(cls, 1);
  cls->np = n;
  cls->nc = n;
  cls->mat = mat;
  cls->wt  = wt;
  xnew(cls->c, cls->nc);
  cls->bet = 0.0;
  cls->wtot = 0.;
  for (ic = 0; ic < cls->nc; ic++) {
    wi = cls_getwt(cls, ic);
    cl_init(cls->c + ic, ic, wi);
    cls->wtot += wi;
  }
  cls->mu0 = mu;
  cls->muw = .5*mu*cls->wtot;
  cls->ene = cls->nc*cls->muw;
  return cls;
}

/* copy b to a, assume a is made by cls_init */
static void cls_copy(clsys_t *a, const clsys_t *b)
{
  int ic;
  a->np = b->np;
  a->nc = b->nc;
  a->mat = b->mat;
  a->wt = b->wt;
  a->bet = b->bet;
  a->wtot = b->wtot;
  a->mu0 = b->mu0;
  a->muw = b->muw;
  a->ene = b->ene;
  for (ic = 0; ic < a->nc; ic++) {
    cl_copy(a->c + ic, b->c + ic);
  }
}

/* find point ip in clusters,
 * return cluster id, *pk is ip's index in the cluster */
static int cls_find(clsys_t *cls, int ip, int *k)
{
  clus_t *ci;
  int ic;

  for (ic = 0; ic < cls->nc; ic++) {
    ci = cls->c + ic;
    for (*k = 0; *k < ci->cnt; (*k)++) {
      if (ip == ci->idx[*k])
        return ic;
    }
  }
  die_if (1, "cannot find point ip %d, np %d, nc %d\n", ip, cls->np, cls->nc);
  return -1;
}

/* add a new cluster with a single point */
static void cls_cadd(clsys_t *cls, int i)
{
  double wt;
  ++cls->nc;
  die_if (cls->nc > cls->np, "too many clusters %d > %d\n", cls->nc, cls->np);
  wt = cls_getwt(cls, i);
  cl_init(cls->c + cls->nc - 1, i, wt);
}

/* remove cluster ic */
static void cls_cremove(clsys_t *cls, int ic)
{
  die_if (ic >= cls->nc, "invalid cluster %d >= %d\n", ic, cls->nc);
  cls->nc--;
  if (ic < cls->nc)
    cl_copy(cls->c + ic, cls->c + cls->nc);
  cl_free(cls->c + cls->nc);
}

/* abandom the cluster structure */
void cls_free(clsys_t *cls, int matwt)
{
  int ic;
  for (ic = 0; ic < cls->np; ic++)
    cl_free(cls->c + ic);
  free(cls->c);
  if (matwt) {
    if (cls->wt != NULL) free(cls->wt);
    dismat_free(cls->mat);
  }
  memset(cls, 0, sizeof(*cls));
  free(cls);
}

/* compute distance between two points i and j */
static __inline double cls_pdis(clsys_t *cls, int i, int j)
{
  die_if (i == j || i < 0 || i >= cls->np || j < 0 || j >= cls->np,
      "bad index  i: %d, j: %d, np: %g", i, j, cls->np);
  if (i < j) return cls->mat[i][j]; else return cls->mat[j][i];
}

/* check cluster configuration */
static void cls_check(clsys_t *cls, int it, int acctype)
{
  clus_t *cl;
  int *in, ic, k, ip;

  xnew(in, cls->np);
  for (ic = 0; ic < cls->nc; ic++) {
    cl = cls->c + ic;
    die_if (cl->cnt <= 0, "invalid size %d, %d\n", ic, cl->cnt);
    for (k = 0; k < cl->cnt; k++) {
      ip = cl->idx[k];
      die_if (ip >= cls->np || in[ip], "type %d/%d: invalid ip %d/%d", acctype, it, ip, cls->np);
      in[ip] = 1;
    }
  }
  free(in);
}

/* comparison of two integers */
static int cmpint(const void *a, const void *b)
{
  return (*(const int *)a) - (*(const int *)b);
}

/* sort 1) indices in a cluster, 2) clusters by size */
static void cls_sort(clsys_t *cls)
{
  int ic, jc, im;
  clus_t *cl, cbuf;
  double wm;

  /* sort indices */
  for (ic = 0; ic < cls->nc; ic++) {
    cl = cls->c + ic;
    qsort(cl->idx, cl->cnt, sizeof(int), &cmpint);
  }
  /* sort clusters by size */
  for (ic = 0; ic < cls->nc - 1; ic++) {
    wm = cls->c[ic].smwt;
    for (im = ic, jc = ic + 1; jc  < cls->nc; jc++) {
      if (cls->c[jc].smwt > wm) {
        im = jc;
        wm = cls->c[jc].smwt;
      }
    }
    if (im != ic) {
      memcpy(&cbuf, cls->c + ic, sizeof(clus_t));
      memcpy(cls->c + ic, cls->c + im, sizeof(clus_t));
      memcpy(cls->c + im, &cbuf, sizeof(clus_t));
    }
  }
}

/* compute centroid */
static void cls_centroid(clsys_t *cls)
{
  int ic, i, ip, j, jp, imin;
  double dsm, wj, dmin;
  clus_t *cl;

  for (ic = 0; ic < cls->nc; ic++) {
    /* compute the centroid of cluster cl
     * closest to all other points */
    cl = cls->c + ic;
    if (cl->cnt <= 1) {
      cl->centroid = 0;
      continue;
    }
    for (dmin = 1e9, imin = -1, i = 0; i < cl->cnt; i++) {
      ip = cl->idx[i];
      /* compute distance from others */
      dsm = 0.;
      for (j = 0; j < cl->cnt; j++) {
        if (i == j) continue;
        jp = cl->idx[j];
        wj = cls_getwt(cls, jp);
        dsm += wj*cls_pdis(cls, ip, jp);
      }
      if (dsm < dmin) {
        imin = i;
        dmin = dsm;
      }
    }
    cl->centroid = imin;
  }
}

#define CLUS_METROPOLIS 0x00
#define CLUS_HEATBATH   0x01
#define CLUS_VERBOSE    0x10
#define CLUS_CHECK      0x20
#define CLUS_NOGIANT    0x100 /* ignore a single-cluster configuration during sampling */

/* energy of a single cluster */
static double cls_ene1(clsys_t *cls, int ic, unsigned flags)
{
  clus_t *cl = cls->c + ic;
  int k1, k2, ip, jp;
  double dis, ene, wi, wj, wtot;

  die_if(cl->cnt <= 0, "invalid cluster, n(%d) = %d\n", ic, cl->cnt);
  if (cl->cnt <= 1) return 0.;
  for (dis = 0., wtot = 0., k1 = 0; k1 < cl->cnt; k1++) {
    ip = cl->idx[k1];
    wi = cls_getwt(cls, ip);
    wtot += wi;
    for (k2 = k1+1; k2 < cl->cnt; k2++) {
      jp = cl->idx[k2];
      wj = cls_getwt(cls, jp);
      die_if (ip == jp, "same ip jp %d\n", ip);
      dis += wi*wj*cls_pdis(cls, ip, jp);
    }
  }
  if (flags & CLUS_CHECK) {
    die_if (fabs(cl->smdis - dis) > 1e-2,
      "corruption distance sum %d, %g, vs. %g\n", ic, cl->smdis, dis);
  }
  cl->smdis = dis; /* assign the new energy, to avoid error propagation */
  cl->smwt  = wtot;
  ene = dis/wtot;
  if (flags & CLUS_VERBOSE)
    printf("ENE: clus %d: nc = %d, ene = %g = %g/%g\n", ic, cl->cnt, ene, dis, wtot);
  return ene;
}

/* energy of all clusters */
static double cls_ene(clsys_t *cls, unsigned flags)
{
  int ic;
  double ene;

  ene = cls->nc * cls->muw;
  for (ic = 0; ic < cls->nc; ic++) {
    ene += cls_ene1(cls, ic, flags);
  }
  if (flags & CLUS_VERBOSE)
    printf("ENE: %d clusters, ene = %g\n", cls->nc, ene);
  return ene;
}

/* change mu of the cluster  */
void cls_changemu(clsys_t *cls, double mu)
{
  cls->mu0 = mu;
  cls->muw = .5*mu*cls->wtot;
  cls->ene = cls_ene(cls, 0);
}

/* merge a single-point cluster i to cluster j */
static void cls_merge(clsys_t *cls, int i, int j, double disj, double dene)
{
  clus_t *ci = cls->c + i;
  int ip;
  double wi;

  die_if (ci->cnt != 1, "cluster %d is not alone %d\n", i, ci->cnt);
  ip = ci->idx[0];
  wi = cls_getwt(cls, ip);
  cl_padd(cls->c + j, ip, disj, wi); /* add ip to cluster j */
  cls_cremove(cls, i); /* remove cluster i, done after cl_padd() to avoid messing up with cluster index */
  cls->ene += dene;
}

/* remove the kth point in cluster i, and add it to cluster j */
static void cls_transfer(clsys_t *cls, int i, int k, int j,
    double disi, double disj, double dene)
{
  clus_t *ci = cls->c + i;
  int ip;
  double wi;

  die_if (ci->cnt <= 1, "cluster %d is alone %d\n", i, ci->cnt);
  die_if (k < 0 || k >= ci->cnt, "point %d out of range %d\n", k, ci->cnt);
  ip = ci->idx[k];
  wi = cls_getwt(cls, ip);
  cl_premove(ci, k, -disi, -wi);
  if (j == cls->nc) {
    cls_cadd(cls, ip);
  } else {
    cl_padd(cls->c + j, ip, disj, wi);
  }
  cls->ene += dene;
}

/* move the kth point in cluster i, and add it to cluster j
 * combines cls_merge() and cls_transfer() */
static int cls_pmove(clsys_t *cls, int i, int ik, int j,
    double disi, double disj, double dene, int iter)
{
  const int freq = 1000;
  clus_t *ci = cls->c + i;
  int mvtype = 0, ip = ci->idx[ik];

  if (i == j) {
    return 0;
  } else if (ci->cnt == 1) {
    cls_merge(cls, i, j, disj, dene);
    mvtype = 1;
  } else {
    cls_transfer(cls, i, ik, j, disi, disj, dene);
    mvtype = 2;
  }

  cls_check(cls, iter, mvtype);
  if (++cls->acc % freq == 0) {
    double ene1 = cls_ene(cls, CLUS_CHECK);
    die_if (fabs(ene1 - cls->ene) > 1e-1,
        "type %d: ene diff %g, %g; i: %d(%d:%d) j: %d\n",
        mvtype, ene1, cls->ene, i, ik, ip, j);
    cls->ene = ene1;
  }
  return mvtype;
}

/* compute the energy change of adding from point k of cluster i to cluster j */
static double cls_deadd(clsys_t *cls, int i, int k, int j, double *dis)
{
  clus_t *ci = cls->c + i, *cj = cls->c + j;
  double wi, wj;
  int ip, k1, jp;

  *dis = 0.;
  if (j == cls->nc) return cls->muw;
  ip = ci->idx[k];
  wi = cls_getwt(cls, ip);
  for (k1 = 0; k1 < cj->cnt; k1++) {
    jp = cj->idx[k1];
    wj = cls_getwt(cls, jp);
    *dis += wj*cls_pdis(cls, ip, jp);
  }
  *dis *= wi;
  return (cj->smdis + *dis)/(cj->smwt + wi) - cj->smdis/cj->smwt;
}

/* compute the energy change of removing kth point from cluster i */
static double cls_deremove(clsys_t *cls, int i, int k, double *dis)
{
  clus_t *ci = cls->c + i;
  double wi, wj;
  int ip, k1, jp;

  die_if (i >= cls->nc, "i: %d, nc %d\n", i, cls->nc);
  if (ci->cnt == 1) {
    *dis = 0;
    return -cls->muw;
  }
  die_if (k < 0 || k >= ci->cnt, "bad index k = %d, cnt = %d\n", k, ci->cnt);
  ip = ci->idx[k];
  wi = cls_getwt(cls, ip);
  for (*dis = 0., k1 = 0; k1 < ci->cnt; k1++) {
    if (k == k1) continue;
    jp = ci->idx[k1];
    wj = cls_getwt(cls, jp);
    *dis += wj*cls_pdis(cls, ip, jp);
  }
  *dis *= wi;
  return (ci->smdis - *dis)/(ci->smwt - wi) - ci->smdis/ci->smwt;
}

/* one step, metropolis move of a cluster */
static int cls_metropolis(clsys_t *cls, int iter, unsigned flags)
{
  clus_t *ci;
  int i, j, k, acc;
  double dene, disi, disj;

  if (cls->nc < 1) return 0;
  i = (int)(cls->nc * rnd0());
  ci = cls->c + i;
  k = (int)(ci->cnt * rnd0());
  die_if(ci->cnt <= 0, "empty cluster %d\n", i);
  if (ci->cnt == 1) { /* a cluster of a single point */
    /* try to merge to another cluster */
    if ((flags & CLUS_NOGIANT) && cls->nc == 2) return 0; /* forbid giant formation */
    j = (int)((cls->nc - 1)*rnd0());
    j = (i+1+j) % cls->nc; /* choose any other cluster */
  } else {
    j = (int)(cls->nc * rnd0());
    j = (i+1+j) % (cls->nc + 1); /* m for a new cluster */
  }
  dene = cls_deremove(cls, i, k, &disi)
       + cls_deadd(cls, i, k, j, &disj);
  acc = ((dene < 0.0) || rnd0() < exp(-cls->bet*dene));
  if ((flags & CLUS_NOGIANT) && cls->nc == 1) /* always encourage forming two clusters */
    acc = 1;
  if (acc) { /* accept the move */
    cls_pmove(cls, i, k, j, disi, disj, dene, iter);
  }
  return 0;
}

/* choose one from a heat bath choice */
static int heatbathchoose(double *dene, int n, double bet)
{
  double *prob, demin, deb, r;
  int j;

  /* select a cluster to join */
  xnew(prob, n);
  for (demin = 1e9, j = 0; j < n; j++) {
    if (dene[j] < demin) demin = dene[j];
  }
  for (j = 0; j < n; j++) { /* probability */
    deb = bet*(dene[j] - demin);
    if (deb > 100.0) prob[j] = 0.;
    else prob[j] = exp(-deb);
  }
  for (j = 1; j < n; j++) {
    prob[j] = prob[j-1] + prob[j];
  }
  r = prob[n - 1]*rnd0();
  for (j = 0; j < n; j++) {
    if (r < prob[j]) break;
  }
  die_if (j >= n, "bad index %d > %d\n", j, n);
  free(prob);
  return j;
}

/* heat bath, if ip == -1, randomly pick a particle */
static int cls_heatbath(clsys_t *cls, int ip, int iter, unsigned flags)
{
  clus_t *ci;
  int i, j, k = -1, mvtype = 0;
  double dei, disi;
  double *dene, *disj;

  if (ip >= 0 && ip < cls->np) {
    i = cls_find(cls, ip, &k);
    ci = cls->c + i;
  } else {
    i = (int)(cls->nc*rnd0());
    ci = cls->c + i;
    k = (int)(ci->cnt*rnd0());
  }
  die_if(ci->cnt <= 0 || k < 0, "empty cluster %d or bad k %d", ci->cnt, k);

  if (flags & CLUS_NOGIANT) {
    /* forbid the transition from cluster i to the other (giant) */
    if (cls->nc == 2 && ci->cnt == 1)
      return 0;
  }

  xnew(dene, cls->nc + 1); /* energy change vector */
  xnew(disj, cls->nc + 1);
  /* energy of removing k'th point from cluster i */
  dei = cls_deremove(cls, i, k, &disi);
  /* energy of adding that point to cluster j */
  for (j = 0; j <= cls->nc; j++) {
    if (j == i) continue;
    dene[j] = dei + cls_deadd(cls, i, k, j, &disj[j]);
  }
  if (ci->cnt == 1) dene[cls->nc] = 1e9; /* disable the last option for an alone cluster */
  dene[i] = 0.0;

  /* choose from the heat bath */
  if ((flags & CLUS_NOGIANT) && cls->nc == 1) {
    j = 1;
  } else {
    j = heatbathchoose(dene, cls->nc + 1, cls->bet);
  }
  if (j == i) goto EXIT; /* no move */

  /* accept the move */
  mvtype = cls_pmove(cls, i, k, j, disi, disj[j], dene[j], iter);
EXIT:
  free(dene);
  free(disj);
  return mvtype;
}

/* energy minimization */
static int cls_minimize(clsys_t *cls, int verbose)
{
  int iter, ip, changed = 1, nc0 = cls->nc, nc1;
  double bet, ene0 = cls->ene;

  bet = cls->bet;
  cls->bet = 1e9;
  for (iter = 0; ; iter++) {
    changed = 0;
    nc1 = cls->nc;
    for (ip = 0; ip < cls->np; ip++) {
      if (cls_heatbath(cls, ip, iter, 0))
        changed++;
    }
    if (verbose >= 2) printf("%d: nc: %d -> %d, changed = %d\n", iter, nc1, cls->nc, changed);
    if (!changed) break;
  }
  cls->bet = bet;
  if (verbose)
    printf("enemin %4d iter: (%3d %12.3f) --> (%3d %12.3f)\n",
      iter, nc0, ene0, cls->nc, cls->ene);
  return iter > 1;
}

/* do multidimensional scaling, cls_centroid should be called */
static int cls_mdscal(clsys_t *cls)
{
  int i, j, ip, jp, n;
  clus_t *ci, *cj;
  real *dm, *xy;

  /* construct cluster-cluster distance matrix */
  n = cls->nc;
  xnew(dm, n*n);
  for (i = 0; i < n; i++) {
    ci = cls->c + i;
    ip = ci->idx[ci->centroid];
    for (j = i+1; j < n; j++) {
      cj = cls->c + j;
      jp = cj->idx[cj->centroid];
      dm[i*n+j] = dm[j*n+i] = (real) cls_pdis(cls, ip, jp);
    }
    dm[i*n+i] = 0.;
  }

  /* initialize coordinates */
  xnew(xy, n*2);

  /* low level multidimensional scaling */
  mds_min0(xy, dm, n, 2, 1e-16);

  /* copy coordinates */
  for (i = 0; i < n; i++) {
    ci = cls->c + i;
    ci->x = xy[i*2];
    ci->y = xy[i*2+1];
  }

  free(xy);
  free(dm);
  return 0;
}

/* prepare cls for a better representation */
static void cls_trim(clsys_t *cls)
{
  cls_sort(cls); /* sort the cluster list */
  cls_centroid(cls);
  cls_mdscal(cls);
}

/* evenly spaced nc clusters */
static int cls_evensplit(clsys_t *cls, int nc)
{
  int i, ic, ppc;
  clus_t *ci;

  die_if (nc > cls->np, "too many clusters %d > %d\n", nc, cls->np);
  cls->nc = nc;

  /* compute number of points in each cluster */
  ppc = cls->np / nc;

  for (ic = 0; ic < nc; ic++) {
    ci = cls->c + ic;
    ci->cnt = (ic == nc - 1) ? (cls->np - ppc*(nc-1)) : ppc;
    ci->cap = ci->cnt;
    die_if(ci->cnt <= 0, "cnt %d, np %d, ic %d, nc %d, ppc %d\n",
        ci->cnt, cls->np, ic, nc, ppc);
    xrenew(ci->idx, ci->cap);
    for (i = 0; i < ci->cnt; i++) {
      ci->idx[i] = ic*ppc + i;
    }
  }
  for (ic = nc; ic < cls->np; ic++)
    cl_free(cls->c + ic);
  cls->ene = cls_ene(cls, 0);
  return 0;
}


/* main function for clustering: given a distance matrix return a clustering conf. */
clsys_t *cls_anneal(clsys_t *cls,
    int itmax, int method, double bet0, double bet1)
{
  int iter;

  for (iter = 0; iter < itmax; iter++) {
    cls->bet = bet0 + (bet1 - bet0)*iter/itmax;
    if (method == CLUS_HEATBATH) {
      cls_heatbath(cls, -1, iter, 0);
    } else {
      cls_metropolis(cls, iter, 0);
    }
  }
  cls_minimize(cls, 1);
  cls_trim(cls);
  return cls;
}

clsys_t *cls_zalgo(clsys_t *cls, int itermax, int method,
    double bet0, double bet1, int nbet, int verbose)
{
  int nstage = 30, ncm = 10;
  int isz, is, ib, jb, i, it, iter, freemode = 0;
  double lnf, lnfree, r, gemin;
  double *barr, *lnz, *hist, *esum;
  double *muhist;
  clsys_t **clsb;

  /* size dependent cluster configuration */
  if (ncm > cls->np-1) ncm = cls->np-1;
  gemin = 1e9;
  xnew(clsb, ncm+1);
  xnew(muhist, ncm+1);
  for (i = 0; i <= ncm; i++) {
    clsb[i] = cls_init(cls->mat, cls->wt, cls->np, cls->mu0);
    if (i > 0) {
      cls_evensplit(clsb[i], i);
    }
  } /* 0 is for all cluster config. w/ size > ncm */

  if (nbet < 2) nbet = 2;
  xnew(lnz, nbet);
  xnew(barr, nbet);
  xnew(hist, nbet);
  xnew(esum,  nbet);
  for (ib = 0; ib < nbet; ib++) {
    barr[ib] = exp(log(bet0) + ib*(log(bet1)  - log(bet0))/(nbet-1));
    lnz[ib] = 0.;
  }
  ib = 0;
  cls->bet = barr[0];
  iter = 1;
  if (itermax <= 0) { /* automatically determine */
    itermax = (cls->np < 10000) ? 10*cls->np*cls->np : 1000000000;
    if (verbose)
      printf("automatically determine itermax = %d for n = %d\n", itermax, cls->np);
  }
  lnfree = 0.;
  gemin = 1e9;
  for (lnf = 1., is = 0; is <= nstage; is++, lnf *= .316227766) {
    /* clear histogram */
    for (i = 0; i < nbet; i++) esum[i] = hist[i] = 0.;

    for (it = 1; ; it++) {
      int cl_flags = CLUS_NOGIANT; /* do not sample a single cluster */
      if (method == CLUS_HEATBATH) {
        cls_heatbath(cls, -1, iter, cl_flags);
      } else {
        cls_metropolis(cls, iter, cl_flags);
      }
      /* temperature transition */
      jb = (int)(ib + rnd0()*(nbet-1) + 1) % nbet;
      die_if (jb < 0 || jb > nbet, "bad jb = %d\n", jb);
      r = cls->ene*(barr[jb] - barr[ib]) + lnz[jb] - lnz[ib];
      if (r < 0. || rnd0() < exp(-r)) {
        ib = jb;
        cls->bet = barr[ib];
      }
      lnz[ib] += lnf;
      hist[ib] += 1.;
      esum[ib] += cls->ene;

      isz = (cls->nc > ncm) ? 0 : cls->nc;
      muhist[isz] += 1.;
      if (cls->ene < clsb[isz]->ene) {
        cls_copy(clsb[isz], cls);
        if (verbose >= 2)
          printf("found new emin[%d]  = %g     \r", isz, clsb[isz]->ene);
      }
      if (cls->ene < gemin) {
        gemin = cls->ene;
        if (verbose)
          printf("found new gemin = %g     \r", gemin);
      }
      if (++iter >= itermax) break;

      if (it % 100 == 0) {
        /* clean up lnz */
        for (i = 0; i < nbet; i++) lnz[i] -= lnz[nbet-1];
        lnfree = 1.*nbet/iter;
        if (lnf < lnfree && is > 3)
          freemode = 1;
        if (freemode) {
          /* once enter freemode never quit */
          lnf = lnfree;
        } else {  /* check histogram */
          double hmin = 1e9, hmax = 0.;
          for (i = 0; i < nbet; i++) {
            if (hist[i] < hmin) hmin = hist[i];
            if (hist[i] > hmax) hmax = hist[i];
          }
          if ((hmax-hmin)/(hmax+hmin) < 0.3) break; /* each bet met once */
        }
      }
    }
    if (verbose) {
      printf("stage %d is complete after %d/%d, emin = %g, lnf = %g, lnfree = %g\n",
          is, it, iter, gemin, lnf, lnfree);
    }
    if (verbose >= 3 || (verbose >=2 && iter >= itermax)) {
      for (i = 0; i < nbet; i++) {
        double eav;
        if (hist[i] > 0.) eav = esum[i]/hist[i]; else eav = 0.;
        printf("%8.4f %8.2f %g %g\n", barr[i], lnz[i], hist[i], eav);
      }
    }
    if (iter >= itermax) {
      printf("exceeds the maximal iterations %d in stage %d\n", iter, is);
      break;
    }
  }

  free(lnz);
  free(barr);
  free(hist);
  free(esum);

  /* search for the best */
  for (gemin = 1e9, i = -1, isz = 0; isz <= ncm; isz++) {
    printf("%4d, %10.0f ", isz, muhist[isz]);
    cls_minimize(clsb[isz], 1);
    if (clsb[isz]->ene < gemin) {
      gemin = clsb[isz]->ene;
      i = isz;
    }
  }
  cls_copy(cls, clsb[i]);
  for (isz = 0; isz <= ncm; isz++)
    cls_free(clsb[isz], 0);
  free(clsb);
  free(muhist);
  cls_trim(cls);
  return cls;
}

/* write cluster results
 * `whead' is a call-back function for writing extra information */
int cls_write(clsys_t *cls, const char *fn,
    void (*whead)(FILE *, const clsys_t *cls, const void *data), const void *data,
    int version)
{
  FILE *fp;
  int ic, ni, k, ip, k1;
  clus_t *ci;
  double wtot;

  if ((fp = fopen(fn, "w")) == NULL) {
    fprintf(stderr, "cannot open file %s\n", fn);
    return -1;
  }

  /* basic information */
  fprintf(fp, "# %d %d %g %d\n", cls->np, cls->nc, cls->mu0, version);

  /* call the callback function */
  if (whead != NULL) {
    (*whead)(fp, cls, data);
  } else {
    for (k = 0; k < cls->np; k++) {
      fprintf(fp, "# %d %g\n", k, cls_getwt(cls, k));
    }
  }

  /* write the rmsd matrix */
  for (k = 0; k < cls->np - 1; k++) {
    fprintf(fp, "# %d ", k);
    for (k1 = k+1; k1 < cls->np; k1++) {
      fprintf(fp, "%.3f ", cls->mat[k][k1]);
    }
    fprintf(fp, "\n");
  }

  for (ic = 0; ic < cls->nc; ic++) {
    ci = cls->c + ic;
    ni = ci->cnt;
    /* compute cluster weight */
    for (wtot = 0., k = 0; k < ni; k++) {
      ip = ci->idx[k];
      wtot += cls_getwt(cls, ip);
    }
    fprintf(fp, "%d %d %g %d %g %g: ",
        ic, ni, wtot, ci->idx[ci->centroid], ci->x, ci->y);
    for (k = 0; k < ni; ) {
      ip = ci->idx[k];
      for (k1 = k; k1 < ni-1; k1++) {
        if (ci->idx[k1]+1 != ci->idx[k1+1])
          break;
      }
      if (k1 == k || k == ni-1) {
        fprintf(fp, "%d ", ip);;
      } else {
        fprintf(fp, "%d-%d ", ip, ci->idx[k1]);
      }
      k = k1 + 1;
    }
    fprintf(fp, "\n");
  }
  fclose(fp);
  return 0;
}

/* read a cluster file and construct a rmsd matrix */
clsys_t *cls_read(const char *fn,
    void (*rhead)(FILE *, clsys_t *, void *data), void *data)
{
  FILE *fp;
  int ic, ni, k, ip, k1, np, nc, itmp, ret, ver;
  clsys_t *cls;
  clus_t *cl;
  char buf[1024] = "", word[128], *p;
  float **dismat;
  double *wt, wtot, mu, x, y;

  if ((fp = fopen(fn, "r")) == NULL) {
    fprintf(stderr, "cannot open file %s\n", fn);
    return NULL;
  }

  /* read in basic information */
  die_if (NULL == fgets(buf, sizeof buf, fp), "%s: no first line\n", fn);
  die_if (4 != sscanf(buf, "#%d%d%lf%d", &np, &nc, &mu, &ver),
      "%s: first line broken\n", fn);

  /* allocate space */
  xnew(wt, np);
  dismat = dismat_alloc(np);

  /* read wt */
  for (k = 0; k < np; k++) {
    die_if (NULL == fgets(buf, sizeof buf, fp), "no weight %d\n", k);
    ret = sscanf(buf, "#%d%lf", &itmp, &wt[k]);
    die_if (2 != ret, "no weight information at k = %d, ret = %d\n", k, ret);
    die_if (k != itmp, "index mismatch %d vs. %d", itmp, k);
  }

  /* read the rmsd matrix */
  for (k = 0; k < np - 1; k++) {
    ret = fscanf(fp, " #%d", &itmp);
    die_if (1 != ret, "cannot scan index k = %d, ret = %d\n", k, ret);
    die_if (k != itmp, "wrong id %d should be %d\n", itmp, k);
    for (k1 = k+1; k1 < np; k1++) {
      die_if (1 != fscanf(fp, "%f", &dismat[k][k1]),
          "cannot read matrix %d, %d\n", k, k1);
    }
  }

  /* initialize a configuration */
  cls = cls_init(dismat, wt, np, mu);

  /* read cluster configuration */
  cls->nc = nc;
  for (ic = 0; ic < nc; ic++) {
    die_if (6 != fscanf(fp, " %d%d%lf%d%lf%lf : ", &k1, &ni, &wtot, &itmp, &x, &y),
        "cannot read tag info for cluster %d", ic);
    cl = cls->c + ic;
    cl->cnt = ni;
    cl->cap = cl->cnt;
    cl->x = x;
    cl->y = y;
    xrenew(cl->idx, cl->cap);
    /* start reading indices */
    for (k = 0; k < ni; ) {
      word[0] = '\0';
      die_if (1 != fscanf(fp, "%s", word), "no atoms left, ic = %d, k = %d", ic, k);
      p = strchr(word, '-');
      if (p != NULL) {
        *p = '\0';
        ip = atoi(word);
        itmp = atoi(p+1);
        die_if (ip >= itmp, "%d >= %d\n", ip, itmp);
        die_if (k+itmp+1-ip > ni, "%d + (%d,%d) > %d", k, ip, itmp, ni);
        for (k1 = ip; k1 <= itmp; k1++)
          cl->idx[k++] = k1;
      } else {
        cl->idx[k++] = atoi(word);
      }
    }
  }
  for (ic = nc; ic < cls->np; ic++) /* free the rest */
    cl_free(cls->c + ic);
  cls->ene = cls_ene(cls, 0);
  cls_centroid(cls);

  /* start reading additional info. callback */
  rewind(fp);
  die_if(NULL == fgets(buf, sizeof buf, fp), "%s: no first line (rescan)\n", fn);
  if (rhead != NULL) (*rhead)(fp, cls, data);

  fclose(fp);
  return cls;
}

#endif /* ZCOM_CLUS__ */
#endif /* ZCOM_CLUS */

#ifdef  ZCOM_ISING2
#ifndef ZCOM_ISING2__
#define ZCOM_ISING2__


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>

typedef struct {
  int d, l, n;
  int M, E;
  int *s; /* 0 or 1 */
  /* helper vars */
  uint32_t *uproba; /* temporary probability for MC transitions */
} ising_t;

STRCLS int     is2_em(ising_t *is);
STRCLS int     is2_check(ising_t *is);
STRCLS int     is2_load(ising_t *is, const char *fname);
STRCLS int     is2_save(const ising_t *is, const char *fname);
STRCLS double  is2_exact(ising_t *is, double beta, double *eav, double *cv);
STRCLS int     is2_pick(const ising_t *is, int *h);
STRCLS int     is2_flip(ising_t *is, int id, int h);
ising_t*is2_open(int l);
STRCLS void    is2_close(ising_t *is);

/* set transition probability */
#define IS2_SETPROBA(is, bet) { \
  double x_ = exp(-4. * bet); \
  is->uproba[2] = (uint32_t) ((double)(0xffffffff) * x_); \
  is->uproba[4] = (uint32_t) ((double)(0xffffffff) * x_*x_); }

/* faster macros for systems with fixed (upon compiling) size
 * to use them one must define IS2_LB before including
 * IS2_PICK()/IS2_PSEQ() and IS2_FLIP() */
#ifdef  IS2_LB  /* L = 2^LB, N = L*L */
#define IS2_L   (1 << IS2_LB)
#define IS2_N   (IS2_L * IS2_L)

#define IS2_GETH(is, id, h) { \
  unsigned ix, iy; \
  iy = id / IS2_L, ix = id % IS2_L; \
  h = is->s[id]*(is->s[iy*IS2_L + (ix+1)%IS2_L] \
               + is->s[iy*IS2_L + (ix+IS2_L-1)%IS2_L] \
               + is->s[(iy+1)%IS2_L*IS2_L + ix] \
               + is->s[(iy-1+IS2_L)%IS2_L*IS2_L + ix]); }
#define IS2_IRND(is, id)  id = rand32() >> (32 - 2*IS2_LB);
/* random picking */
#define IS2_PICK(is, id, h) { IS2_IRND(is, id); IS2_GETH(is, id, h); }
#define IS2_ISEQ(is, id)  id = (id + 1) % IS2_N;
/* sequential picking */
#define IS2_PSEQ(is, id, h) { IS2_ISEQ(is, id); IS2_GETH(is, id, h); }

#define IS2_FLIP(is, id, h) { \
  is->M += (is->s[id] = -is->s[id]) * 2; \
  is->E += h * 2; }

#else

#define IS2_PICK(is, id, h)  id = is2_pick(is, &h)
#define IS2_FLIP(is, id, h)  is2_flip(is, id, h)
#endif /* IS2_LB */


/* compute total energy and magnetization */
int is2_em(ising_t *is)
{
  int l, i, j, s, u, e, m, *p, *pu;

  e = m = 0;
  p = is->s;
  l = is->l;
  for (i = 0; i < l; ) {
    pu = (++i == l) ? is->s : p+l;
    for (j = 0; j < l; ) {
      m += (s = *p++);
      u = *pu++;
      e += s*(u + ((++j == l) ? *(p-l) : *p));
    }
  }
  is->M = m;
  return is->E = -e;
}

int is2_check(ising_t *is)
{
  int i, e, m;

  for (i = 0; i < is->n; i++)
    if (is->s[i] != 1 && is->s[i] != -1) {
      fprintf(stderr, "error: s[%d] = %d\n", i, is->s[i]);
      return -1;
    }
  e = is->E;
  m = is->M;
  is2_em(is);
  if  (e != is->E || e < -2*is->n || e > 2*is->n
    || m != is->M || m < -is->n   || m > is->n) {
    fprintf(stderr, "error: E = %d, %d; M = %d, %d\n",
        e, is->E, m, is->M);
    return -1;
  }
  return 0;
}

/* pick a random site, count neighbors with different spins */
int is2_pick(const ising_t *is, int *h)
{
  int id, ix, iy, l, lm, n, nm, *p;

  lm = (l = is->l) - 1;
  nm = (n = is->n) - l;
  id = (int)(rnd0() * n);
  iy = id / l, ix = id % l;
  p = is->s + id;
  *h = *p * ( ((ix != 0 ) ? *(p-1) : *(p+lm))   /* left  */
            + ((ix != lm) ? *(p+1) : *(p-lm))   /* right */
            + ((iy != 0 ) ? *(p-l) : *(p+nm))   /* down  */
            + ((iy != lm) ? *(p+l) : *(p-nm))); /* up    */
  return id;
}

/* flip site id, with h different neighbors */
int is2_flip(ising_t *is, int id, int h)
{
  assert(id < is->n);
  is->M += (is->s[id] = -is->s[id])*2;
  return is->E += h*2;
}

int is2_load(ising_t *is, const char *fname)
{
  FILE *fp;
  int i, lx, ly, n, c;
  char s[80];

  if ((fp = fopen(fname, "r")) == NULL) {
    return -1;
  }
  if (fgets(s, sizeof s, fp) == NULL) {
    fprintf(stderr, "missing first line %s\n", fname);
    return -1;
  }
  if (4 != sscanf(s, "%d%d%d%d", &i, &lx, &ly, &n)
      || i != 2 || lx != ly || lx != is->l || n != is->n) {
    fprintf(stderr, "bad setting: %dD, %dx%d = %d\n", i, lx, ly, n);
    return -1;
  }
  for (i = 0; i < n; i++) {
    while ((c=fgetc(fp)) != EOF && c == '\n') ;
    if (c == EOF) break;
    is->s[i] = (c == ' ') ? -1 : 1;
  }
  if (i < n)
    fprintf(stderr, "%s: data stopped at i = %d\n", fname, i);
  fclose(fp);
  is2_em(is);
  return 0;
}

int is2_save(const ising_t *is, const char *fname)
{
  FILE *fp;
  int i, j, l, *p;

  if ((fp = fopen(fname, "w")) == NULL) {
    fprintf(stderr, "cannot write %s\n", fname);
    return -1;
  }
  l = is->l;
  fprintf(fp, "%d %d %d %d\n", is->d, l, l, is->n);
  for (p = is->s, i = 0; i < l; i++) {
    for (j = 0; j < l; j++, p++)
      fprintf(fp, "%c", (*p > 0) ? '#' : ' ');
    fprintf(fp, "\n");
  }
  fclose(fp);
  return 0;
}

/* initialize an lxl Ising model */
ising_t *is2_open(int l)
{
  int i, n;
  ising_t *is;

  xnew(is, 1);
  is->d = 2;
  is->l = l;
  is->n = n = l*l;
  xnew(is->s, n);
  for (i = 0; i < n; i++) is->s[i] = -1;
  is->M = -n;
  is->E = -2*n;
  xnew(is->uproba, 2*is->d+1);
  is->uproba[0] = 0xffffffff;
  return is;
}

void is2_close(ising_t *is)
{
  if (is != NULL) {
    free(is->s);
    free(is->uproba);
    free(is);
  }
}

/* exact solution of ising model */
double is2_exact(ising_t *is, double beta, double *eav, double *cv)
{
  double lxh, n, ex, f, th, sech, bet2, bsqr, log2, x;
  double lnz, lnz1, lnz2, lnz3, lnz4, dz, ddz;
  double z21, z31, z41, za1;
  double dr1, dr2, dr3, dr4, ddr1, ddr2, ddr3, ddr4;
  double g, g0, dg, ddg, dg0;
  double xn2b, sh2b, coth2b;
  double lnch2b, lncc2b, lncl, lnsl, cd, cdsqr, lnddcl;
  int r, sgn4 = 1, lx, ly;

  lx = is->l;
  ly = is->l;
  lxh = .5*lx;
  n = lx * ly;
  log2 = log(2.0);
  bet2 = 2.*beta;
  bsqr = beta*beta;
  xn2b = exp(-bet2);
  if (lx == 2 && ly == 2) { /* 2x2 system */
    double lnc, lnd;
    x = 8.*beta;
    lnc = lnadd(x, -x);
    lnd = lnaddn(lnc, 6.);
    lnz = lnd + log2;
    *eav = -8.*exp(lndif(x, -x) - lnd); /* -8*sinh(8*b)/(3+cosh(8*h)) */
    *cv = bsqr * 384. * exp(lnaddn(lnc,2./3) - 2.0*lnd); /* 64*(1+3cosh(8*b))/(3+cosh(8*b))^2 */
    return lnz;
  } else if (fabs(beta) < 1e-6) { /* high T approx. normal branch unstable if beta < 1e-6 */
    lnz = n * (2.*lnadd(beta, -beta) - log2);
    x = 1. + xn2b;
    *eav = -2. * n * (1. - xn2b)/x;
    *cv = bsqr * 8.*n*xn2b/(x*x);
    return lnz; /* +n*tanh(beta)^4 */
  }

  lnz1 = lnz2 = lnz3 = lnz4 = 0;
  dr1 = dr2 = dr3 = dr4 = 0;
  ddr1 = ddr2 = ddr3 = ddr4 = 0;
  lnch2b = lnadd(bet2, -bet2) - log2;
  coth2b = 2./(1. - xn2b*xn2b) - 1.;
  lncc2b = lnch2b + log(coth2b); /* ln[ cosh(2b) * coth(2b) ] */
  g0 = bet2 + log(2./(1. + xn2b) - 1.);
  sgn4 = (g0 >= 0) ? 1 : -1;

  sh2b = 0.5*(1./xn2b - xn2b);
  dg0 = 2. + 2./sh2b;
  x = sh2b*sh2b;
  cd = 2. - 2./x; /* cl' = cd * cosh(2b) */
  cdsqr = cd*cd;
  lnddcl = lnaddn(lncc2b, 2.0/(x * sh2b)) + 2.*log2; /* log(cl'') */

  for (r = 0; r < ly; r++) { /* for odd number */
    lncl = lnaddn(lncc2b, -cos((2.*r + 1.)*M_PI/ly));
    lnsl = lncl + 0.5*log(1. - exp(-2.*lncl));
    g = lnadd(lncl, lnsl);
    f = lxh*g;
    lnz1 += lnadd(f, -f);
    lnz2 += lndif(f, -f);

    dg = exp(lnch2b - lnsl)*cd; /* g' = cl'/sl; */
    ex = exp(-f);
    th = 2./(1. + ex*ex) - 1.;
    x = lxh*dg;
    dr1 += x*th;
    dr2 += x/th;

    /* g''=cl''/sl - cl' ^2 *cl/sl^3; */
    ddg = exp(lnddcl - lnsl);
    ddg -= exp(lnch2b*2. + lncl - 3.*lnsl)*cdsqr;
    sech = 2.0*dg/(ex + 1.0/ex); /* g' * sech(0.5*lx*g) */
    ddr1 += lxh*(ddg*th + lxh*(sech*sech));
    sech = 2.0*dg/(ex - 1.0/ex); /* g' * sech(0.5*lx*g) */
    ddr2 += lxh*(ddg/th - lxh*(sech*sech));

    if (r == 0) {
      g = g0;
    } else {
      lncl = lnaddn(lncc2b, -cos(2.0*M_PI*r/ly));
      lnsl = lncl+0.5*log(1-exp(-2*lncl));
      g = lnadd(lncl, lnsl);
      die_if (g < 0.0, "g = %g < 0.\n", g);;
    }
    f = lxh*g;
    lnz3 += lnadd(f, -f); /* log [2 cosh(f)] */
    lnz4 += (f < 0) ? lndif(-f, f) : lndif(f, -f); /* avoid neg. g0 */

    ex = exp(-f);
    th = 2./(1. + ex*ex) - 1.;
    dg = (r == 0) ? dg0 : exp(lnch2b - lnsl)*cd;
    dr3 += lxh*dg*th;
    dr4 += lxh*dg/th;

    if (r == 0) {
      ddg = -4*coth2b*coth2b*exp(-lnch2b);
    } else {
      ddg = exp(lnddcl - lnsl);
      ddg -= exp(lnch2b*2. + lncl - 3.*lnsl)*cdsqr;
    }
    sech = 2.0*dg/(ex + 1.0/ex);
    ddr3 += lxh*(ddg*th + lxh*(sech*sech));
    sech = 2.0*dg/(ex - 1.0/ex);
    ddr4 += lxh*(ddg/th - lxh*(sech*sech));
  }

  z21 = exp(lnz2 - lnz1);
  z31 = exp(lnz3 - lnz1);
  z41 = sgn4*exp(lnz4 - lnz1);
  za1 = 1.0 + z21 + z31 + z41;
  lnz = lnz1 + log(za1);
  lnz += .5*n*log(2.*sh2b) - log2;
  dz = (dr1 + z21*dr2 + z31*dr3 + z41*dr4)/za1;
  *eav = - n*coth2b - dz;
  ddr1 += dr1*dr1;
  ddr2 += dr2*dr2;
  ddr3 += dr3*dr3;
  ddr4 += dr4*dr4;
  ddz = (ddr1 + z21*ddr2 + z31*ddr3 + z41*ddr4)/za1;
  *cv = bsqr * (-2.*n/(sh2b*sh2b) + ddz - dz*dz);
  return lnz;
}

#endif /* ZCOM_ISING2__ */
#endif /* ZCOM_ISING2 */

#ifdef  ZCOM_POTTS2
#ifndef ZCOM_POTTS2__
#define ZCOM_POTTS2__


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>

typedef struct {
  int d; /* dimension */
  int q; /* number of states of each spin */
  int l, n;
  int E;  /* potential energy */
  int *M; /* M[0..q-1] number of spins in each state */
  int *s; /* s[0..n-1], each s[i] in 0..q-1 */
  /* helper vars */
  double *accprb; /* temporary accumulated probabilities, for heat bath */
  uint32_t *uproba; /* temporary probability for MC transitions */
  double *dproba;
} potts_t;

STRCLS int pt2_em(potts_t *pt);
STRCLS int pt2_check(potts_t *pt);
STRCLS int pt2_load(potts_t *pt, const char *fname);
STRCLS int pt2_save(const potts_t *pt, const char *fname);
STRCLS potts_t *pt2_open(int l, int q);
STRCLS void pt2_close(potts_t *pt);

#define PT2_SETPROBA(pt, bet) { \
  double x_ = exp(-bet), prd_; \
  prd_  = x_; pt->dproba[1] = prd_; pt->uproba[1] = (uint32_t) (4294967295. * prd_); \
  prd_ *= x_; pt->dproba[2] = prd_; pt->uproba[2] = (uint32_t) (4294967295. * prd_); \
  prd_ *= x_; pt->dproba[3] = prd_; pt->uproba[3] = (uint32_t) (4294967295. * prd_); \
  prd_ *= x_; pt->dproba[4] = prd_; pt->uproba[4] = (uint32_t) (4294967295. * prd_); \
}

/* faster macros for systems with fixed (upon compiling) size
 * to use them one must define PT2_LB and PT2_Q before including
 * PT2_PICK()/PT2_PSEQ() and PT2_FLIP() */
#ifdef  PT2_LB  /* L = 2^LB, N = L*L */
#define PT2_L   (1 << PT2_LB)
#define PT2_N   (PT2_L * PT2_L)

#define PT2_GETH(pt, id, h) { \
  unsigned ix, iy; \
  for (ix = 0; ix < PT2_Q; ix++) h[ix] = 0; \
  iy = id / PT2_L, ix = id % PT2_L; \
  h[ pt->s[iy*PT2_L + (ix+1)%PT2_L]       ]++; \
  h[ pt->s[iy*PT2_L + (ix+PT2_L-1)%PT2_L] ]++; \
  h[ pt->s[(iy+1)%PT2_L*PT2_L + ix]       ]++; \
  h[ pt->s[(iy-1+PT2_L)%PT2_L*PT2_L + ix] ]++; }
#define PT2_IRND(pt, id)  id = rand32() >> (32 - 2*PT2_LB);
/* random pick */
#define PT2_PICK(pt, id, h) { PT2_IRND(pt, id); PT2_GETH(pt, id, h); }
#define PT2_ISEQ(pt, id)  id = (id + 1) % PT2_N;
/* sequential pick */
#define PT2_PSEQ(pt, id, h) { PT2_ISEQ(pt, id); PT2_GETH(pt, id, h); }

/* change spin id from so to sn (use PT2_Q instead of pt->q) */
#define PT2_NEWFACE(pt, id, so, sn) { \
  so = pt->s[id]; sn = (so + 1 + (int)(rnd0()*(PT2_Q - 1))) % PT2_Q; }

/* change spin id from so to sn according to heat bath algorithm
 * local accprb is somehow faster */
#define PT2_HEATBATH(pt, id, so, sn, h) { \
  static double accprb[PT2_Q+1] = {0.,}; double rs_; \
  so = pt->s[id]; \
  for (sn = 0; sn < PT2_Q; sn++) accprb[sn+1] = accprb[sn] + pt->dproba[4-h[sn]]; \
  for (rs_ = accprb[PT2_Q]*rnd0(), sn = 0; sn < PT2_Q; sn++) if (accprb[sn+1] > rs_) break; \
}

#define PT2_FLIP(pt, id, so, sn, h) { \
  pt->s[id] = sn; \
  pt->M[so]--; \
  pt->M[sn]++; \
  pt->E += h[so] - h[sn];  }

#else  /* non-macro version */

#define PT2_PICK(pt, id, h)  id = pt2_pick(pt, h)
#define PT2_NEWFACE(pt, id, so, sn) { \
  so = pt->s[id]; sn = (so + 1 + (int)(rnd0()*(pt->q - 1))) % (pt->q); }
#define PT2_HEATBATH(pt, id, so, sn, h) \
  pt2_heatbath(pt, id, &so, &sn, h)
#define PT2_FLIP(pt, id, so, sn, h) {so = pt->s[id]; pt2_flip(pt, id, sn, h); }

INLINE int pt2_pick(const potts_t *pt, int h[])
{
  int i, id, ix, iy, l, lm, n, nm, *p;
  int sl, sr, sd, su;

  lm = (l = pt->l) - 1;
  nm = (n = pt->n) - l;
  id = (int)(rnd0() * n);
  iy = id / l, ix = id % l;
  p = pt->s + id;
  for (i = 0; i < pt->q; i++) h[i] = 0;
  sl = ((ix != 0 ) ? *(p-1) : *(p+lm)); h[sl]++;
  sr = ((ix != lm) ? *(p+1) : *(p-lm)); h[sr]++;
  sd = ((iy != 0 ) ? *(p-l) : *(p+nm)); h[sd]++;
  su = ((iy != lm) ? *(p+l) : *(p-nm)); h[su]++;
  return id;
}

INLINE int pt2_heatbath(potts_t *pt, int id, int *so, int *sn,
    const int h[])
{
  double rs_;
  int i, mx_ = 4;
  *so = pt->s[id];
  for (i = 0; i < pt->q; i++)
    pt->accprb[i+1] = pt->accprb[i] + pt->dproba[mx_-h[i]];
  for (rs_ = pt->accprb[pt->q]*rnd0(), i = 0; i < pt->q; i++)
    if (pt->accprb[i+1] > rs_) break;
  die_if (i >= pt->q, "no suitable selection, i = %d\n", i);
  *sn = i;
  return 0;
}

/* flip site `id' to `sn', with h different neighbors */
INLINE int pt2_flip(potts_t *pt, int id, int sn, const int h[])
{
  int so = pt->s[id];
  die_if(id >= pt->n, "id %d >= n %d\n", id, pt->n);
  die_if(sn >= pt->q || sn < 0, "invalid sn %d (q = %d)\n", sn, pt->q);
  pt->s[id] = sn;
  pt->M[so]--;
  pt->M[sn]++;
  return pt->E += h[so] - h[sn];
}

#endif /* PT2_LB */



/* compute the total energy and magnetization */
int pt2_em(potts_t *pt)
{
  int i, j, l, s, s1, s2, *p;

  pt->E = 0;
  p = pt->s;
  l = pt->l;
  for (i = 0; i < pt->q; i++) pt->M[i] = 0;
  for (i = 0; i < l; i++)
    for (j = 0; j < l; j++) {
      s = p[i*l + j];
      s1 = p[((i+1)%l)*l + j];
      if (s1 == s) pt->E--;
      s2 = p[i*l + (j+1)%l];
      if (s2 == s) pt->E--;
      pt->M[s]++;
    }
  return pt->E;
}

int pt2_check(potts_t *pt)
{
  int i, e, *mg, q;

  q = pt->q;
  for (i = 0; i < pt->n; i++) /* check spin value */
    if (pt->s[i] < 0 || pt->s[i] >= q) {
      fprintf(stderr, "error: s[%d] = %d\n", i, pt->s[i]);
      return -1;
    }
  e = pt->E;
  xnew(mg, pt->q);
  for (i = 0; i < pt->q; i++)
    mg[i] = pt->M[i];
  if (e != pt2_em(pt)) { /* check energy */
    fprintf(stderr, "error: E = %d, should be %d\n",
        e, pt->E);
    free(mg);
    return -1;
  }
  for (i = 0; i < pt->q; i++) {
    if (mg[i] != pt->M[i]) {
      fprintf(stderr, "error: M(%d) = %d, should be %d",
          i, mg[i], pt->M[i]);
      free(mg);
      return -1;
    }
  }
  free(mg);
  return 0;
}

/* pick a random site (return its id)
 * compute h[j], the numbers of neighboring spins with value j */
/* load spin configuration */
int pt2_load(potts_t *pt, const char *fname)
{
  FILE *fp;
  int i, lx, ly, n, c;
  char s[80];

  if ((fp = fopen(fname, "r")) == NULL) {
    return -1;
  }
  if (fgets(s, sizeof s, fp) == NULL) {
    fprintf(stderr, "missing first line %s\n", fname);
    return -1;
  }
  if (4 != sscanf(s, "%d%d%d%d", &i, &lx, &ly, &n)
      || i != 2 || lx != ly || lx != pt->l || n != pt->n) {
    fprintf(stderr, "bad setting: %dD, %dx%d = %d\n", i, lx, ly, n);
    return -1;
  }
  for (i = 0; i < n; i++) {
    while ((c=fgetc(fp)) != EOF && c == '\n') ;
    if (c == EOF) break;
    c -= '0';
    if (c < 0 || c >= pt->q) {
      fprintf(stderr, "BAD %s s[%d] = %d, q = %d\n", fname, i, c, pt->q);
      break;
    }
    pt->s[i] = c;
  }
  if (i < n) {
    fprintf(stderr, "%s: data stopped at i = %d, clear\n", fname, i);
    for (i = 0; i < n; i++) pt->s[i] = 0;
  }
  fclose(fp);
  pt2_em(pt); /* re-compute energy/magnetization */
  return 0;
}

/* save spin configuration */
int pt2_save(const potts_t *pt, const char *fname)
{
  FILE *fp;
  int i, j, l, *p;

  if ((fp = fopen(fname, "w")) == NULL) {
    fprintf(stderr, "cannot write %s\n", fname);
    return -1;
  }
  l = pt->l;
  fprintf(fp, "%d %d %d %d\n", pt->d, l, l, pt->n);
  for (p = pt->s, i = 0; i < l; i++) {
    for (j = 0; j < l; j++, p++)
      fprintf(fp, "%c", *p+'0');
    fprintf(fp, "\n");
  }
  fclose(fp);
  return 0;
}

/* initialize an lxl q-state Potts model */
potts_t *pt2_open(int l, int q)
{
  int i, n;
  potts_t *pt;

  xnew(pt, 1);
  pt->d = 2;
  pt->q = q;
  pt->l = l;
  pt->n = n = l*l;
  xnew(pt->s, n);
  xnew(pt->M, q);
  for (i = 0; i < n; i++)
    pt->s[i] = 0;
  for (pt->M[0] = n, i = 1; i < q; i++)
    pt->M[i] = 0;
  pt->E = -pt->d * n;
  xnew(pt->accprb, q+1);
  pt->accprb[0] = 0.;
  /* dynamic array of uproba/dproba seems to be faster */
  xnew(pt->uproba, 2*pt->d+1);
  pt->uproba[0] = 0xffffffffu;
  xnew(pt->dproba, 2*pt->d+1);
  pt->dproba[0] = 1.;
  return pt;
}

void pt2_close(potts_t *pt)
{
  if (pt != NULL) {
    free(pt->s);
    free(pt->M);
    free(pt->accprb);
    free(pt->uproba);
    free(pt->dproba);
    free(pt);
  }
}

#endif /* ZCOM_POTTS2__ */
#endif /* ZCOM_POTTS2 */

#ifdef  ZCOM_MD
#ifndef ZCOM_MD__
#define ZCOM_MD__

#define md_shiftcom(x, n, d) md_shiftcomw(x, NULL, n, d)
#define md_shiftcom3d(x, n) md_shiftcomw3d(x, NULL, n)
#define md_shiftcom2d(x, n) md_shiftcomw2d(x, NULL, n)
STRCLS void md_shiftcomw(real * RESTRICT x, const real * RESTRICT w, int n, int d);
/* these two are inline instead macros because they offer type checks */
INLINE void md_shiftcomw2d(rv2_t * RESTRICT x, const real * RESTRICT w, int n) \
  { md_shiftcomw((real *) x, w, n, 2); }
INLINE void md_shiftcomw3d(rv3_t * RESTRICT x, const real * RESTRICT w, int n) \
  { md_shiftcomw((real *) x, w, n, 3); }

STRCLS void md_shiftang2d(rv2_t * RESTRICT x, rv2_t * RESTRICT v, int n);
STRCLS void md_shiftang3d(rv3_t * RESTRICT x, rv3_t * RESTRICT v, int n);
INLINE void md_shiftang(real * RESTRICT x, real * RESTRICT v, int n, int d)
{
  if (d == 2) md_shiftang2d((rv2_t *)x, (rv2_t *)v, n);
  else md_shiftang3d((rv3_t *)x, (rv3_t *)v, n);
}

STRCLS real md_ekin(const real *v, int nd, int dof, real * RESTRICT tkin);
INLINE real md_ekin2d(rv3_t * RESTRICT v, int n, int dof, real * RESTRICT tkin) \
  { return md_ekin((const real *) v, n*2, dof, tkin); }
INLINE real md_ekin3d(rv3_t * RESTRICT v, int n, int dof, real * RESTRICT tkin) \
  { return md_ekin((const real *) v, n*3, dof, tkin); }

STRCLS void md_vrescale(real * RESTRICT v, int nd, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin);
INLINE void md_vrescale2d(rv3_t * RESTRICT v, int n, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin) \
    { md_vrescale((real *) v, n*2, dof, tp, dt, ekin, tkin); }
INLINE void md_vrescale3d(rv3_t * RESTRICT v, int n, int dof, real tp, real dt, real * RESTRICT ekin, real * RESTRICT tkin) \
    { md_vrescale((real *) v, n*3, dof, tp, dt, ekin, tkin); }


/* shift the center of mass to zero */
void md_shiftcomw(real * RESTRICT x, const real * RESTRICT w, int n, int d)
{
  int i, j;
  real rc, wtot = 0;

  if (w) for (i = 0; i < n; i++) wtot += w[i];
  else wtot = (real) n;
  for (j = 0; j < d; j++) {
    for (rc = 0, i = 0; i < n; i++)
      rc += x[i*d+j]*(w ? w[i] : 1.f);
    rc /= wtot;
    for (i = 0; i < n; i++)
      x[i*d+j] -= rc;
  }
}

/* annihilate angular momentum 2d */
void md_shiftang2d(rv2_t * RESTRICT x, rv2_t * RESTRICT v, int n)
{
  int i;
  real am, r2, xc[2] = {0,0}, xi[2];

  for (i = 0; i < n; i++) rv2_inc(xc, x[i]);
  rv2_smul(xc, 1.f/n);
  for (am = r2 = 0.f, i = 0; i < n; i++) {
    rv2_diff(xi, x[i], xc);
    am += rv2_cross(xi, v[i]);
    r2 += rv2_sqr(x[i]);
  }
  am = -am/r2;
  for (i = 0; i < n; i++) {
    rv2_diff(xi, x[i], xc);
    v[i][0] += -am*xi[1];
    v[i][1] +=  am*xi[0];
  }
}

/* annihilate angular momentum 3d
 * solve
 *   /  y^2 + z^2    -x y      -x y      \
 *   |  -x y       X^2 + z^2   -y z      |  c  =  I
 *   \  -x z         -y z     x^2 + y^2  /
 * use a velocity field
 *    v = c X r
 *   */

void md_shiftang3d(rv3_t *x, rv3_t *v, int n)
{
  int i;
  real xc[3] = {0,0,0}, xi[3], ang[3], am[3] = {0,0,0}, dv[3], mat[3][3], inv[3][3];
  real xx = 0, yy = 0, zz = 0, xy = 0, zx = 0, yz = 0;

  for (i = 0; i < n; i++) rv3_inc(xc, x[i]);
  rv3_smul(xc, 1.f/n);
  for (i = 0; i < n; i++) {
    rv3_diff(xi, x[i], xc);
    rv3_cross(ang, xi, v[i]);
    rv3_inc(am, ang);
    xx += xi[0]*xi[0];
    yy += xi[1]*xi[1];
    zz += xi[2]*xi[2];
    xy += xi[0]*xi[1];
    yz += xi[1]*xi[2];
    zx += xi[2]*xi[0];
  }
  mat[0][0] = yy+zz;
  mat[1][1] = xx+zz;
  mat[2][2] = xx+yy;
  mat[0][1] = mat[1][0] = -xy;
  mat[1][2] = mat[2][1] = -yz;
  mat[0][2] = mat[2][0] = -zx;
  rm3_inv(inv, mat);
  ang[0] = -rv3_dot(inv[0], am);
  ang[1] = -rv3_dot(inv[1], am);
  ang[2] = -rv3_dot(inv[2], am);
  /* ang is the solution of M^(-1) * I */
  for (i = 0; i < n; i++) {
    rv3_diff(xi, x[i], xc);
    rv3_cross(dv, ang, xi);
    rv3_inc(v[i], dv);
  }
}

/* return kinetic energy */
real md_ekin(const real *v, int nd, int dof, real *tkin)
{
  int i;
  real ekin;

  for (ekin = 0, i = 0; i < nd; i++) ekin += v[i]*v[i];
  if (tkin) *tkin = ekin/dof;
  return ekin *= .5f;
}

/* velocity rescaling thermostat */
void md_vrescale(real *v, int nd, int dof, real tp, real dt, real *ekin, real *tkin)
{
  int i;
  real ekav = .5f*tp*dof, ek1 = *ekin, ek2, s;
  double amp;

  amp = 2*sqrt(ek1*ekav*dt/dof);
  ek2 = ek1 + (ekav - ek1)*dt + (real)(amp*grand0());
  if (ek2 < 0) ek2 = 0;
  s = (real)sqrt(ek2/ek1);
  for (i = 0; i < nd; i++)
    v[i] *= s;
  *ekin = ek2;
  if (tkin) *tkin *= s*s;
}

#endif /* ZCOM_MD__ */
#endif /* ZCOM_MD */

#ifdef  ZCOM_LJ
#ifndef ZCOM_LJ__
#define ZCOM_LJ__

typedef struct {
  int d; /* dimension = 3 */
  int n; /* number of particles */
  int dof; /* degrees of freedom */
  real rho;
  real tp;
  real l, vol; /* side length and volume */
  real rc, rcdef; /* real / preferred rc */

  real * RESTRICT x; /* reduced unit (0, 1) */
  real * RESTRICT v, * RESTRICT f;
  real epot, epots; /* potential energy and shifted potential energy */
  real ekin, tkin, etot;
  real vir, p; /* virial and pressure */
  real epot_shift, epot_tail, p_tail;
  double t;
} lj_t;

STRCLS lj_t *lj_open(int n, int d, real rho, real rcdef, real tp);
STRCLS void lj_close(lj_t *lj);
STRCLS void lj_force(lj_t *lj);
STRCLS void lj_vv(lj_t *lj, real dt);

INLINE void lj_vrescale(lj_t *lj, real thermdt)
 { md_vrescale(lj->v, lj->n*lj->d, lj->dof, lj->tp, thermdt, &lj->ekin, &lj->tkin); }


/* set density and compute tail corrections */
static void lj_setrho(lj_t *lj, real rho)
{
  double irc, irc3, irc6;
  int i;

  lj->rho = rho;
  lj->l = (real) pow(1.*lj->n/rho, 1./lj->d);
  for (lj->vol = 1.f, i = 0; i < lj->d; i++) lj->vol *= lj->l;
  if ((lj->rc = lj->rcdef) > lj->l*.5) lj->rc = lj->l*.5;
  irc = 1.0/lj->rc;
  irc3 = irc*irc*irc; irc6 = irc3*irc3;
  lj->epot_shift = 4*irc6*(irc6-1);
  if (lj->d == 3) {
    lj->epot_tail = (real)( 8*M_PI*rho*lj->n/9*(irc6 - 3)*irc3 );
    lj->p_tail = (real)( 32*M_PI*rho*rho/9*(irc6 - 1.5)*irc3 );
  } else if (lj->d == 2) {
    lj->epot_tail = (real) (M_PI*rho*lj->n*(.4*irc6 - 1)*irc3*irc);
    lj->p_tail = (real) (M_PI*rho*rho*(1.6*irc6 - 2)*irc3*irc);
  }
}

/* initialize a fcc lattice */
static void lj_initfcc2d(lj_t *lj)
{
  int i, j, id, n1, n = lj->n;
  real a;

  n1 = (int) (pow(2*n, 1.0/lj->d) + .999999); /* # of particles per side */
  a = 1./n1;
  for (id = 0, i = 0; i < n1 && id < n; i++)
    for (j = 0; j < n1 && id < n; j++) {
      if ((i+j) % 2 != 0) continue;
      lj->x[id*2 + 0] = (i+.5)*a;
      lj->x[id*2 + 1] = (j+.5)*a;
      id++;
    }
}

/* initialize a fcc lattice */
static void lj_initfcc3d(lj_t *lj)
{
  int i, j, k, id, n1, n = lj->n;
  real a;

  n1 = (int) (pow(2*n, 1.0/lj->d) + .999999); /* # of particles per side */
  a = 1./n1;
  for (id = 0, i = 0; i < n1 && id < n; i++)
    for (j = 0; j < n1 && id < n; j++)
      for (k = 0; k < n1 && id < n; k++) {
        if ((i+j+k) % 2 != 0) continue;
        lj->x[id*3 + 0] = (i+.5)*a;
        lj->x[id*3 + 1] = (j+.5)*a;
        lj->x[id*3 + 2] = (k+.5)*a;
        id++;
      }
}

/* create an open structure */
lj_t *lj_open(int n, int d, real rho, real rcdef, real tp)
{
  lj_t *lj;
  int i;

  xnew(lj, 1);
  lj->n = n;
  lj->d = d;
  lj->dof = n*d - d*(d+1)/2;
  lj->tp = tp;
  xnew(lj->f, n*d);
  xnew(lj->v, n*d);
  xnew(lj->x, n*d);

  lj->rcdef = rcdef;
  lj_setrho(lj, rho);

  if (lj->d == 3) lj_initfcc3d(lj); else lj_initfcc2d(lj);

  /* init. random velocities */
  for (i = 0; i < n*d; i++) lj->v[i] = rnd0() - .5;

  md_shiftcom(lj->x, n, d);
  md_shiftang(lj->x, lj->v, n, d);
  return lj;
}

void lj_close(lj_t *lj)
{
  free(lj->x);
  free(lj->v);
  free(lj->f);
  free(lj);
}

INLINE real lj_pbc(real x, real L)
  { return (real)((x - ((int)((x)+1000.5) - 1000))*L) ; }

INLINE real *lj_vpbc2d(real *v, real L)
  { v[0] = lj_pbc(v[0], L); v[1] = lj_pbc(v[1], L); return v; }

INLINE real lj_pbcdist2_2d(real * RESTRICT dx, const real * RESTRICT a, const real * RESTRICT b, real L)
  { return rv2_sqr(lj_vpbc2d(rv3_diff(dx, a, b), L)); }

INLINE real *lj_vpbc3d(real *v, real L)
  { v[0] = lj_pbc(v[0], L); v[1] = lj_pbc(v[1], L); v[2] = lj_pbc(v[2], L); return v; }

INLINE real lj_pbcdist2_3d(real * RESTRICT dx, const real * RESTRICT a, const real * RESTRICT b, real L)
  { return rv3_sqr(lj_vpbc3d(rv3_diff(dx, a, b), L)); }

static void lj_force2d(lj_t *lj)
{
  real dx[2], fi[2], dr2, dr6, fs, tmp, U, Vir, L = lj->l, rc2 = lj->rc*lj->rc;
  int i, j, d, prcnt = 0, n = lj->n;

  for (i = 0; i < lj->n*2; i++) lj->f[i] = 0;
  for (U = Vir = 0, i = 0; i < n - 1; i++) {
    fi[0] = fi[1] = 0;
    for (j = i+1; j < n; j++) {
      dr2 = lj_pbcdist2_2d(dx, lj->x+i*2, lj->x+j*2, L);
      if (dr2 > rc2) continue;
      dr2 = 1.f/dr2;
      dr6 = dr2*dr2*dr2;
      fs = dr6*(48.f*dr6-24.f); /* f.r */
      Vir += fs;
      fs *= dr2;
      for (d = 0; d < 2; d++) {
        tmp = dx[d]*fs;
        fi[d] += tmp;
        lj->f[j*2+d] -= tmp;
      }
      U += 4*dr6*(dr6-1);
      prcnt++;
    }
    lj->f[i*2+0] += fi[0];
    lj->f[i*2+1] += fi[1];
  }
  lj->epots = U - prcnt*lj->epot_shift; /* shifted energy */
  lj->epot += lj->epot_tail; /* unshifted energy */
  lj->vir = Vir;
  lj->p = lj->rho*lj->tp+lj->vir/(lj->d*lj->vol)+lj->p_tail;
}

/* compute force and virial, return energy */
static void lj_force3d(lj_t *lj)
{
  real dx[3], fi[3], dr2, dr6, fs, tmp, U, Vir, L = lj->l, rc2 = lj->rc*lj->rc;
  int i, j, d, prcnt = 0, n = lj->n;
  real * RESTRICT f = lj->f, * RESTRICT x = lj->x;

  for (i = 0; i < lj->n*3; i++) f[i] = 0;
  for (U = Vir = 0, i = 0; i < n - 1; i++) {
    fi[0] = 0; fi[1] = 0; fi[2] = 0;
    for (j = i+1; j < n; j++) {
      dr2 = lj_pbcdist2_3d(dx, x+i*3, x+j*3, L);
      if (dr2 > rc2) continue;
      dr2 = 1.f/dr2;
      dr6 = dr2*dr2*dr2;
      fs = dr6*(48.f*dr6-24.f); /* f.r */
      Vir += fs;
      fs *= dr2;
      for (d = 0; d < 3; d++) {
        tmp = dx[d]*fs;
        fi[d] += tmp;
        f[j*3+d] -= tmp;
      }
      U += 4*dr6*(dr6-1);
      prcnt++;
    }
    f[i*3+0] += fi[0];
    f[i*3+1] += fi[1];
    f[i*3+2] += fi[2];
  }
  lj->epots = U - prcnt*lj->epot_shift; /* shifted energy */
  lj->epot =  U + lj->epot_tail; /* unshifted energy */
  lj->vir = Vir;
  lj->p = lj->rho*lj->tp+lj->vir/(lj->d*lj->vol)+lj->p_tail;
}

void lj_force(lj_t *lj)
{
  if (lj->d == 3) lj_force3d(lj);
  else if (lj->d == 2) lj_force2d(lj);
}

/* velocity verlet */
void lj_vv(lj_t *lj, real dt)
{
  int i, nd = lj->n*lj->d;
  real dth = dt*.5f, dtl = dt/lj->l;

  for (i = 0; i < nd; i++) { /* VV part 1 */
    lj->v[i] += lj->f[i]*dth;
    lj->x[i] += lj->v[i]*dtl;
  }
  lj_force(lj); /* calculate the new force */
  for (i = 0; i < nd; i++) /* VV part 2 */
    lj->v[i] += lj->f[i]*dth;

  lj->ekin = md_ekin(lj->v, nd, lj->dof, &lj->tkin);
  lj->t += dt;
}

#endif /* ZCOM_LJ__ */
#endif /* ZCOM_LJ */

#ifdef  ZCOM_ABPRO
#ifndef ZCOM_ABPRO__
#define ZCOM_ABPRO__
/* AB beads protein models */

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <math.h>


typedef struct {
  int d; /* dimension */
  int model; /* 1 or 2 */
  int seqid; /* model sequence id */
  int n; /* number of atoms */
  int dof; /* number of degrees of freedom */
  real clj[2][2], sla, slb;
  int *type; /* 0: A, 1: B */
  real *x, *x1, *dx;
  real *v;
  real *f;
  real *lmx, *xmin;
  real emin, epot, ekin, tkin;
  double t;
} abpro_t;

#define AB_VERBOSE    0x0001
#define AB_SOFTFORCE  0x0010
#define AB_MILCSHAKE  0x0020
#define AB_LMREGISTER 0x0100
#define AB_LMWRITE    0x0200

STRCLS abpro_t *ab_open(int seqid, int d, int model, real randdev);
STRCLS void ab_close(abpro_t *ab);

STRCLS int ab_checkconn(abpro_t *ab, const real *x, double tol);
#define ab_shiftcom(ab, x)      md_shiftcom(x, ab->n, ab->d)
#define ab_shiftang(ab, x, v)   md_shiftang(x, v, ab->n, ab->d)
STRCLS void ab_rmcom(abpro_t *ab, real *x, real *v);

STRCLS int ab_writepos(abpro_t *ab, const real *x, const real *v, const char *fname);
STRCLS int ab_readpos(abpro_t *ab, real *x, real *v, const char *fname);
STRCLS int ab_initpos(abpro_t *ab, real *x, real randev);

STRCLS int ab_shake(abpro_t *ab, const real *x0, real *x1, real *v, real dt,
    int itmax, double tol, int verbose);
STRCLS int ab_rattle(abpro_t *ab, const real *x0, real *v,
    int itmax, double tol, int verbose);
STRCLS int ab_milcshake(abpro_t *ab, const real *x0, real *x1, real *v, real dt,
    int itmax, double tol, int verbose);
STRCLS int ab_milcrattle(abpro_t *ab, const real *x0, real *v);

STRCLS real ab_localmin(abpro_t *ab, const real *r, int itmax, double tol,
    int sh_itmax, double sh_tol, unsigned flags);
STRCLS real ab_energy(abpro_t *ab, const real *r, int soft);
STRCLS real ab_force(abpro_t *ab, real *f, const real *r, int soft);

INLINE real ab_ekin(abpro_t *ab)
  { return ab->ekin = md_ekin(ab->v, ab->n*ab->d, ab->dof, &ab->tkin); }
INLINE void ab_vrescale(abpro_t *ab, real tp, real dt)
  { md_vrescale(ab->v, ab->n*ab->d, ab->dof, tp, dt, &ab->ekin, &ab->tkin); }
STRCLS int ab_vv(abpro_t *ab, real fscal, real dt, unsigned flags);
STRCLS int ab_brownian(abpro_t *ab, real T, real fscal, real dt, unsigned flags);


/* initialization
 * seqid: 8: 34, 9: 55, 10: 89*/
abpro_t *ab_open(int seqid, int d, int model, real randdev)
{
  abpro_t *ab;
  int i, nd;
  double x;
  const int verbose = 0;

  die_if (d == 2 && model != 1, "%dd only for model 1", d);
  die_if (seqid < 0, "bad seqid %d\n", seqid);

  xnew(ab, 1);
  ab->d = d;

  ab->model = model;
  if (model == 1) {
    ab->clj[0][0] = 1; ab->clj[1][1] = .5f;
    ab->clj[0][1] = ab->clj[1][0] = -.5f;
    ab->sla = ab->slb = 24;
  } else {
    ab->clj[0][0] = 1;
    ab->clj[0][1] = ab->clj[1][0] = ab->clj[1][1] = .5f;
    ab->sla = ab->slb = 24;
  }

  ab->seqid = seqid;
  /* determine # of atoms */
  x = pow(.5*(sqrt(5.) + 1), seqid + 1);
  i = (seqid % 2) ? (-1) : 1;
  ab->n = (int)( (x + i/x)/sqrt(5.) + .5);

  /* construct sequence */
  xnew(ab->type, ab->n);
  if (seqid < 2) {
    ab->type[0] = seqid;
  } else {
    int *s[2], sl[2], who, j;
    xnew(s[0], ab->n);
    xnew(s[1], ab->n);
    s[0][0] = 0; sl[0] = 1;
    s[1][0] = 1; sl[1] = 1;
    for (who = 0, i = 2; i <= seqid; i++, who = !who) {
      /* s[who] += s[!who]; */
      die_if (sl[0] + sl[1] > ab->n, "combined length > %d\n", ab->n);
      for (j = 0; j < sl[!who]; j++)
        s[who][sl[who] + j] = s[!who][j];
      sl[who] += sl[!who];
    }
    for (who = !who, j = 0; j < ab->n; j++) {
     ab->type[j] = s[who][j];
    }
    free(s[0]);
    free(s[1]);
  }

  /* number of degrees of freedom */
  ab->dof = (ab->d == 2) ? (ab->n - 2) : (2*ab->n - 5);
  if (verbose) {
    printf("n = %3d, d = %d, dof = %3d: ", ab->n, ab->d, ab->dof);
    for (i = 0; i < ab->n; i++)
      printf("%c", ab->type[i]+'A');
    printf("\n");
  }

  nd = ab->n * ab->d;
  xnew(ab->x, nd);
  xnew(ab->x1, nd);
  xnew(ab->v, nd);
  xnew(ab->f, nd);
  xnew(ab->dx, nd);
  xnew(ab->lmx, nd);
  xnew(ab->xmin, nd);

  ab_initpos(ab, ab->x, randdev);
  ab->emin = ab->epot = ab_force(ab, ab->f, ab->x, 0);
  return ab;
}

/* initialize an almost straight chain,
 * randomness given by del */
int ab_initpos(abpro_t *ab, real *x, real del)
{
  int i, j;
  real dx[3];

  for (j = 0; j < ab->d; j++) ab->x[j] = 0;
  for (i = 0; i < ab->n - 1; i++) {
    for (j = 0; j < ab->d; j++) {
      dx[j] = (2.f*rand()/RAND_MAX - 1)*del + ((j == 0) ? 1.f : 0.f);
    }
    if (ab->d == 3) {
      rv3_normalize(dx);
      rv3_add(x + (i+1)*ab->d, x + i*ab->d, dx);
    } else {
      rv2_normalize(dx);
      rv2_add(x + (i+1)*ab->d, x + i*ab->d, dx);
    }
  }
  ab_shiftcom(ab, x);
  die_if (ab_checkconn(ab, x, 0) != 0, "initpos failed, with del = %g\n", del);
  return 0;
}

/* close ab */
void ab_close(abpro_t *ab)
{
  if (ab) {
    ab_milcshake(ab, NULL, NULL, NULL, 0, 0, 0, 0);
    ab_milcrattle(ab, NULL, NULL);
    ab_localmin(ab, NULL, 0, 0., 0, 0., 0);
    free(ab->type);
    free(ab->x);
    free(ab->x1);
    free(ab->dx);
    free(ab->v);
    free(ab->f);
    free(ab->lmx);
    free(ab->xmin);
    free(ab);
  }
}

/* check connectivity */
int ab_checkconn(abpro_t *ab, const real *x, double tol)
{
  int i, d = ab->d;
  real r;

  if (tol <= 0.) tol = 1e-3;
  for (i = 0; i < ab->n-1; i++) {
    if (d == 3) {
      r = rv3_dist(x + i*3, x + (i+1)*3);
    } else {
      r = rv2_dist(x + i*2, x + (i+1)*2);
    }
    if (fabs(r-1) > tol) {
      fprintf(stderr, "link (%d,%d) is broken, r = %g\n", i, i+1, r);
      return 1;
    }
  }
  return 0;
}

/* shift center of x to the origin,
 * remove center velocity and angular momentum */
void ab_rmcom(abpro_t *ab, real *x, real *v)
{
  ab_shiftcom(ab, x);
  ab_shiftcom(ab, v);
  ab_shiftang(ab, x, v); /* remove angular momentum */
}

/* write position file (which may include velocity) */
int ab_writepos(abpro_t *ab, const real *x, const real *v, const char *fname)
{
  FILE *fp;
  int i, j, d = ab->d, n = ab->n;

  if (fname == NULL) fname = "ab.pos";
  if ((fp = fopen(fname, "w")) == 0) {
    fprintf(stderr, "cannot open file [%s]\n", fname);
    return 1;
  }

  fprintf(fp, "# %d %d %d %d %d\n", d, ab->model, ab->seqid, ab->n, (v != NULL));
  for (i = 0; i < n; i++) {
    for (j = 0; j < d; j++) fprintf(fp, "%16.14f ", x[i*d+j]);
    if (v)
      for (j = 0; j < d; j++) fprintf(fp, "%16.14f ", v[i*d+j]);
    fprintf(fp, "%d ", ab->type[i]);
    fprintf(fp, "\n");
  }
  fclose(fp);
  return 0;
}

/* read position file (which may include velocity) */
int ab_readpos(abpro_t *ab, real *x, real *v, const char *fname)
{
  char s[1024], *p;
  FILE *fp;
  int i, j, seq, hasv = 0, next, d = ab->d, n = ab->n;
  const char *fmt;
  real vtmp[3], *vi;

  if (fname == NULL) fname = "ab.pos";
  if ((fp = fopen(fname, "r")) == 0) {
    fprintf(stderr, "cannot open file [%s]\n", fname);
    return 1;
  }

  if (fgets(s, sizeof s, fp) == NULL || s[0] != '#') {
    fprintf(stderr, "Warning: %s has no information line\n", fname);
    rewind(fp);
  } else {
    if (5 != sscanf(s+1, "%d%d%d%d%d", &i, &j, &seq, &next, &hasv)
       || i != d || j != ab->model || seq != ab->seqid || next != n) {
      fprintf(stderr, "first line is corrupted:\n%s", s);
      goto ERR;
    }
  }

  if (sizeof(double) == sizeof(real))
    fmt = "%lf%n";
  else
    fmt = "%f%n";
  for (i = 0; i < n; i++) {
    fgets(s, sizeof s, fp);
    if (strlen(s) < 10) goto ERR;
    for (p = s, j = 0; j < d; j++, p += next) {
      if (1 != sscanf(p, fmt, x+i*d+j, &next)) {
        fprintf(stderr, "cannot read i = %d, j = %d\n", i, j);
        goto ERR;
      }
    }
    if (hasv) {
      vi = (v != NULL) ? (v + i*d) : vtmp;
      for (j = 0; j < d; j++, p += next) {
        if (1 != sscanf(p, fmt, vi+j, &next)) {
          fprintf(stderr, "cannot read i = %d, j = %d\n", i, j);
          goto ERR;
        }
      }
    }
    if (1 != sscanf(p, "%d", &j) || j != ab->type[i]) {
      fprintf(stderr, "bad type on i = %d, j = %d\n", i, j);
      goto ERR;
    }
  }
  fclose(fp);
  return 0;

ERR:
  fprintf(stderr, "position file [%s] appears to be broken on line %d!\n%s\n", fname, i, s);
  fclose(fp);
  return 1;
}

static int ab_shake2d(abpro_t *ab, crv2_t *x0, rv2_t *x1, rv2_t *v, real dt,
    int itmax, double tol, int verbose)
{
  int i, again, it, n = ab->n;
  real dxi[2], g, r2, r2bad;
  rv2_t *dx0 = (rv2_t *)ab->dx;
  const real glow = .5, r2max = 4.0;

  for (i = 0; i < n-1; i++)
    rv2_diff(dx0[i], x0[i+1], x0[i]);

  for (it = 0; it < itmax; it++) {
    for (again = 0, i = 0; i < n-1; i++) {
      r2 = rv2_sqr(rv2_diff(dxi, x1[i+1], x1[i]));
      if (r2 > r2max) { /* too large, impossible to correct */
        if (verbose)
          fprintf(stderr, "shake: large distance %d-%d, %g\n", i,i+1, sqrt(r2));
        r2 = r2max;
      }

      if (fabs(r2-1) > tol) {
        if (!again) { again = 1; r2bad = r2; }

        g = rv2_dot(dxi, dx0[i]);
        if (fabs(g) < glow) { /* inner product too small */
          if (verbose)
            fprintf(stderr, "shake: bad alignment %d-%d, %g, dot = %g\n", i,i+1, sqrt(r2), g);
          g = (g > 0) ? glow : -glow;
        }
        g = (1-r2)/(4*g);
        rv2_sinc(x1[i],   dx0[i], -g);
        rv2_sinc(x1[i+1], dx0[i],  g);
        if (v) { /* add a force of dx/dt */
          rv2_sinc(v[i],   dx0[i], -g/dt);
          rv2_sinc(v[i+1], dx0[i],  g/dt);
        }
      }
    }
    if (!again) break;
  }
  if (it >= itmax) {
    if (verbose) {
      const char *fnf = "shakefail.pos";
      fprintf(stderr, "shake: failed after %d iter. r = 1%+g, see %s\n", it, sqrt(r2bad)-1, fnf);
      ab_writepos(ab, (real *)x1, NULL, fnf);
    }
    return -1;
  }
  return 0;
}

static int ab_shake3d(abpro_t *ab, crv3_t *x0, rv3_t *x1, rv3_t *v, real dt,
    int itmax, double tol, int verbose)
{
  int i, again, it, n = ab->n;
  real dxi[3], g, r2, r2bad;
  rv3_t *dx0 = (rv3_t *)ab->dx;
  const real glow = .5, r2max = 4.0;

  for (i = 0; i < n-1; i++)
    rv3_diff(dx0[i], x0[i+1], x0[i]);

  for (it = 0; it < itmax; it++) {
    for (again = 0, i = 0; i < n-1; i++) {
      r2 = rv3_sqr(rv3_diff(dxi, x1[i+1], x1[i]));
      if (r2 > r2max) { /* too large, impossible to correct */
        if (verbose)
          fprintf(stderr, "shake: large distance %d-%d, %g\n", i,i+1, sqrt(r2));
        r2 = r2max;
      }

      if (fabs(r2-1) > tol) {
        if (!again) { again = 1; r2bad = r2; }

        g = rv3_dot(dxi, dx0[i]);
        if (fabs(g) < glow) { /* inner product too small */
          if (verbose)
            fprintf(stderr, "shake: bad alignment %d-%d, %g, dot = %g\n", i,i+1, sqrt(r2), g);
          g = (g > 0) ? glow : -glow;
        }
        g = (1-r2)/(4*g);
        rv3_sinc(x1[i],   dx0[i], -g);
        rv3_sinc(x1[i+1], dx0[i],  g);
        if (v) { /* add a force of dx/dt */
          rv3_sinc(v[i],   dx0[i], -g/dt);
          rv3_sinc(v[i+1], dx0[i],  g/dt);
        }
      }
    }
    if (!again) break;
  }
  if (it >= itmax) {
    if (verbose) {
      const char *fnf = "shakefail.pos";
      fprintf(stderr, "shake: failed after %d iter. r = 1%+g, see %s\n", it, sqrt(r2bad)-1, fnf);
      ab_writepos(ab, (real *)x1, NULL, fnf);
    }
    return -1;
  }
  return 0;
}

/* shake x1 according to x0 */
int ab_shake(abpro_t *ab, const real *x0, real *x1, real *v, real dt,
    int itmax, double tol, int verbose)
{
  if (itmax <= 0) itmax = 3000;
  if (tol <= 0.) tol = (sizeof(real) == sizeof(double)) ? 1e-6 : 1e-4;
  return (ab->d == 3) ?
    ab_shake3d(ab, (crv3_t *)x0, (rv3_t *)x1, (rv3_t *)v, dt, itmax, tol, verbose) :
    ab_shake2d(ab, (crv2_t *)x0, (rv2_t *)x1, (rv2_t *)v, dt, itmax, tol, verbose);
}

static int ab_rattle2d(abpro_t *ab, crv2_t *x0, rv2_t *v,
    int itmax, double tol, int verbose)
{
  int i, again, it, n = ab->n;
  real dv[2], g, rvbad;
  rv2_t *dx = (rv2_t *)ab->dx;

  for (i = 0; i < n-1; i++)
    rv2_diff(dx[i], x0[i+1], x0[i]);

  for (it = 0; it < itmax; it++) {
    for (again = 0, i = 0; i < n-1; i++) {
      rv2_diff(dv, v[i+1], v[i]);
      g = .5f * rv2_dot(dx[i], dv);
      if (fabs(g) > tol) {
        if (!again) { again = 1; rvbad = g; }
        rv2_sinc(v[i],   dx[i],  g);
        rv2_sinc(v[i+1], dx[i], -g);
      }
    }
    if (!again) break;
  }
  if (it >= itmax) {
    if (verbose) {
      const char *fnf = "rattlefail.pos";
      fprintf(stderr, "rattle: failed after %d iter. rv = %+g, see %s\n", it, rvbad, fnf);
      ab_writepos(ab, (real *)x0, (real *)v, fnf);
    }
    return -1;
  }
  return 0;
}

static int ab_rattle3d(abpro_t *ab, crv3_t *x0, rv3_t *v,
    int itmax, double tol, int verbose)
{
  int i, again, it, n = ab->n;
  real dv[3], g, rvbad;
  rv3_t *dx = (rv3_t *)ab->dx;

  for (i = 0; i < n-1; i++)
    rv3_diff(dx[i], x0[i+1], x0[i]);

  for (it = 0; it < itmax; it++) {
    for (again = 0, i = 0; i < n-1; i++) {
      rv3_diff(dv, v[i+1], v[i]);
      g = .5f * rv3_dot(dx[i], dv);
      if (fabs(g) > tol) {
        if (!again) { again = 1; rvbad = g; }
        rv3_sinc(v[i],   dx[i],  g);
        rv3_sinc(v[i+1], dx[i], -g);
      }
    }
    if (!again) break;
  }
  if (it >= itmax) {
    if (verbose) {
      const char *fnf = "rattlefail.pos";
      fprintf(stderr, "rattle: failed after %d iter. rv = %+g, see %s\n", it, rvbad, fnf);
      ab_writepos(ab, (real *)x0, (real *)v, fnf);
    }
    return -1;
  }
  return 0;
}

/* shake v according to x0 */
int ab_rattle(abpro_t *ab, const real *x0, real *v, int itmax, double tol, int verbose)
{
  if (itmax <= 0) itmax = 3000;
  if (tol <= 0.) tol = 1e-4;
  return (ab->d == 3) ?
    ab_rattle3d(ab, (crv3_t *)x0, (rv3_t *)v, itmax, tol, verbose) :
    ab_rattle2d(ab, (crv2_t *)x0, (rv2_t *)v, itmax, tol, verbose);
}

static int ab_milcshake2d(abpro_t *ab, crv2_t *x0, rv2_t *x1, rv2_t *v, real dt,
    int itmax, double tol, int verbose)
{
  int i, again, it, n = ab->n, nl;
  static real *dl, *dm, *du, *lam, *rhs;
  static rv2_t *x, *dx0, *dx1;
  real y;

  if (dl == NULL) {
    if (x0 == NULL) return 0;
    xnew(dl, n); xnew(dm, n); xnew(du, n);
    xnew(lam, n); xnew(rhs, n);
    xnew(dx0, n); xnew(dx1, n); xnew(x, n);
  } else if (x0 == NULL) {
    free(dl); free(dm); free(du);
    free(lam); free(rhs);
    free(dx0); free(dx1); free(x);
    return 0;
  }

  nl = n - 1;
  for (i = 0; i < nl; i++) {
    rv2_diff(dx1[i], x1[i], x1[i+1]);
  }
  for (i = 0; i < nl; i++) {
    rv2_diff(dx0[i], x0[i], x0[i+1]);
  }

  /* dm[0..nl-1], du[0..nl-2], dl[1..nl-1] */
  dm[0] =  4*rv2_dot(dx0[0], dx1[0]);
  du[0] = -2*rv2_dot(dx0[1], dx1[0]);
  for (i = 1; i < nl; i++) {
    dl[i] = -2*rv2_dot(dx1[i], dx0[i-1]);
    dm[i] =  4*rv2_dot(dx1[i], dx0[i]);
    du[i] = -2*rv2_dot(dx1[i], dx0[i+1]); /* no dx0[nl], but doesn't matter */
  }
  for (i = 0; i < nl; i++)
    rhs[i] = 1 - rv2_sqr(dx1[i]);

  /* solve matrix equation D lam = rhs
   * first LU decompose D;
   * U --> du with diagonal being unity;
   * L --> dm and dl with dl unchanged */
  if (fabs(dm[0]) < 1e-6) return 1;
  for (i = 1; i < nl; i++) {
    dm[i] -= dl[i] * (du[i-1] /= dm[i-1]);
    if (fabs(dm[i]) < 1e-6) return i+1;
  }

  for (it = 1; it <= itmax; it++) {
    lam[0] = rhs[0]/dm[0];
    for (i = 1; i < nl; i++) /* solving L v = rhs */
      lam[i] = (rhs[i] - dl[i]*lam[i-1])/dm[i];
    for (i = nl - 1; i > 0; i--) /* solving U lam = v */
      lam[i-1] -= du[i-1]*lam[i];

    rv2_ncopy(x, x1, n);
    /* update the new position */
    for (i = 0; i < nl; i++) {
      rv2_sinc(x[i],   dx0[i],  lam[i]);
      rv2_sinc(x[i+1], dx0[i], -lam[i]);
    }

    /* calcualte the maximal error */
    for (again = 0, i = 0; i < nl; i++) {
      y = 1 - rv2_dist2(x[i], x[i+1]);
      if (fabs(y) > tol) again = 1;
      rhs[i] += y;
    }
    if (!again) break;
  }

  rv2_ncopy(x1, x, n);
  if (v != NULL) { /* correct velocities */
    for (i = 0; i < n-1; i++) {
      rv2_sinc(v[i],   dx0[i],  lam[i]/dt);
      rv2_sinc(v[i+1], dx0[i], -lam[i]/dt);
    }
  }

  if (it >= itmax) {
    if (verbose) {
      const char *fnf = "shakefail.pos";
      fprintf(stderr, "milcshake: failed after %d iter. see %s\n", it, fnf);
      ab_writepos(ab, (real *)x1, NULL, fnf);
    }
    return -1;
  }
  return 0;
}

static int ab_milcshake3d(abpro_t *ab, crv3_t *x0, rv3_t *x1, rv3_t *v, real dt,
    int itmax, double tol, int verbose)
{
  int i, again, it, n = ab->n, nl;
  static real *dl, *dm, *du, *lam, *rhs;
  static rv3_t *x, *dx0, *dx1;
  real y;

  if (dl == NULL) {
    if (x0 == NULL) return 0;
    xnew(dl, n); xnew(dm, n); xnew(du, n);
    xnew(lam, n); xnew(rhs, n);
    xnew(dx0, n); xnew(dx1, n); xnew(x, n);
  } else if (x0 == NULL) {
    free(dl); free(dm); free(du);
    free(lam); free(rhs);
    free(dx0); free(dx1); free(x);
    return 0;
  }

  nl = n - 1;
  for (i = 0; i < nl; i++) {
    rv3_diff(dx1[i], x1[i], x1[i+1]);
  }
  for (i = 0; i < nl; i++) {
    rv3_diff(dx0[i], x0[i], x0[i+1]);
  }

  /* dm[0..nl-1], du[0..nl-2], dl[1..nl-1] */
  dm[0] =  4*rv3_dot(dx0[0], dx1[0]);
  du[0] = -2*rv3_dot(dx0[1], dx1[0]);
  for (i = 1; i < nl; i++) {
    dl[i] = -2*rv3_dot(dx1[i], dx0[i-1]);
    dm[i] =  4*rv3_dot(dx1[i], dx0[i]);
    du[i] = -2*rv3_dot(dx1[i], dx0[i+1]); /* no dx0[nl], but doesn't matter */
  }
  for (i = 0; i < nl; i++)
    rhs[i] = 1 - rv3_sqr(dx1[i]);

  /* solve matrix equation D lam = rhs
   * first LU decompose D;
   * U --> du with diagonal being unity;
   * L --> dm and dl with dl unchanged */
  if (fabs(dm[0]) < 1e-6) return 1;
  for (i = 1; i < nl; i++) {
    dm[i] -= dl[i] * (du[i-1] /= dm[i-1]);
    if (fabs(dm[i]) < 1e-6) return i+1;
  }

  for (it = 1; it <= itmax; it++) {
    lam[0] = rhs[0]/dm[0];
    for (i = 1; i < nl; i++) /* solving L v = rhs */
      lam[i] = (rhs[i] - dl[i]*lam[i-1])/dm[i];
    for (i = nl - 1; i > 0; i--) /* solving U lam = v */
      lam[i-1] -= du[i-1]*lam[i];

    rv3_ncopy(x, x1, n);
    /* update the new position */
    for (i = 0; i < nl; i++) {
      rv3_sinc(x[i],   dx0[i],  lam[i]);
      rv3_sinc(x[i+1], dx0[i], -lam[i]);
    }

    /* calcualte the maximal error */
    for (again = 0, i = 0; i < nl; i++) {
      y = 1 - rv3_dist2(x[i], x[i+1]);
      if (fabs(y) > tol) again = 1;
      rhs[i] += y;
    }
    if (!again) break;
  }

  rv3_ncopy(x1, x, n);
  if (v != NULL) { /* correct velocities */
    for (i = 0; i < n-1; i++) {
      rv3_sinc(v[i],   dx0[i],  lam[i]/dt);
      rv3_sinc(v[i+1], dx0[i], -lam[i]/dt);
    }
  }

  if (it >= itmax) {
    if (verbose) {
      const char *fnf = "shakefail.pos";
      fprintf(stderr, "milcshake: failed after %d iter. see %s\n", it, fnf);
      ab_writepos(ab, (real *)x1, NULL, fnf);
    }
    return -1;
  }
  return 0;
}

/* MILC shake, make |dr| = 1
 * for a random config., about 30~40% faster than shake
 * but slower than shake for near-minimum config.  */
int ab_milcshake(abpro_t *ab, const real *x0, real *x1, real *v, real dt,
    int itmax, double tol, int verbose)
{
  if (itmax <= 0) itmax = 3000;
  if (tol <= 0.) tol = (sizeof(real) == sizeof(double)) ? 1e-6 : 1e-4;
  return (ab->d == 3) ?
    ab_milcshake3d(ab, (crv3_t *)x0, (rv3_t *)x1, (rv3_t *)v, dt, itmax, tol, verbose) :
    ab_milcshake2d(ab, (crv2_t *)x0, (rv2_t *)x1, (rv2_t *)v, dt, itmax, tol, verbose);
}

static int ab_milcrattle2d(abpro_t *ab, crv2_t *x, rv2_t *v)
{
  int i, n = ab->n, nl;
  static real *dl, *dm, *du, *lam, *rhs;
  static rv2_t *dx, *dv;

  if (dl == NULL) {
    if (x == NULL) return 0;
    xnew(dl, n); xnew(dm, n); xnew(du, n);
    xnew(lam, n); xnew(rhs, n);
    xnew(dx, n); xnew(dv, n);
  } else if (x == NULL) {
    free(dl); free(dm); free(du);
    free(lam); free(rhs);
    free(dx); free(dv);
    return 0;
  }

  nl = n - 1;
  for (i = 0; i < nl; i++) {
    rv2_diff(dx[i], x[i], x[i+1]);
    rv2_diff(dv[i], v[i], v[i+1]);
  }

  /* dm[0..nl-1], du[0..nl-2], dl[1..nl-1] */
  dm[0] = 2*rv2_dot(dx[0], dx[0]);
  du[0] =  -rv2_dot(dx[1], dx[0]);
  for (i = 1; i < nl; i++) {
    dl[i] =  -rv2_dot(dx[i], dx[i-1]);
    dm[i] = 2*rv2_dot(dx[i], dx[i]);
    du[i] =  -rv2_dot(dx[i], dx[i+1]); /* no dx[nl], but doesn't matter */
  }
  for (i = 0; i < nl; i++)
    rhs[i] = -rv2_dot(dv[i], dx[i]);

  /* solve matrix equation D lam = rhs
   * first LU decompose D;
   * U --> du with diagonal being unity;
   * L --> dm and dl with dl unchanged */
  if (fabs(dm[0]) < 1e-6) return 1;
  for (i = 1; i < nl; i++) {
    dm[i] -= dl[i] * (du[i-1] /= dm[i-1]);
    if (fabs(dm[i]) < 1e-6) return i+1;
  }

  lam[0] = rhs[0]/dm[0];
  for (i = 1; i < nl; i++) /* solving L v = rhs */
    lam[i] = (rhs[i] - dl[i]*lam[i-1])/dm[i];
  for (i = nl - 1; i > 0; i--) /* solving U lam = v */
    lam[i-1] -= du[i-1]*lam[i];

  /* update the new position */
  for (i = 0; i < nl; i++) {
    rv2_sinc(v[i],   dx[i],  lam[i]);
    rv2_sinc(v[i+1], dx[i], -lam[i]);
  }
  return 0;
}

static int ab_milcrattle3d(abpro_t *ab, crv3_t *x, rv3_t *v)
{
  int i, n = ab->n, nl;
  static real *dl, *dm, *du, *lam, *rhs;
  static rv3_t *dx, *dv;

  if (dl == NULL) {
    if (x == NULL) return 0;
    xnew(dl, n); xnew(dm, n); xnew(du, n);
    xnew(lam, n); xnew(rhs, n);
    xnew(dx, n); xnew(dv, n);
  } else if (x == NULL) {
    free(dl); free(dm); free(du);
    free(lam); free(rhs);
    free(dx); free(dv);
    return 0;
  }

  nl = n - 1;
  for (i = 0; i < nl; i++) {
    rv3_diff(dx[i], x[i], x[i+1]);
    rv3_diff(dv[i], v[i], v[i+1]);
  }

  /* dm[0..nl-1], du[0..nl-2], dl[1..nl-1] */
  dm[0] = 2*rv3_dot(dx[0], dx[0]);
  du[0] =  -rv3_dot(dx[1], dx[0]);
  for (i = 1; i < nl; i++) {
    dl[i] =  -rv3_dot(dx[i], dx[i-1]);
    dm[i] = 2*rv3_dot(dx[i], dx[i]);
    du[i] =  -rv3_dot(dx[i], dx[i+1]); /* no dx[nl], but doesn't matter */
  }
  for (i = 0; i < nl; i++)
    rhs[i] = -rv3_dot(dv[i], dx[i]);

  /* solve matrix equation D lam = rhs
   * first LU decompose D;
   * U --> du with diagonal being unity;
   * L --> dm and dl with dl unchanged */
  if (fabs(dm[0]) < 1e-6) return 1;
  for (i = 1; i < nl; i++) {
    dm[i] -= dl[i] * (du[i-1] /= dm[i-1]);
    if (fabs(dm[i]) < 1e-6) return i+1;
  }

  lam[0] = rhs[0]/dm[0];
  for (i = 1; i < nl; i++) /* solving L v = rhs */
    lam[i] = (rhs[i] - dl[i]*lam[i-1])/dm[i];
  for (i = nl - 1; i > 0; i--) /* solving U lam = v */
    lam[i-1] -= du[i-1]*lam[i];

  /* update the new position */
  for (i = 0; i < nl; i++) {
    rv3_sinc(v[i],   dx[i],  lam[i]);
    rv3_sinc(v[i+1], dx[i], -lam[i]);
  }
  return 0;
}

/* MILC rattle, make dr.v = 0 */
int ab_milcrattle(abpro_t *ab, const real *x, real *v)
{
  return (ab->d == 3) ?
    ab_milcrattle3d(ab, (crv3_t *)x, (rv3_t *)v) :
    ab_milcrattle2d(ab, (crv2_t *)x, (rv2_t *)v);
}

static real ab_energy2dm1(abpro_t *ab, crv2_t *r, int soft)
{
  int i, j, n = ab->n;
  real dr, dr2, dr6, U = 0;
  rv2_t *dx = (rv2_t *)ab->dx;

  for (i = 0; i < n - 1; i++)
    rv2_diff(dx[i], r[i+1], r[i]);

  for (i = 0; i < n - 2; i++)
    U += 1.f - rv2_dot(dx[i+1], dx[i]);
  U *= 0.25f;

  for (i = 0; i < n - 2; i++) {
    for (j = i+2; j < n; j++) {
      dr2 = rv2_dist2(r[j], r[i]);
      if (soft && dr2 < 1.f) {
        dr = (real) sqrt(dr2);
        U += (52 - 48*dr) - ab->clj[ab->type[i]][ab->type[j]]*(28 - 24*dr);
      } else {
        dr2 = 1/dr2;
        dr6 = dr2*dr2*dr2;
        U += 4*dr6*(dr6 - ab->clj[ab->type[i]][ab->type[j]]);
      }
    }
  }
  return U;
}

static real ab_energy3dm1(abpro_t *ab, crv3_t *r, int soft)
{
  int i, j, n = ab->n;
  real dr, dr2, dr6, U = 0;
  rv3_t *dx = (rv3_t *)ab->dx;

  for (i = 0; i < n - 1; i++)
    rv3_diff(dx[i], r[i+1], r[i]);

  for (i = 0; i < n - 2; i++)
    U += 1.f - rv3_dot(dx[i+1], dx[i]);
  U *= 0.25f;

  for (i = 0; i < n - 2; i++) {
    for (j = i+2; j < n; j++) {
      dr2 = rv3_dist2(r[j], r[i]);
      if (soft && dr2 < 1.f) {
        dr = (real) sqrt(dr2);
        U += (52 - 48*dr) - ab->clj[ab->type[i]][ab->type[j]]*(28 - 24*dr);
      } else {
        dr2 = 1/dr2;
        dr6 = dr2*dr2*dr2;
        U += 4*dr6*(dr6 - ab->clj[ab->type[i]][ab->type[j]]);
      }
    }
  }
  return U;
}

static real ab_energy3dm2(abpro_t *ab, crv3_t *r, int soft)
{
  int i, j, n = ab->n;
  real dr2, dr6, U = 0;
  rv3_t *dx = (rv3_t *)ab->dx;

  for (i = 0; i < n - 1; i++)
    rv3_diff(dx[i], r[i+1], r[i]);

  for (i = 1; i < n-1; i++)
    U += rv3_dot(dx[i], dx[i-1]);

  for (i = 1; i < n-2; i++)
    U -= .5f * rv3_dot(dx[i+1], dx[i-1]);

  for (i = 0; i < n-2; i++) {
    for (j = i+2; j < n; j++) {
      dr2 = rv3_dist2(r[j], r[i]);
      if (soft && dr2 < 1.f) {
        U += ab->clj[ab->type[i]][ab->type[j]]*(ab->sla - ab->slb*(real)sqrt(dr2));
      } else {
        dr2 = 1.f/dr2;
        dr6 = dr2*dr2*dr2;
        U += 4.f*ab->clj[ab->type[i]][ab->type[j]]*dr6*(dr6 - 1.f);
      }
    }
  }
  return U;
}

real ab_energy(abpro_t *ab, const real *r, int soft)
{
  if (ab->model == 2)
    return ab_energy3dm2(ab, (crv3_t *)r, soft);
  else if (ab->d == 3)
    return ab_energy3dm1(ab, (crv3_t *)r, soft);
  else
    return ab_energy2dm1(ab, (crv2_t *)r, soft);
}

static real ab_force2dm1(abpro_t *ab, rv2_t *f, crv2_t *r, int soft)
{
  int i, j, n = ab->n;
  real c, ff, dr2, dr6, U = 0.f;
  real *dxp, *dxm;
  rv2_t *dx = (rv2_t *)ab->dx, dxi;

  for (i = 0; i < n; i++) rv2_zero(f[i]);

  for (i = 0; i < n - 1; i++)
    rv2_diff(dx[i], r[i+1], r[i]);

  for (i = 1; i < n-1; i++) {
    dxp = dx[i];
    dxm = dx[i-1];
    U += 1.f - rv2_dot(dxp, dxm);
    rv2_sinc(f[i-1], dxp, -.25f);
    rv2_sinc(f[i],   dxp,  .25f);
    rv2_sinc(f[i],   dxm, -.25f);
    rv2_sinc(f[i+1], dxm,  .25f);
  }
  U *= 0.25f;

  for (i = 0; i < n-2; i++) {
    for (j = i+2; j < n; j++) {
      dr2 = rv2_sqr( rv2_diff(dxi, r[j], r[i]) );
      c = ab->clj[ab->type[i]][ab->type[j]];

      if (soft && dr2 < 1.) {
        dr2 = (real) sqrt(dr2);
        U += (52.f - 28.f*c) - 24.f*dr2*(2.f-c);
        ff = 24.f*(2.f - c)/dr2;
      } else {
        dr2 = 1.f/dr2;
        dr6 = dr2*dr2*dr2;
        U += 4.f*dr6*(dr6 - c);
        ff = 24.f*dr2*dr6*(dr6*2.f - c);
      }
      rv2_sinc(f[i], dxi, -ff);
      rv2_sinc(f[j], dxi, +ff);
    }
  }
  return U;
}

static real ab_force3dm1(abpro_t *ab, rv3_t *f, crv3_t *r, int soft)
{
  int i, j, n = ab->n;
  real c, ff, dr2, dr6, U = 0.f;
  real *dxp, *dxm;
  rv3_t *dx = (rv3_t *)ab->dx, dxi;

  for (i = 0; i < n; i++) rv3_zero(f[i]);

  for (i = 0; i < n - 1; i++)
    rv3_diff(dx[i], r[i+1], r[i]);

  for (i = 1; i < n-1; i++) {
    dxp = dx[i];
    dxm = dx[i-1];
    U += 1.f - rv3_dot(dxp, dxm);
    rv3_sinc(f[i-1], dxp, -.25f);
    rv3_sinc(f[i],   dxp,  .25f);
    rv3_sinc(f[i],   dxm, -.25f);
    rv3_sinc(f[i+1], dxm,  .25f);
  }
  U *= 0.25f;

  for (i = 0; i < n-2; i++) {
    for (j = i+2; j < n; j++) {
      dr2 = rv3_sqr( rv3_diff(dxi, r[j], r[i]) );
      c = ab->clj[ab->type[i]][ab->type[j]];

      if (soft && dr2 < 1.) {
        dr2 = (real) sqrt(dr2);
        U += (52.f - 28.f*c) - 24.f*dr2*(2.f-c);
        ff = 24.f*(2.f - c)/dr2;
      } else {
        dr2 = 1.f/dr2;
        dr6 = dr2*dr2*dr2;
        U += 4.f*dr6*(dr6 - c);
        ff = 24.f*dr2*dr6*(dr6*2.f - c);
      }
      rv3_sinc(f[i], dxi, -ff);
      rv3_sinc(f[j], dxi, +ff);
    }
  }
  return U;
}

static real ab_force3dm2(abpro_t *ab, rv3_t *f, crv3_t *r, int soft)
{
  real ff, dr2, dr6, U = 0;
  real *dxm, *dxp, c;
  int i, j, n = ab->n;
  rv3_t *dx = (rv3_t *)ab->dx, dxi;

  for (i = 0; i < n; i++) rv3_zero(f[i]);

  for (i = 0; i < n - 1; i++)
    rv3_diff(dx[i], r[i+1], r[i]);

  for (i = 1; i < n-1; i++) {
    dxp = dx[i];
    dxm = dx[i-1];
    rv3_inc(f[i-1], dxp);
    rv3_dec(f[i],   dxp);
    rv3_inc(f[i],   dxm);
    rv3_dec(f[i+1], dxm);
    U += rv3_dot(dxp, dxm);
  }

  for (i = 1; i < n-2; i++) {
    dxp = dx[i+1];
    dxm = dx[i-1];
    rv3_sinc(f[i-1], dxp, -.5f);
    rv3_sinc(f[i],   dxp,  .5f);
    rv3_sinc(f[i+1], dxm, -.5f);
    rv3_sinc(f[i+2], dxm,  .5f);
    U -= .5f*rv3_dot(dxp, dxm);
  }

  for (i = 0; i < n - 2; i++) {
    for (j = i+2; j < n; j++) {
      c = ab->clj[ab->type[i]][ab->type[j]];
      dr2 = rv3_sqr( rv3_diff(dxi, r[j], r[i]) );
      if (soft && dr2 < 1.f) {
        dr2 = (real) sqrt(dr2);
        U += c*(ab->sla - ab->slb*dr2);
        ff = (ab->slb*c)/dr2;
      } else {
        dr2 = 1.f/dr2;
        dr6 = dr2*dr2*dr2;
        U += 4.f*c*dr6*(dr6 - 1.f);
        ff = 48.f*c*dr2*dr6*(dr6 - .5f);
      }
      rv3_sinc(f[i], dxi, -ff);
      rv3_sinc(f[j], dxi,  ff);
    }
  }
  return U;
}

/* compute force f */
real ab_force(abpro_t *ab, real *f, const real *r, int soft)
{
  if (ab->d == 2)
    return ab_force2dm1(ab, (rv2_t *)f, (crv2_t *)r, soft);
  else if (ab->model == 1)
    return ab_force3dm1(ab, (rv3_t *)f, (crv3_t *)r, soft);
  else
    return ab_force3dm2(ab, (rv3_t *)f, (crv3_t *)r, soft);
}

/* minimizes the energy of a given configuration.
   The minimized configuration is saved in ab->lmx
   When a lowest energy configuration is found, the result is
   saved to global variable ab->xmin, with ab->emin updated. */
real ab_localmin(abpro_t *ab, const real *r, int itmax, double tol,
    int sh_itmax, double sh_tol, unsigned flags)
{
  int t, i, j, id, n = ab->n, d = ab->d;
  real up, u = 0, step = 0.02, del, mem = 1;
  static real *x[2], *f[2], *v;
  const real DELMAX = 0.20f;

  if (v == NULL) {
    if (r == NULL) return 0;
    xnew(x[0], n*d*sizeof(real));
    xnew(x[1], n*d*sizeof(real));
    xnew(f[0], n*d*sizeof(real));
    xnew(f[1], n*d*sizeof(real));
    xnew(v, n*d*sizeof(real));
  } else if (r == NULL) {
    free(x[0]); free(x[1]); free(f[0]); free(f[1]); free(v);
    return 0;
  }
  if (itmax <= 0) itmax = 10000;
  if (tol <= 0.) tol = 1e-12;
  /* to make a working copy */
  memcpy(x[id = 0], r, n*d*sizeof(real));
  up = ab_force(ab, f[id], x[id], 0);
  memset(v, 0, n*d*sizeof(real));

  for (t = 1; t <= itmax; t++) {
    for (i = 0; i < n; i++)
      for (j = 0; j < d; j++) {
        del = v[i*d+j] = v[i*d+j]*mem+f[id][i*d+j]*step;
        if (del > DELMAX) del = DELMAX; else if (del < -DELMAX) del = -DELMAX;
        x[!id][i*d+j] = x[id][i*d+j]+del;
      }

    if (flags & AB_MILCSHAKE) {
      if (ab_milcshake(ab, x[id], x[!id], NULL, 0., sh_itmax, sh_tol, 0) != 0) goto SHRINK;
    } else {
      if (ab_shake(ab, x[id], x[!id], NULL, 0., sh_itmax, sh_tol, 0) != 0) goto SHRINK;
    }
    u = ab_force(ab, f[!id], x[!id], 0);
    if (u > up) { mem = 0; goto SHRINK; }

    id = !id;
    if (up - u < tol) break;
    up = u;
    mem = 0.9;
    step *= 1.1;
    continue;

SHRINK:
    step *= 0.5;
  }
  if (t > itmax && (flags & AB_VERBOSE))
    fprintf(stderr, "localmin failed to converge, t = %d.\n", t);

  memcpy(ab->lmx, x[id], n*d*sizeof(real));
  if (u < ab->emin && (flags & AB_LMREGISTER)) {
    ab->emin = u;
    memcpy(ab->xmin, x[id], n*d*sizeof(real));
    if (flags & AB_LMWRITE)
      ab_writepos(ab, ab->xmin, NULL, "abmin.pos");
  }

  return u;
}

static int ab_vv2d(abpro_t *ab, real fscal, real dt, int soft, int milc)
{
  int i, verbose = 1, n = ab->n;
  real dth = .5f*dt*fscal;
  rv2_t *v = (rv2_t *)ab->v, *x = (rv2_t *)ab->x, *x1 = (rv2_t *)ab->x1, *f = (rv2_t *)ab->f;

  for (i = 0; i < n; i++) { /* vv part 1 */
    rv2_sinc(v[i], f[i], dth);
    rv2_lincomb2(x1[i], x[i], v[i], 1, dt);
  }
  if (milc) {
    i = ab_milcshake(ab, ab->x, ab->x1, ab->v, dt, 0, 0., verbose);
  } else {
    i = ab_shake(ab, ab->x, ab->x1, ab->v, dt, 0, 0., verbose);
  }
  die_if (i != 0, "t=%g: shake failed\n", ab->t);
  rv2_ncopy(x, x1, n);

  ab->epot = ab_force(ab, ab->f, ab->x, soft); /* calculate force */

  for (i = 0; i < n; i++) { /* vv part 2 */
    rv2_sinc(v[i], f[i], dth);
  }
  if (milc) {
    i = ab_milcrattle(ab, ab->x, ab->v);
  } else {
    i = ab_rattle(ab, ab->x, ab->v, 0, 0., verbose);
  }

  ab_ekin(ab);

  die_if (i != 0, "t=%g: failed rattle\n", ab->t);
  ab->t += dt;
  return 0;
}

static int ab_vv3d(abpro_t *ab, real fscal, real dt, int soft, int milc)
{
  int i, verbose = 1, n = ab->n;
  real dth = .5f*dt*fscal;
  rv3_t *v = (rv3_t *)ab->v, *x = (rv3_t *)ab->x, *x1 = (rv3_t *)ab->x1, *f = (rv3_t *)ab->f;

  for (i = 0; i < n; i++) { /* vv part 1 */
    rv3_sinc(v[i], f[i], dth);
    rv3_lincomb2(x1[i], x[i], v[i], 1, dt);
  }
  if (milc) {
    i = ab_milcshake(ab, ab->x, ab->x1, ab->v, dt, 0, 0., verbose);
  } else {
    i = ab_shake(ab, ab->x, ab->x1, ab->v, dt, 0, 0., verbose);
  }
  die_if (i != 0, "t=%g: shake failed\n", ab->t);
  rv3_ncopy(x, x1, n);

  ab->epot = ab_force(ab, ab->f, ab->x, soft); /* calculate force */

  for (i = 0; i < n; i++) { /* vv part 2 */
    rv3_sinc(v[i], f[i], dth);
  }
  if (milc) {
    i = ab_milcrattle(ab, ab->x, ab->v);
  } else {
    i = ab_rattle(ab, ab->x, ab->v, 0, 0., verbose);
  }

  ab_ekin(ab);

  die_if (i != 0, "t=%g: failed rattle\n", ab->t);
  ab->t += dt;
  return 0;
}

/* one step of velocity verlet integrator */
int ab_vv(abpro_t *ab, real fscal, real dt, unsigned flags)
{
  int soft = (flags & AB_SOFTFORCE), milc = (flags & AB_MILCSHAKE);
  return (ab->d == 3) ?
    ab_vv3d(ab, fscal, dt, soft, milc) :
    ab_vv2d(ab, fscal, dt, soft, milc);
}

/* Brownian dynamics */
int ab_brownian(abpro_t *ab, real T, real fscal, real dt, unsigned flags)
{
  int soft = (flags & AB_SOFTFORCE), milc = (flags & AB_MILCSHAKE);
  int i, nd = ab->n * ab->d, verbose = 1;
  real amp = (real) sqrt(2*dt*T);

  for (i = 0; i < nd; i++)
    ab->x1[i] = ab->x[i] + fscal*ab->f[i]*dt + (real)(grand0()*amp);

  if (milc) {
    i = ab_milcshake(ab, ab->x, ab->x1, NULL, 0.f, 0, 0., verbose);
  } else {
    i = ab_shake(ab, ab->x, ab->x1, NULL, 0.f, 0, 0., verbose);
  }
  die_if (i != 0, "t=%g: failed shake\n", ab->t);
  memcpy(ab->x, ab->x1, nd*sizeof(real));

  ab->epot = ab_force(ab, ab->f, ab->x, soft); /* calculate force */
  ab->t += dt;
  return 0;
}

#endif /* ZCOM_ABPRO__ */
#endif /* ZCOM_ABPRO */

#ifdef  ZCOM_CAGO
#ifndef ZCOM_CAGO__
#define ZCOM_CAGO__

/* alpha-carbon based GO-model */
typedef struct {
  int n; /* number of residues */
  int dof; /* degree of freedom */
  real kb; /* .5 kb (b - b0)^2 */
  real ka; /* .5 ka (a - a0)^2 */
  real kd1, kd3; /* kd1 (1 - cos(d - d0)) + kd3 (1 - cos(3*(d-d0))) */
  real nbe, nbc;
  rv3_t *xref;
  real epotref; /* energy of the reference structure */
  int *aa;
  real *bref; /* bonds */
  real *aref; /* angle */
  real *dref; /* dihedral */
  real *r2ref; /* pair distance */
  int ncont; /* number of contacts */
  int *iscont;
  real kave; /* average spring constant of contacts */
  real rrtp; /* estimate of sqrt( <r^2> / tp ) at low temperature */

  /* variables for MD simulations */
  rv3_t *x, *v, *f, *x1;
  real ekin, tkin, epot, t;
  real rmsd; /* result from a rotfit call  */
} cago_t;

STRCLS cago_t *cago_open(const char *fnpdb, real kb, real ka, real kd1, real kd3,
    real nbe, real nbc, real rcc);
STRCLS void cago_close(cago_t *go);
STRCLS void cago_rmcom(cago_t *go, rv3_t *x, rv3_t *v);
STRCLS int cago_initmd(cago_t *go, double rndamp, double T0);
STRCLS real cago_force(cago_t *go, rv3_t *x, rv3_t *f);
STRCLS int cago_vv(cago_t *go, real fscal, real dt);
INLINE real cago_ekin(cago_t *go, rv3_t *v)
  { return go->ekin = md_ekin((real *)v, go->n*3, go->dof, &go->tkin); }
INLINE void cago_vrescale(cago_t *go, real tp, real dt)
  { md_vrescale3d(go->v, go->n, go->dof, tp, dt, &go->ekin, &go->tkin); }
STRCLS int cago_writepos(cago_t *go, rv3_t *x, rv3_t *v, const char *fn);
STRCLS int cago_readpos(cago_t *go, rv3_t *x, rv3_t *v, const char *fn);
STRCLS int cago_writepdb(cago_t *go, rv3_t *x, const char *fn);

/* convenient macro for computing rmsd from the reference structure */
INLINE real cago_rotfit(cago_t *go, rv3_t *x, rv3_t *xf)
  { return go->rmsd = rotfit3(x, xf, go->xref, NULL, go->n, NULL, NULL); }

STRCLS int cago_mdrun(cago_t *go, real mddt, real thermdt, int nstcom,
    real tps, real tp, av_t *avep, av_t *avrmsd,
    int teql, int tmax, int trep);

STRCLS int cago_ucvgmdrun(cago_t *go, real mddt, real thermdt, int nstcom,
    real epot, int npass,
    real amp, real ampf, real tptol, av_t *avtp, av_t *avep, av_t *avrmsd,
    real tp, real tpmin, real tpmax, int tmax, int trep);

STRCLS int cago_rcvgmdrun(cago_t *go, real mddt, real thermdt, int nstcom,
    real rmsd, int npass,
    real amp, real ampf, real tptol, av_t *avtp, av_t *avep, av_t *avrmsd,
    real tp, real tpmin, real tpmax, int tmax, int trep);


/* initialize data for the potential energy function */
static int cago_initpot(cago_t *go)
{
  int i, j, n = go->n;
  real dr2;

  /* calculate reference geometric parameters */
  xnew(go->bref, n - 1); /* bonds */
  for (i = 0; i < n - 1; i++)
    go->bref[i] = rv3_dist(go->xref[i], go->xref[i+1]);

  xnew(go->aref, n - 2); /* angles */
  for (i = 1; i < n - 1; i++) {
    go->aref[i-1]  = rv3_ang(go->xref[i-1], go->xref[i], go->xref[i+1],
      NULL, NULL, NULL);
  }

  xnew(go->dref, n - 3); /* dihedrals */
  for (i = 0; i < n - 3; i++) {
    go->dref[i] = rv3_calcdih(NULL,
      go->xref[i], go->xref[i+1], go->xref[i+2], go->xref[i+3], 0);
  }

  /* reference pair distances */
  xnew(go->r2ref, n*n);
  for (i = 0; i < n - 1; i++) {
    for (j = i+1; j < n; j++) {
      dr2 = rv3_dist2(go->xref[i], go->xref[j]);
      go->r2ref[j*n + i] = go->r2ref[i*n + j] = dr2;
    }
  }
  return 0;
}

/* return cago_t from pdb file fnpdb
 * rcc is the cutoff radius for defining contacts */
cago_t *cago_open(const char *fnpdb, real kb, real ka, real kd1, real kd3,
    real nbe, real nbc, real rcc)
{
  cago_t *go;
  int i, j, id;
  pdbmodel_t *pm;
  pdbaac_t *c;
  real rmin = 1e9, rmax = 0;

  xnew(go, 1);

  if ((pm = pdbm_read(fnpdb, 0)) == NULL)
    return NULL;
  go->iscont = pdbm_contact(pm, rcc, PDB_CONTACT_HEAVY, 3, 1);
  if ((c = pdbaac_parse(pm, 0)) == NULL)
    return NULL;
  pdbm_free(pm);

  go->n = c->nres;
  go->dof = go->n*3 - 6;
  go->kb = kb;
  go->ka = ka;
  go->kd1 = kd1;
  go->kd3 = kd3;
  go->nbe = nbe;
  go->nbc = nbc;
  xnew(go->xref, go->n);
  xnew(go->aa, go->n);
  for (i = 0; i < go->n; i++) {
    for (j = 0; j < 3; j++)
      go->xref[i][j] = (real) c->res[i].xca[j];
    go->aa[i] = c->res[i].aa;
  }
  pdbaac_free(c);

  cago_initpot(go);

  go->ncont = 0;
  go->kave = 0;
  for (i = 0; i < go->n - 1; i++)
    for (j = i+1; j < go->n; j++)
      if (go->iscont[ (id = i*go->n + j) ]) {
        if (go->r2ref[id] > rmax)
          rmax = go->r2ref[id];
        else if (go->r2ref[id] < rmin)
          rmin = go->r2ref[id];
        go->ncont++;
        go->kave += 120.f/go->r2ref[id];
      }
  if (go->ncont > 0) go->kave /= go->ncont;
  rmin = sqrt(rmin); rmax = sqrt(rmax);
  go->rrtp = (real) sqrt(1.5 * go->dof/ (go->ncont*go->kave));
  return go;
}

/* destroy cago_t */
void cago_close(cago_t *go)
{
  if (go->x) free(go->x);
  if (go->v) free(go->v);
  if (go->f) free(go->f);
  if (go->x1) free(go->x1);
  if (go->iscont) free(go->iscont);
  free(go->bref);
  free(go->aref);
  free(go->dref);
  free(go->r2ref);
  free(go->xref);
  free(go->aa);
  memset(go, '\0', sizeof(*go));
  free(go);
}

/* remove center of mass motion, linear and angular */
void cago_rmcom(cago_t *go, rv3_t *x, rv3_t *v)
{
  md_shiftcom3d(x, go->n);
  md_shiftcom3d(v, go->n);
  md_shiftang3d(x, v, go->n);
}

/* initialize a md system
 * if rndamp >= 0, start from the reference structure,
 *   with a random disturbance of rndamp
 * if rndamp < 0, start from a nearly-straight chain,
 *   with a disturbance of rndamp in the x, y directions */
int cago_initmd(cago_t *go, double rndamp, double T0)
{
  int i, j, n = go->n;
  real s, dx[3];

  xnew(go->f, n);
  xnew(go->v, n);
  xnew(go->x, n);
  xnew(go->x1, n);

  /* initialize position */
  if (rndamp < 0) { /* open chain */
    rndamp *= -1;
    for (i = 0; i < n-1; i++) {
      for (j = 0; j < 3; j++)
        dx[j] = (j == 0) ? 1.f : rndamp*(2.f*rnd0() - 1);
      rv3_normalize(dx);
      rv3_smul(dx, go->bref[i]);
      rv3_add(go->x[i+1], go->x[i], dx);
    }
  } else { /* copy from xref, slightly disturb it */
    for (i = 0; i < n; i++) {
      rv3_copy(go->x[i], go->xref[i]);
      for (j = 0; j < 3; j++)
        go->x[i][j] += rndamp*(2.f*rnd0() - 1);
    }
  }
  go->epotref = cago_force(go, go->f, go->xref);
  go->epot = cago_force(go, go->f, go->x);

  /* initialize velocities */
  for (j = 0; j < 3; j++)
    for (i = 0; i < n; i++)
      go->v[i][j] = rnd0() - .5;
  cago_rmcom(go, go->x, go->v); /* remove center of mass motion */
  for (s = 0, i = 0; i < n; i++)
    s += rv3_sqr(go->v[i]);
  s = sqrt( (3*n*T0)/s );
  for (i = 0; i < n; i++) {
    rv3_smul(go->v[i], s);
  }
  go->ekin = cago_ekin(go, go->v);
  go->rmsd = cago_rotfit(go, go->x, NULL);
  go->t = 0;
  return 0;
}

real cago_force(cago_t *go, rv3_t *f, rv3_t *x)
{
  int i, j, n = go->n;
  real ene = 0, dr, del, ang, amp, invr2, dr2, dr4, dr6, dr10;
  real kb = go->kb, ka = go->ka, kd1 = go->kd1, kd3 = go->kd3,
       nbe = go->nbe, nbc2 = go->nbc*go->nbc;
  rv3_t dx, g[3];
  dihcalc_t dc[1];

  for (i = 0; i < n; i++)
    rv3_zero(f[i]);
  /* bonds */
  for (i = 0; i < n-1; i++) {
    rv3_diff(dx, x[i], x[i+1]);
    dr = rv3_norm(dx);
    del = dr - go->bref[i];
    ene += .5f*kb*del*del;
    amp = kb*del/dr;
    rv3_sinc(f[i],   dx, -amp);
    rv3_sinc(f[i+1], dx,  amp);
  }

  /* angles */
  for (i = 1; i < n-1; i++) {
    ang = rv3_ang(x[i-1], x[i], x[i+1], g[0], g[1], g[2]);
    del = ang - go->aref[i-1];
    ene += .5f*ka*del*del;
    amp = -ka*del;
    rv3_sinc(f[i-1], g[0], amp);
    rv3_sinc(f[i],   g[1], amp);
    rv3_sinc(f[i+1], g[2], amp);
  }

  /* dihedrals */
  memset(dc, 0, sizeof(*dc));
  dc->szreal = sizeof(real);
  for (i = 0; i < n - 3; i++) {
    ang = rv3_calcdih(dc, x[i], x[i+1], x[i+2], x[i+3],
                      DIH_FOUR|DIH_GRAD);
    del = ang - go->dref[i];
    if (del > M_PI) del -= 2*M_PI; else if (del < -M_PI) del += 2*M_PI;
    ene += (real)(  kd1 * (1. - cos(del)) );
    amp  = (real)( -kd1 * sin(del) );
    ene += (real)(  kd3 * (1. - cos(3.*del)) );
    amp += (real)( -kd3 * 3 * sin(3*del) );
    rv3_sinc(f[i],   dc->g[0], amp);
    rv3_sinc(f[i+1], dc->g[1], amp);
    rv3_sinc(f[i+2], dc->g[2], amp);
    rv3_sinc(f[i+3], dc->g[3], amp);
  }

  /* nonbonded */
  for (i = 0; i < n - 4; i++) {
    for (j = i + 4; j < n; j++) {
      rv3_diff(dx, x[i], x[j]);
      dr2 = rv3_sqr(dx);
      invr2 = 1/dr2;
      if (go->iscont[i*n + j]) { /* is a contact */
        dr2 = go->r2ref[i*n+j]*invr2;
        dr4 = dr2*dr2;
        dr6 = dr4*dr2;
        dr10 = dr4*dr6;
        amp = nbe*60*(dr2 - 1)*dr10*invr2;
        ene += nbe*(5*dr2 - 6)*dr10;
      } else {
        dr2 = nbc2/dr2;
        dr6 = dr2*dr2*dr2;
	dr6 *= dr6;
        amp = nbe*12*dr6*invr2;
        ene += nbe*dr6;
      }
      rv3_sinc(f[i], dx,  amp);
      rv3_sinc(f[j], dx, -amp);
    }
  }
  return ene;
}

/* velocity verlet */
int cago_vv(cago_t *go, real fscal, real dt)
{
  int i, n = go->n;
  real dth = .5f*dt;
  rv3_t *v = go->v, *x = go->x, *f = go->f;

  for (i = 0; i < n; i++) { /* vv part 1 */
    rv3_sinc(v[i], f[i], dth*fscal);
    rv3_sinc(x[i], v[i], dt);
  }

  go->epot = cago_force(go, go->f, go->x); /* calculate force */

  for (i = 0; i < n; i++) { /* vv part 2 */
    rv3_sinc(v[i], f[i], dth*fscal);
  }
  go->ekin = cago_ekin(go, go->v);
  go->t += dt;
  return 0;
}

/* write position/velocity file */
int cago_writepos(cago_t *go, rv3_t *x, rv3_t *v, const char *fn)
{
  FILE *fp;
  int i, n = go->n;

  if (fn == NULL) fn = "cago.pos";
  if ((fp = fopen(fn, "w")) == 0) {
    fprintf(stderr, "cannot open file [%s]\n", fn);
    return 1;
  }

  fprintf(fp, "# %d %d\n", go->n, (v != NULL));
  for (i = 0; i < n; i++) {
    fprintf(fp, "%16.12f %16.12f %16.12f ", x[i][0], x[i][1], x[i][2]);
    if (v)
      fprintf(fp, "%16.12f %16.12f %16.12f ", v[i][0], v[i][1], v[i][2]);
    fprintf(fp, "\n");
  }
  fclose(fp);
  return 0;
}

/* read position/velocity file */
int cago_readpos(cago_t *go, rv3_t *x, rv3_t *v, const char *fn)
{
  char s[1024], *p;
  FILE *fp;
  int i, hasv = 0, next, n = go->n;
  const char *fmt;
  real vtmp[3], *vi;

  if (fn == NULL) fn = "ab.pos";
  if ((fp = fopen(fn, "r")) == 0) {
    fprintf(stderr, "cannot open file [%s]\n", fn);
    return 1;
  }

  if (fgets(s, sizeof s, fp) == NULL || s[0] != '#') {
    fprintf(stderr, "Warning: %s has no information line\n", fn);
    rewind(fp);
  } else {
    if (2 != sscanf(s+1, "%d%d", &i, &hasv) || i != n ) {
      fprintf(stderr, "first line is corrupted:\n%s", s);
      goto ERR;
    }
  }

  if (sizeof(double) == sizeof(real))
    fmt = "%lf%lf%lf%n";
  else
    fmt = "%f%f%f%n";
  for (i = 0; i < n; i++) {
    fgets(s, sizeof s, fp);
    if (strlen(s) < 10) goto ERR;
    if (3 != sscanf(s, fmt, &x[i][0], &x[i][1], &x[i][2], &next)) {
      fprintf(stderr, "cannot read x, i = %d\n", i);
      goto ERR;
    }
    p = s+next;
    if (hasv) {
      vi = (v != NULL) ? v[i] : vtmp;
      if (3 != sscanf(p, fmt, &vi[0], &vi[1], &vi[2], &next)) {
        fprintf(stderr, "cannot read v, i = %d\n", i);
        goto ERR;
      }
    }
  }
  fclose(fp);
  return 0;

ERR:
  fprintf(stderr, "position file [%s] appears to be broken on line %d!\n%s\n", fn, i, s);
  fclose(fp);
  return 1;
}

/* output pdb format */
int cago_writepdb(cago_t *go, rv3_t *x, const char *fn)
{
  FILE *fp;
  int i, n = go->n;

  if ((fp = fopen(fn, "w")) == 0) {
    fprintf(stderr, "cannot open file [%s]\n", fn);
    return 1;
  }
  for (i = 0; i < n; i++)
    fprintf(fp, "ATOM  %5d  CA  %-4sA%4d    %8.3f%8.3f%8.3f  1.00  0.00           C  \n",
        i+1, pdbaaname(go->aa[i]), i+1, x[i][0], x[i][1], x[i][2]);
  fprintf(fp, "END%77s\n", " ");
  fclose(fp);
  return 0;
}

/* run a regular md
 * teql steps for equilibration, tmax steps for production
 * tp: the real temperature, tps: thermostat temperature */
int cago_mdrun(cago_t *go, real mddt, real thermdt, int nstcom,
    real tps, real tp, av_t *avep, av_t *avrmsd,
    int teql, int tmax, int trep)
{
  int t;
  real fs = tps/tp;

  tmax = (tmax < 0) ? -1 : (tmax + teql);
  av_clear(avep);
  av_clear(avrmsd);
  for (t = 1; tmax < 0 || t <= tmax; t++) {
    cago_vv(go, fs, mddt);
    if (t % nstcom == 0) cago_rmcom(go, go->x, go->v);
    cago_vrescale(go, (real) tps, thermdt);
    go->rmsd = cago_rotfit(go, go->x, NULL);
    if (t > teql) {
      av_add(avep, go->epot);
      av_add(avrmsd, go->rmsd);
    }
    if (trep > 0 && t % trep == 0) {
      printf("%9d: tp %.4f, tps %.4f, rmsd %7.4f, K %.2f, U %.2f\n",
          t, tp, tps, go->rmsd, go->ekin, go->epot);
    }
  }
  return 0;
}

/* guess a proper temperature for a target potential energy
 * return 0 if successful
 *
 * temperature is updated according to epot
 * several stages of updating are used, each with a fixed tpdt
 * after a stage, the updating magnitude amp is multiplied by ampf
 * iterations finish when the temperature difference is less than
 * a given tolerance 'tptol'
 * a pass is defined every time the potential energy crosses 'epot'
 * in every stage, npass passes are required to determine convergence
 * */
int cago_ucvgmdrun(cago_t *go, real mddt, real thermdt, int nstcom,
    real epot, int npass,
    real amp, real ampf, real tptol, av_t *avtp, av_t *avep, av_t *avrmsd,
    real tp, real tpmin, real tpmax, int tmax, int trep)
{
  int i, t, stg, sgp, sgn, ipass;
  real tpp = 0, tp1, tpav, epav, rdav, tmp;

  go->rmsd = cago_rotfit(go, go->x, NULL);
  sgp = (go->epot > epot) ? 1 : -1;
  for (stg = 0; ; stg++, amp *= ampf) { /* stages with different dpdt */
    if (avtp) av_clear(avtp);
    if (avep) av_clear(avep);
    if (avrmsd) av_clear(avrmsd);
    for (ipass = 0, t = 1; (tmax < 0 || t <= tmax) && ipass < npass; t++) {
      cago_vv(go, 1, mddt);
      if (t % nstcom == 0) cago_rmcom(go, go->x, go->v);
      cago_vrescale(go, (real) tp, thermdt);
      go->rmsd = cago_rotfit(go, go->x, NULL);
      sgn = (go->epot > epot) ? 1 : -1;
      if (sgn * sgp < 0) {
        ipass++;
        sgp = sgn;
      }
      /* update the temperature */
      tp1 = tp - sgn*mddt*amp;
      if (tp1 < tpmin) tp1 = tpmin;
      else if (tp1 > tpmax) tp1 = tpmax;
      for (tmp = tp1/tp, i = 0; i < go->n; i++) /* scale v */
        rv3_smul(go->v[i], tmp);
      tp = tp1;
      if (avtp) av_add(avtp, tp);
      if (avep) av_add(avep, go->epot);
      if (avrmsd) av_add(avrmsd, go->rmsd);
      if (trep >= 0 && t % trep == 0) {
        printf("%d|%9d: %.2f - %.2f, tp %.4f, K %.2f, rmsd %.4f pass: %d/%d\n",
            stg, t, go->epot, epot, tp,
            go->ekin, go->rmsd, ipass, npass);
      }
    }
    /* end of a stage */
    if (ipass < npass) { /* not enough passes over rmsd */
      const char fnfail[] = "fail.pos";
      cago_rotfit(go, go->x, go->x1);
      cago_writepos(go, go->x1, NULL, fnfail);
      fprintf(stderr, "%d: failed to converge, epot: %g - %g, %d passes, %s\n",
          stg, epot, go->epot, ipass, fnfail);
      return 1;
    }
    tpav = av_getave(avtp);
    epav = av_getave(avep);
    rdav = av_getave(avrmsd);
    printf("%d: amp %g, tp %g, tpav %g/%g, epotav %g, rmsd %g, pass %d/%d\n",
        stg, amp, tp, tpav, tpp, epav, rdav, ipass, npass);
    tmp = .5*(tpav + tpp);
    if (stg > 0 && fabs(tpav - tpp) < tptol*tmp) break;
    tpp = tpav;
  }
  return 0;
}

/* guess a proper temperature for a target rmsd, return 0 if successful
 * should work for a small rmsd
 * for a rmsd in the transition region, it can be stuck in a local minimal,
 * in which rmsd is greater than the target value, but tp reaches tpmin
 *
 * temperature is updated according to rmsd
 * several stages of updating are used, each with a fixed tpdt
 * after a stage, the updating magnitude amp is multiplied by ampf
 * iterations finish when the temperature difference is less than
 * a given tolerance 'tptol'
 * a pass is defined every time the rmsd crosses 'rmsd'
 * in every stage, npass passes are required to determine convergence
 * */
int cago_rcvgmdrun(cago_t *go, real mddt, real thermdt, int nstcom,
    real rmsd, int npass,
    real amp, real ampf, real tptol, av_t *avtp, av_t *avep, av_t *avrmsd,
    real tp, real tpmin, real tpmax, int tmax, int trep)
{
  int i, t, stg, sgp, sgn, ipass;
  real tpp = 0, tp1, tpav, epav, rdav, tmp;

  go->rmsd = cago_rotfit(go, go->x, NULL);
  sgp = (go->rmsd > rmsd) ? 1 : -1;
  for (stg = 0; ; stg++, amp *= ampf) { /* stages with different dpdt */
    if (avtp) av_clear(avtp);
    if (avep) av_clear(avep);
    if (avrmsd) av_clear(avrmsd);
    for (ipass = 0, t = 1; (tmax < 0 || t <= tmax) && ipass < npass; t++) {
      cago_vv(go, 1, mddt);
      if (t % nstcom == 0) cago_rmcom(go, go->x, go->v);
      cago_vrescale(go, (real) tp, thermdt);
      go->rmsd = cago_rotfit(go, go->x, NULL);
      sgn = (go->rmsd > rmsd) ? 1 : -1;
      if (sgn * sgp < 0) {
        ipass++;
        sgp = sgn;
      }
      /* update the temperature */
      tp1 = tp - sgn*mddt*amp;
      if (tp1 < tpmin) tp1 = tpmin;
      else if (tp1 > tpmax) tp1 = tpmax;
      for (tmp = tp1/tp, i = 0; i < go->n; i++) /* scale v */
        rv3_smul(go->v[i], tmp);
      tp = tp1;
      if (avtp) av_add(avtp, tp);
      if (avep) av_add(avep, go->epot);
      if (avrmsd) av_add(avrmsd, go->rmsd);
      if (trep >= 0 && t % trep == 0) {
        printf("%d|%9d: %.2f - %.2f, tp %.4f, K %.2f, U %.2f, pass: %d/%d\n",
            stg, t, go->rmsd, rmsd, tp,
            go->ekin, go->epot, ipass, npass);
      }
    }
    /* end of a stage */
    if (ipass < npass) { /* not enough passes over rmsd */
      const char fnfail[] = "fail.pos";
      cago_rotfit(go, go->x, go->x1);
      cago_writepos(go, go->x1, NULL, fnfail);
      fprintf(stderr, "%d: failed to converge, rmsd: %g - %g, %d passes, %s\n",
          stg, rmsd, go->rmsd, ipass, fnfail);
      return 1;
    }
    tpav = av_getave(avtp);
    epav = av_getave(avep);
    rdav = av_getave(avrmsd);
    printf("%d: amp %g, tp %g, tpav %g/%g, epotav %g, rmsd %g, pass %d/%d\n",
        stg, amp, tp, tpav, tpp, epav, rdav, ipass, npass);
    tmp = .5*(tpav + tpp);
    if (stg > 0 && fabs(tpav - tpp) < tptol*tmp) break;
    tpp = tpav;
  }
  return 0;
}


#endif /* ZCOM_CAGO__ */
#endif /* ZCOM_CAGO */

#ifdef  ZCOM_TMH
#ifndef ZCOM_TMH__
#define ZCOM_TMH__

/* tempering with modified Hamiltonian */

typedef struct {
  double tp, ec; /* current temperature, and the expected energy there */
  int itp, iec; /* indices of tp and ec */
  double tp0, tp1, dtp; /* temperature range */
  int tpn; /* number of temperature */
  double emin, emax; /* energy range */
  double de; /* bin size of energy histogram */
  int en; /* number of energy bins */
  double erg0, erg1; /* energy range (erg0, erg1) */
  double derg; /* bin size for the updating energy range */
  int ergn; /* number of the updating energy bins */
  double dergdt; /* (erg1 - erg0)/(tp1 - tp0) */
  int dhdeorder; /* order of dhde interpolation */
  double dhdemin; /* minimal of dhde */
  double dhdemax; /* maximal of dhde */
  double *dhde; /* dH / dE - 1 */
  double *tpehis; /* multipl-temperature energy histogram */
  double ensexp; /* w(T) = 1/T^ensexp */
  double *lnz; /* partition function */
  double *lng; /* density of states */
  double *mh; /* modified Hamiltonian  */
} tmh_t;

STRCLS tmh_t *tmh_open(double tp0, double tp1, double dtp,
    double erg0, double erg1, double derg,
    double emin, double emax, double de,
    double ensexp, int dhdeorder);
STRCLS void tmh_close(tmh_t *tmh);
STRCLS double tmh_hdif(tmh_t *tmh, double eb, double ea);
STRCLS int tmh_tlgvmove(tmh_t *tmh, double enow, double lgvdt);
STRCLS int tmh_savedhde(tmh_t *tmh, const char *fn, double amp, double t);
STRCLS int tmh_loaddhde(tmh_t *tmh, const char *fn, double *amp, double *t);
STRCLS int tmh_savetp(tmh_t *tmh, const char *fn);
STRCLS int tmh_save(tmh_t *tmh, const char *fntp, const char *fnehis,
    const char *fndhde, double amp, double t);
STRCLS int tmh_load(tmh_t *tmh, const char *fnehis,
    const char *fndhde, double *amp, double *t);
STRCLS int tmh_loaderange(const char *fn,
    double *tp0, double *tp1, double *dtp,
    double *erg0, double *erg1, double *derg,
    double *emin, double *emax, double *de,
    double *ensexp, int *dhdeorder);
STRCLS int tmh_calcdos(tmh_t *tmh, int itmax, double tol,
    const char *fndos, const char *fnlnz);

/* set the current temperature */
INLINE void tmh_settp(tmh_t *tmh, double tp)
{
#ifndef TMH_NOCHECK
  die_if (tp > tmh->tp1 || tp < tmh->tp0, "temperature %g not in(%g, %g)", tp, tmh->tp0, tmh->tp1);
#endif
  tmh->tp = tp;
  tmh->itp = (int)((tmh->tp - tmh->tp0)/tmh->dtp);
  tmh->ec = tmh->erg0 + (tmh->tp - tmh->tp0)*tmh->dergdt;
  tmh->iec = (int)((tmh->ec - tmh->erg0)/tmh->derg);
}

/* retrieve local dhde */
INLINE double tmh_getdhde(tmh_t *tmh, double e, int ie)
{
  if (tmh->dhdeorder == 0) {
#ifndef TMH_NOCHECK
    die_if (ie < 0 || ie >= tmh->en, "overflow ie %d en %d\n", ie, tmh->en);
#endif
    return tmh->dhde[ie];
  } else {
    double lam = (e - (tmh->erg0 + ie*tmh->derg))/tmh->derg;
#ifndef TMH_NOCHECK
    die_if (lam < 0. || lam > 1.,
        "cannot interpolate, e %g, %d, %g %g %g\n",
        e, ie, tmh->erg0 + ie*tmh->derg, tmh->erg0, tmh->derg);
#endif
    return tmh->dhde[ie]*(1-lam) + tmh->dhde[ie+1]*lam;
  }
}

/* update dhde curve */
INLINE void tmh_dhdeupdate(tmh_t *tmh, double erg, double amp)
{
  double del = amp * (erg - tmh->ec);

#ifdef TMH_NOCHECK
  #define TMH_UPDHDE(i, del) tmh->dhde[i] += del;
#else
  #define TMH_UPDHDE(i, del) { \
  tmh->dhde[i] += del; \
  if (tmh->dhde[i] < tmh->dhdemin) \
    tmh->dhde[i] = tmh->dhdemin; \
  else if (tmh->dhde[i] > tmh->dhdemax) \
    tmh->dhde[i] = tmh->dhdemax; }
#endif

  if (tmh->dhdeorder == 0) {
    TMH_UPDHDE(tmh->iec, del);
    if (tmh->iec == tmh->ergn - 1) /* last bin */
      tmh->dhde[tmh->ergn] = tmh->dhde[tmh->iec];
  } else {
    del *= .5;
    TMH_UPDHDE(tmh->iec, del);
    TMH_UPDHDE(tmh->iec+1, del);
  }
}

INLINE void tmh_eadd(tmh_t *tmh, double erg)
{
  int ie;
#ifndef TMH_NOCHECK
  if (erg < tmh->emin || erg > tmh->emax) return;
#endif
  ie = (int)((erg - tmh->emin)/tmh->de);
#ifndef TMH_NOCHECK
  die_if (ie < 0 || ie >= tmh->en, "ie = %d, erg %g emin %g de %g output range\n",
      ie, erg, tmh->emin, tmh->de);
  die_if (tmh->itp > tmh->tpn, "itp = %d, tpn = %d, tp = %g, dtp = %g\n",
      tmh->itp, tmh->tpn, tmh->tp, tmh->dtp);
#endif
  tmh->tpehis[tmh->itp*tmh->en + ie] += 1.;
}

INLINE int tmh_saveehis(tmh_t *tmh, const char *fn)
{
  return histsave(tmh->tpehis, tmh->tpn,
      tmh->en, tmh->emin, tmh->de, HIST_ADDAHALF|HIST_OVERALL, fn);
}

INLINE int tmh_loadehis(tmh_t *tmh, const char *fn)
{
  return histload(tmh->tpehis, tmh->tpn,
      tmh->en, tmh->emin, tmh->de, HIST_ADDAHALF, fn);
}



/* 0: cold; 1: hot */
tmh_t *tmh_open(double tp0, double tp1, double dtp,
    double erg0, double erg1, double derg,
    double emin, double emax, double de,
    double ensexp, int dhdeorder)
{
  tmh_t *tmh;
  int i;

  xnew(tmh, 1);

  /* energy histogram range */
  tmh->de = de;
  die_if(emin >= emax, "Error: emin %g >= emax %g\n", emin, emax);
  tmh->emin = emin;
  tmh->emax = emin + dblround(emax - emin, de);
  tmh->en = (int)((tmh->emax - tmh->emin)/de + .5); /* number of energy bins */

  /* the updating energy range */
  die_if(erg0 >= erg1, "Error: erg0 %g >= erg1 %g\n", erg0, erg1);
  tmh->derg = derg;
  tmh->erg0 = erg0;
  tmh->erg1 = erg0 + dblround(erg1 - erg0, derg);
  tmh->ergn = (int)((tmh->erg1 - tmh->erg0)/tmh->derg + .5);

  /* dhde parameters */
  tmh->dhdeorder = dhdeorder;
  tmh->dhdemin = 0.1;
  tmh->dhdemax = 10.0;
  xnew(tmh->dhde, tmh->ergn + 1);
  for (i = 0; i <= tmh->ergn; i++)
    tmh->dhde[i] = 1.;

  die_if(tp0 >= tp1, "Error: T0 %g >= T1 %g\n", tp0, tp1);
  if (dtp <= 0) {
    tmh->tp0 = tp0;
    tmh->tp1 = tp1;
    tmh->tpn = tmh->ergn;
    tmh->dtp = (tmh->tp1 - tmh->tp0)/tmh->tpn * (1. + 1e-12);
  } else {
    tmh->dtp = dtp;
    tmh->tp0 = dblround(tp0, dtp);
    tmh->tp1 = dblround(tp1, dtp);
    tmh->tpn = (int)((tmh->tp1 - tmh->tp0)/tmh->dtp + .5);
  }
  xnew(tmh->tpehis, tmh->tpn*tmh->en);

  tmh->ensexp = ensexp;
  tmh_settp(tmh, (tmh->tp0+tmh->tp1)*.5);
  tmh->dergdt = (tmh->erg1 - tmh->erg0)/(tmh->tp1 - tmh->tp0);

  xnew(tmh->lnz, tmh->tpn);
  xnew(tmh->lng, tmh->en);
  xnew(tmh->mh, tmh->en+1);

  return tmh;
}

void tmh_close(tmh_t *tmh)
{
  if (tmh != NULL) {
    free(tmh->dhde);
    free(tmh->tpehis);
    free(tmh->lnz);
    free(tmh->lng);
    free(tmh->mh);
    free(tmh);
  }
}

static double tmh_hdif0(tmh_t *tmh, double eh, double el)
{
  int ie, iel, ieh;
  double dh;

  if (eh < tmh->erg0) { /* first energy bin */
    dh = (eh - el)*tmh->dhde[0];
  } else if (el > tmh->erg1) { /* last energy bin */
    dh = (eh - el)*tmh->dhde[tmh->ergn];
  } else {
    dh = 0.;
    /* overhead index */
    if (el < tmh->erg0) {
      dh += (tmh->erg0 - el)*tmh->dhde[0];
      el = tmh->erg0 + 1e-8;
    }
    if (eh >= tmh->erg1) {
      dh += (eh - tmh->erg1)*tmh->dhde[tmh->ergn];
      eh = tmh->erg1 - 1e-8;
    }
    /* energy index */
    iel = (int)((el - tmh->erg0)/tmh->derg);
    ieh = (int)((eh - tmh->erg0)/tmh->derg);
    if (iel == ieh) {
      dh += (eh - el)*tmh->dhde[iel];
    } else if (iel < ieh) {
      /* dh at the two terminal energy bin */
      dh += (tmh->erg0 + (iel+1)*tmh->derg - el) * tmh->dhde[iel]
          + (eh - (tmh->erg0 + tmh->derg*ieh)) * tmh->dhde[ieh];
      /* integrate dH/dE */
      for (ie = iel+1; ie < ieh; ie++)
        dh += tmh->dhde[ie]*tmh->derg;
    }
  }
  return dh;
}

static double tmh_hdif1(tmh_t *tmh, double eh, double el)
{
  int ie, iel, ieh;
  double dh, de, k, el0, eh0;

  if (eh < tmh->erg0) { /* first energy bin */
    dh = (eh - el)*tmh->dhde[0];
  } else if (el > tmh->erg1) { /* last energy bin */
    dh = (eh - el)*tmh->dhde[tmh->ergn];
  } else {
    dh = 0.;
    if (el < tmh->erg0) {
      dh += (tmh->erg0 - el)*tmh->dhde[0];
      el = tmh->erg0 + 1e-8;
    }
    if (eh >= tmh->erg1) {
      dh += (eh - tmh->erg1)*tmh->dhde[tmh->ergn];
      eh = tmh->erg1 - 1e-8;
    }
    /* energy index */
    iel = (int)((el - tmh->erg0)/tmh->derg);
    ieh = (int)((eh - tmh->erg0)/tmh->derg);
    if (iel == ieh) {
      k = (tmh->dhde[iel+1] - tmh->dhde[iel])/tmh->derg;
      el0 = tmh->erg0 + iel*tmh->derg;
      dh += (eh - el)*(tmh->dhde[iel] + k * (.5f*(el+eh) - el0));
    } else if (iel < ieh) {
      /* dh at the two terminal energy bin */
      el0 = tmh->erg0 + (iel+1)*tmh->derg;
      de = el0 - el;
      k = (tmh->dhde[iel+1] - tmh->dhde[iel])/tmh->derg;
      dh += de * (tmh->dhde[iel+1] - k*.5*de);
      eh0 = tmh->erg0 + tmh->derg*ieh;
      de = eh - eh0;
      k = (tmh->dhde[ieh+1] - tmh->dhde[ieh])/tmh->derg;
      dh += de * (tmh->dhde[ieh] + k*.5*de);
      /* integrate dH/dE */
      for (ie = iel+1; ie < ieh; ie++)
        dh += .5*(tmh->dhde[ie] + tmh->dhde[ie+1])*tmh->derg;
    }
  }
  return dh;
}

/* d(H) = H(e1) - H(e0), */
double tmh_hdif(tmh_t *tmh, double e1, double e0)
{
  int sgn;
  double tmp;

  /* to make sure e1 > e0 */
  if (e1 < e0) {
    sgn = -1;
    tmp = e1, e1 = e0, e0 = tmp;
  } else sgn = 1;

  return sgn * ((tmh->dhdeorder == 0) ?
    tmh_hdif0(tmh, e1, e0) : tmh_hdif1(tmh, e1, e0));
}

/* temperature move using a Langevin equation */
int tmh_tlgvmove(tmh_t *tmh, double enow, double lgvdt)
{
  double derg, tpp, bexp = 2.-tmh->ensexp, amp;
  int lgvtype = 1;
  derg = tmh_hdif(tmh, enow, tmh->ec);
  if (lgvtype == 0) {
    amp = sqrt(2*lgvdt);
    tpp = 1.0 / ( 1.0/(tmh->tp) - (derg+bexp*tmh->tp)*lgvdt + grand0()*amp );
  } else {
    amp = tmh->tp*sqrt(2*lgvdt);
    tpp = tmh->tp + (derg+bexp*tmh->tp)*lgvdt + grand0()*amp;
  }
  if (tpp > tmh->tp0 && tpp < tmh->tp1) {
    tmh_settp(tmh, tpp);
    return 1;
  }
  return 0;
}

/* write dhde and overall energy distribution */
int tmh_savedhde(tmh_t *tmh, const char *fn, double amp, double t)
{
  int ie;
  FILE *fp;

  if ((fp = fopen(fn, "w")) == NULL) {
    fprintf(stderr, "cannot write file %s\n", fn);
    return -1;
  }
  fprintf(fp, "# 2 %g %g %d %g %g %d %g %g %d %g %d %g %g %g\n",
      tmh->erg0, tmh->derg, tmh->ergn,
      tmh->emin, tmh->de,   tmh->en,
      tmh->tp0,  tmh->dtp,  tmh->tpn,
      tmh->ensexp, tmh->dhdeorder,
      amp, t, tmh->tp);
  for (ie = 0; ie <= tmh->ergn; ie++) {
    fprintf(fp, "%g %g\n", tmh->erg0 + ie*tmh->derg, tmh->dhde[ie]);
  }
  fclose(fp);
  return 0;
}

/* read dhde and overall energy distribution */
int tmh_loaddhde(tmh_t *tmh, const char *fn, double *amp, double *t)
{
  int ie, ergn, en, ver, tpn, dhdeorder, next;
  FILE *fp;
  char s[1024], *p;
  double emin, de, derg, erg, erg0, tp0, dtp, dhde, ensexp;

  if ((fp = fopen(fn, "r")) == NULL) {
    fprintf(stderr, "cannot write file %s\n", fn);
    return -1;
  }
  if (fgets(s, sizeof s, fp) == NULL) {
    fprintf(stderr, "cannot read the first line %s\n", fn);
    goto ERR;
  }
  if (7 != sscanf(s, " # %d%lf%lf%d%lf%lf%d%n",
        &ver, &erg0, &derg, &ergn, &emin, &de, &en, &next)) {
    fprintf(stderr, "corrupted info line %s", s);
    goto ERR;
  }
  if (ver < 1 ||
      ergn != tmh->ergn || fabs(derg - tmh->derg) > 1e-5 || fabs(erg0 - tmh->erg0) > 1e-5 ||
      en != tmh->en || fabs(de - tmh->de) > 1e-5 || fabs(emin - tmh->emin) > 1e-5) {
    fprintf(stderr, "bad energy ergn %d %d, erg0 %g %g, derg %g %g, en %d %d, emin %g %g, de %g %g\n",
        ergn, tmh->ergn, erg0, tmh->erg0, derg, tmh->derg,
        en, tmh->en, emin, tmh->emin, de, tmh->de);
    goto ERR;
  }
  p = s + next;
  if (ver >= 2) {
    if (5 != sscanf(p, "%lf%lf%d%lf%d%n", &tp0, &dtp, &tpn, &ensexp, &dhdeorder, &next)) {
      fprintf(stderr, "corrupted info line p2 %s", s);
      goto ERR;
    }
    p += next;
    if (tpn != tmh->tpn || fabs(tp0 - tmh->tp0) > 1e-6 || fabs(dtp - tmh->dtp) > 1e-6) {
      fprintf(stderr, "bad temperature range tpn %d %d, tp0 %g %g, dtp %g %g\n",
          tpn, tmh->tpn, tp0, tmh->tp0, dtp, tmh->dtp);
      goto ERR;
    }
  }
  if (3 != sscanf(p, "%lf%lf%lf", amp, t, &tmh->tp)) {
    fprintf(stderr, "corrupted info line p3 %s", s);
    goto ERR;
  }

  for (ie = 0; ie < tmh->ergn; ie++) {
    if (fgets(s, sizeof s, fp) == NULL) {
      fprintf(stderr, "cannot read line %d\n", ie);
      goto ERR;
    }
    if (2 != sscanf(s, "%lf%lf", &erg, &dhde)) {
      fprintf(stderr, "cannot read energy and dhde at %d\n, s = %s", ie, s);
      goto ERR;
    }
    erg0 = tmh->erg0 + ie*tmh->derg;
    if (fabs(erg0 - erg) > tmh->derg*.1) {
      fprintf(stderr, "energy %g, should be %g\n", erg, erg0);
      goto ERR;
    }
    tmh->dhde[ie] = dhde;
  }
  fclose(fp);
  return 0;
ERR:
  fclose(fp);
  return -1;
}

/* get energy range from dhde file */
int tmh_loaderange(const char *fn,
    double *tp0, double *tp1, double *dtp,
    double *erg0, double *erg1, double *derg,
    double *emin, double *emax, double *de,
    double *ensexp, int *dhdeorder)
{
  int en, ergn, tpn, ver, next;
  FILE *fp;
  char s[1024];

  if ((fp = fopen(fn, "r")) == NULL) {
    fprintf(stderr, "cannot write file %s\n", fn);
    return -1;
  }
  if (fgets(s, sizeof s, fp) == NULL) {
    fprintf(stderr, "cannot read the first line %s\n", fn);
    goto ERR;
  }

  if (7 != sscanf(s, " # %d%lf%lf%d%lf%lf%d%n",
        &ver, erg0, derg, &ergn, emin, de, &en, &next) || ver < 1) {
    fprintf(stderr, "corrupted info line %s", s);
    goto ERR;
  }
  *erg1 = *erg0 + ergn*(*derg);
  *emax = *emin + en*(*de);

  if (ver >= 2) { /* additional information */
    if (5 != sscanf(s+next, "%lf%lf%d%lf%d", tp0, dtp, &tpn, ensexp, dhdeorder)) {
      fprintf(stderr, "corrupted info line %s", s);
      goto ERR;
    }
    *tp1 = *tp0 + tpn * (*dtp);
  }

  fclose(fp);
  return 0;
ERR:
  fclose(fp);
  return -1;
}

/* write temperature histogram */
int tmh_savetp(tmh_t *tmh, const char *fn)
{
  int i, j;
  double *eh, erg, cnt, esm, e2sm, eav, edv;
  FILE *fp;

  if ((fp = fopen(fn, "w")) == NULL) {
    fprintf(stderr, "cannot write file %s\n", fn);
    return -1;
  }
  fprintf(fp, "# %g %g %d\n", tmh->tp0, tmh->dtp, tmh->tpn);
  for (i = 0; i < tmh->tpn; i++) {
    eh = tmh->tpehis + i*tmh->en;
    for (cnt = esm = e2sm = 0., j = 0; j < tmh->en; j++) {
      erg = tmh->emin + (j + .5) * tmh->de;
      cnt += eh[j];
      esm += eh[j]*erg;
      e2sm += eh[j]*(erg*erg);
    }
    if (cnt > 1e-8) {
      eav = esm/cnt;
      edv = sqrt(e2sm/cnt - eav*eav);
    } else {
      eav = edv = 0.;
    }
    fprintf(fp, "%g %g %g %g\n",
        tmh->tp0 + (i+.5)*tmh->dtp, cnt, eav, edv);
  }
  fclose(fp);
  return 0;
}

int tmh_save(tmh_t *tmh, const char *fntp, const char *fnehis,
    const char *fndhde, double amp, double t)
{
  tmh_savetp(tmh, fntp);
  tmh_savedhde(tmh, fndhde, amp, t);
  tmh_saveehis(tmh, fnehis);
  return 0;
}

int tmh_load(tmh_t *tmh, const char *fnehis,
    const char *fndhde, double *amp, double *t)
{
  if (tmh_loaddhde(tmh, fndhde, amp, t) != 0) return -1;
  if (tmh_loadehis(tmh, fnehis) != 0) return -1;
  return 0;
}

/* calculate the modified Hamiltonian */
static int tmh_calcmh(tmh_t *tmh)
{
  int i, ie;
  double erg, hm, dh;

  hm = tmh->emin;
  for (i = 0; i < tmh->en; i++) {
    erg = tmh->emin + (i+.5)*tmh->de;
    if (erg <= tmh->erg0) {
      dh = tmh->dhde[0];
    } else if (erg >= tmh->erg1) {
      dh = tmh->dhde[tmh->ergn];
    } else {
      ie = (int)((erg - tmh->erg0)/tmh->derg);
      die_if (ie < 0 || ie >= tmh->ergn,
          "ie %d, erg %g, erg0 %g, derg %g",
          ie, erg, tmh->erg0, tmh->derg);
      dh = tmh_getdhde(tmh, erg, ie);
    }
    dh *= tmh->de;
    tmh->mh[i] = hm + .5*dh;
    hm += dh;
  }
  return 0;
}

/* iteratively compute the density of states */
int tmh_calcdos(tmh_t *tmh, int itmax, double tol,
    const char *fndos, const char *fnlnz)
{
  int i, j, it, ie0, ie1, en = tmh->en, tpn = tmh->tpn;
  double x, dif, lnz0, db;
  double *lnn, *lnm, *bet, *lnz1;
  const double LOG0 = -1e10;
  FILE *fp;

  if (itmax <= 0) itmax = 1000;

  /* determine nonempty energy range for more efficient loops */
  for (ie0 = 0; ie0 < en; ie0++) {
    for (j = 0; j < tmh->tpn; j++)
      if (tmh->tpehis[j*en + ie0] > 0.) break;
    if (j < tmh->tpn) break;
  }
  for (ie1 = en; ie1 > ie0; ie1--) {
    for (j = 0; j < tmh->tpn; j++)
      if (tmh->tpehis[j*en + ie1-1] > 0.) break;
    if (j < tmh->tpn) break;
  }

  /* n[j] is the total number of visits to temperature j
   * m[i] is the total number of visits to energy i */
  xnew(lnn, tpn);
  xnew(lnm, en);
  for (i = 0; i < en; i++) lnm[i] = 0.;
  for (j = 0; j < tpn; j++) {
    lnn[j] = 0.;
    for (i = ie0; i < ie1; i++) {
      x = tmh->tpehis[j*en + i];
      lnn[j] += x;
      lnm[i] += x;
    }
  }
  for (j = 0; j < tpn; j++)
    lnn[j] = (lnn[j] > 0.) ? log(lnn[j]) : LOG0;
  for (i = 0; i < en; i++)
    lnm[i] = (lnm[i] > 0.) ? log(lnm[i]) : LOG0;

  xnew(bet, tpn);
  for (j = 0; j < tpn; j++)
    bet[j] = 1.0/(tmh->tp0 + (j+.5)*tmh->dtp);

  /* get mh and lnz */
  for (i = 0; i < en; i++) tmh->lng[i] = LOG0;
  tmh_calcmh(tmh);
  /* estimate initial lnz */
  for (lnz0 = 0., j = 0; j < tpn; j++) {
    x = tmh->erg0 + (j+.5)*tmh->dtp * tmh->dergdt;
    i = (int)((x - tmh->emin)/tmh->de);
    die_if (i < 0 || i >= en, "i %d, x %g\n", i, x);
    db = 1.0/(tmh->tp0 + (j+1)*tmh->dtp)
       - 1.0/(tmh->tp0 + j*tmh->dtp);
    x = tmh->mh[i]*db;
    tmh->lnz[j] = lnz0 - x*.5;
    lnz0 -= x;
  }
  for (j = tpn - 1; j >= 0; j--)
     tmh->lnz[j] -= tmh->lnz[0];

  xnew(lnz1, tpn);

  /* repeat until convergence */
  for (it = 0; it < itmax; it++) {
    /* compute the density of states */
    for (i = ie0; i < ie1; i++) {
      for (x = LOG0, j = 0; j < tpn; j++) {
        x = lnadd(x, lnn[j] - bet[j]*tmh->mh[i] - tmh->lnz[j]);
      }
      tmh->lng[i] = (lnm[i] < LOG0+.1) ? LOG0 : (lnm[i] - x);
    }

    /* update partition function */
    for (j = 0; j < tpn; j++) {
      for (x = LOG0, i = ie0; i < ie1; i++)
        x = lnadd(x, tmh->lng[i] - bet[j]*tmh->mh[i]);
      lnz1[j] = x;
    }
    for (j = tpn - 1; j >= 0; j--)
      lnz1[j] -= lnz1[0];

    /* check difference */
    for (dif = 0., j = 1; j < tpn; j++) {
      x = fabs(tmh->lnz[j] - lnz1[j]);
      if (x > dif) dif = x;
    }
    for (j = 0; j < tpn; j++) tmh->lnz[j] = lnz1[j];
    if (dif < tol) break;
  }

  /* write dos */
  if (fndos && (fp = fopen(fndos, "w")) != NULL) {
    for (i = ie0; i < ie1; i++) {
      x = tmh->emin + (i + .5)*tmh->de;
      fprintf(fp, "%g %g %g %g\n", x, tmh->lng[i] - tmh->lng[ie0],
          exp(lnm[i]), tmh->mh[i]);
    }
    fclose(fp);
  }

  /* write lnz */
  if (fnlnz && (fp = fopen(fnlnz, "w")) != NULL) {
    for (j = 0; j < tpn; j++) {
      fprintf(fp, "%g %g %g\n", bet[j], tmh->lnz[j], exp(lnn[j]));
    }
    fclose(fp);
  }
  free(lnm);
  free(lnn);
  free(bet);
  free(lnz1);
  return 0;
}

#endif /* ZCOM_TMH__ */
#endif /* ZCOM_TMH */

