Logo Search packages:      
Sourcecode: scilab version File versions  Download package

taucs_scilab.c

/*
 *   This big file contains all the taucs routines needed for the
 *   scilab interface onto the taucs supernodal multi-frontal choleski 
 *   solver :
 *        - internal structures used by the solver 
 *        - genmmd interface
 *        - create / free taucs matrices routines
 *        - permutations routines
 *        - symbolic analysis routines
 *        - super nodal multi frontal factor routines
 *        - super nodal multi frontal solve routines
 *        - various utilities
 *
 *    All this code is from different files of the TAUCS solver
 *    lib from Sivan Toledo (see the TAUCS_license.txt file).
 *
 *    Very minor modifs have been done (Bruno) :
 *        - replace the _ for calling fortran routine by the 
 *          scilab macro F2C  (call to genmmd and calls to blas
 *          routines)
 *        - the print messages that I have stayed are redirected
 *          to the scilab window with sciprint
 *        - I have added a small routine to get cnz
 *
 *    Thanks to Sivan Toledo
 */

/*************************************************************/
/*                                                           */
/*************************************************************/

#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include "taucs_scilab.h"
#include "stack-c.h"   /* F2C macro */
#include "MALLOC.h"
#include "BOOL.h"
#include "sciprint.h"
#include "localization.h"

#ifdef _MSC_VER
#define alloca(x) MALLOC(x)
#endif

#define BLAS_FLOPS_CUTOFF  1000.0
#define SOLVE_DENSE_CUTOFF 5

extern int C2F(genmmd)();
extern int C2F(dpotrf)();
extern int C2F(dtrsm)();
extern int C2F(dsyrk)();
extern int C2F(dgemm)();
/*************************************************************/
/* structures                                                */
/*************************************************************/

00058 typedef struct {
  int     sn_size;
  int     n;
  int*    rowind;

  int     up_size;
  int*    sn_vertices;
  int*    up_vertices;
  double* f1;
  double* f2;
  double* u;
} supernodal_frontal_matrix;

00071 typedef struct {
  char    uplo;     /* 'u' for upper, 'l' for lower, ' ' don't know; prefer lower. */
  int     n;        /* size of matrix */
  int     n_sn;     /* number of supernodes */

  int* parent;      /* supernodal elimination tree */
  int* first_child; 
  int* next_child;

  int* sn_size;     /* size of supernodes (diagonal block) */
  int* sn_up_size;  /* size of subdiagonal update blocks   */
  int** sn_struct;  /* row structure of supernodes         */

  int* sn_blocks_ld;  /* lda of supernode blocks */
  double** sn_blocks; /* supernode blocks        */
    
  int* up_blocks_ld;  /* lda of update blocks    */
  double** up_blocks; /* update blocks           */
} supernodal_factor_matrix;

/*************************************************************/
/* for qsort                                                 */
/*************************************************************/

static int* compare_indirect_map;
static int compare_indirect_ints(const void* vx, const void* vy)
{
  int* ix = (int*)vx;
  int* iy = (int*)vy;
  if (compare_indirect_map[*ix] < compare_indirect_map[*iy]) return -1;
  if (compare_indirect_map[*ix] > compare_indirect_map[*iy]) return  1;
  return 0;
}

/*********************************************************/
/* Interface to MMD                                      */
/*********************************************************/

void taucs_ccs_genmmd(taucs_ccs_matrix* m, int** perm, int** invperm)
{
  int  n, maxint, delta, nofsub;
  int* xadj;
  int* adjncy;
  int* invp;
  int* prm;
  int* dhead;
  int* qsize;
  int* llist;
  int* marker;

  int* len;
  int* next;

  int  nnz,i,j,ip;
  
  if (!(m->flags & TAUCS_SYMMETRIC)) 
  {
      sciprint("%s: %s","taucs_ccs_genmmd",_("GENMMD ordering only works on symmetric matrices.\n"));
    *perm    = NULL;
    *invperm = NULL;
    return;
  }
  /* this routine may actually work on UPPER as well */
  if (!(m->flags & TAUCS_LOWER)) 
  {
      sciprint("%s: %s","taucs_ccs_genmmd",_("The lower part of the matrix must be represented.\n"));
    *perm    = NULL;
    *invperm = NULL;
    return;
  }

  *perm    = NULL;
  *invperm = NULL;

  n   = m->n;
  nnz = (m->colptr)[n];
  
  /* I copied the value of delta and the size of */
  /* from SuperLU. Sivan                         */

  delta = 1; /* DELTA is a parameter to allow the choice of nodes
            whose degree <= min-degree + DELTA. */

  maxint = 32000;

  assert(sizeof(int) == 4);
  maxint = 2147483647; /* 2**31-1, for 32-bit only! */

  xadj   = (int*) MALLOC((n+1)     * sizeof(int));
  adjncy = (int*) MALLOC((2*nnz-n) * sizeof(int));
  invp   = (int*) MALLOC((n+1)     * sizeof(int));
  prm    = (int*) MALLOC(n         * sizeof(int));
  dhead  = (int*) MALLOC((n+1)     * sizeof(int));
  qsize  = (int*) MALLOC((n+1)     * sizeof(int));
  llist  = (int*) MALLOC(n         * sizeof(int));
  marker = (int*) MALLOC(n         * sizeof(int));

  if (!xadj || !adjncy || !invp || !prm 
      || !dhead || !qsize || !llist || !marker) 
  {
    FREE(xadj  );
    FREE(adjncy);
    FREE(invp  );
    FREE(prm   );
    FREE(dhead );
    FREE(qsize );
    FREE(llist );
    FREE(marker);
    return;
  }

  len  = dhead; /* we reuse space */
  next = qsize; /* we reuse space */

  for (i=0; i<n; i++) len[i] = 0;

  for (j=0; j<n; j++)
  {
      for (ip = (m->colptr)[j]; ip < (m->colptr)[j+1]; ip++) 
    {
            i = (m->rowind)[ip];
            if (i != j) 
            {
                  len[i] ++;
                  len[j] ++;
            }
    }
  }
  xadj[0] = 1;
  for (i=1; i<=n; i++) xadj[i] = xadj[i-1] + len[i-1];

  
  /* use degree as a temporary */
  for (i=0; i<n; i++) next[i] = xadj[i] - 1;

  for (j=0; j<n; j++)
  {
    for (ip = (m->colptr)[j]; ip < (m->colptr)[j+1]; ip++) 
    {
            i = (m->rowind)[ip];
            assert( next[i] < 2*nnz-n );
            assert( next[j] < 2*nnz-n );
            if (i != j) 
            {
                  adjncy[ next[i] ] = j+1;
                  adjncy[ next[j] ] = i+1;
                  next[i] ++;
                  next[j] ++;
            }
    }
  }

  F2C(genmmd)(&n,
            xadj, adjncy,
            invp,prm,
            &delta,
            dhead,qsize,llist,marker,
            &maxint,&nofsub);

  FREE(marker);
  FREE(llist );
  FREE(qsize );
  FREE(dhead );
  FREE(xadj  );
  FREE(adjncy);

  for (i=0; i<n; i++) prm[i] --;
  for (i=0; i<n; i++) invp[ prm[i] ] = i;

  *perm    = prm;
  *invperm = invp;
}

/*********************************************************/
/* CCS : create free taucs matrix routines               */
/*********************************************************/

taucs_ccs_matrix* 
taucs_ccs_create(int m, int n, int nnz)
{
  taucs_ccs_matrix* matrix;

  matrix = (taucs_ccs_matrix*) MALLOC(sizeof(taucs_ccs_matrix));
  if (!matrix) 
  { 
      sciprint(_("%s: No more memory.\n"),"taucs_ccs_create");
    return NULL; 
  }
  matrix->flags = 0;;
  matrix->n = n;
  matrix->m = m;
  matrix->colptr = (int*)    MALLOC((n+1) * sizeof(int));
  matrix->rowind = (int*)    MALLOC(nnz   * sizeof(int));
  matrix->values = (double*) MALLOC(nnz   * sizeof(double));
  if (!(matrix->colptr) || !(matrix->rowind) || !(matrix->rowind)) 
  {
    sciprint(_("%s: No more memory (n=%d, nnz=%d).\n"),"taucs_ccs_create",n,nnz);
    FREE(matrix->colptr); FREE(matrix->rowind); FREE(matrix->values);
    FREE (matrix);
    return NULL; 
  }

  return matrix;
} 

void taucs_ccs_free(taucs_ccs_matrix* matrix)
{
  FREE(matrix->rowind);
  FREE(matrix->colptr);
  FREE(matrix->values);
  FREE(matrix);
}

/*********************************************************/
/* CCS : permutation routines                            */
/*********************************************************/

taucs_ccs_matrix* 
taucs_ccs_permute_symmetrically(taucs_ccs_matrix* A, int* perm, int* invperm)
{
  taucs_ccs_matrix* PAPT;
  int n;
  int nnz;
  /* int* colptr; */
  int* len;
  int i,j,ip,I,J;
  double AIJ;

  /*assert(A->flags & TAUCS_SYMMETRIC);*/
  assert(A->flags & TAUCS_LOWER);

  n   = A->n;
  nnz = (A->colptr)[n];

  PAPT = taucs_ccs_create(n,n,nnz);
  /*PAPT->flags = TAUCS_SYMMETRIC | TAUCS_LOWER;*/
  PAPT->flags = A->flags;

  len    = (int*) MALLOC(n * sizeof(int));
  //colptr = (int*) MALLOC(n * sizeof(int));

  for (j=0; j<n; j++) len[j] = 0;

  for (j=0; j<n; j++) 
  {
    for (ip = (A->colptr)[j]; ip < (A->colptr)[j+1]; ip++) 
      {
      /*i = (A->rowind)[ip] - (A->indshift);*/
      i = (A->rowind)[ip];

      I = invperm[i];
      J = invperm[j];

      if (I < J) 
        {
            int T = I; 
            I = J;
            J = T;
      }
      len[J] ++;
    }
  }

  (PAPT->colptr)[0] = 0;
  for (j=1; j<=n; j++) (PAPT->colptr)[j] = (PAPT->colptr)[j-1] + len[j-1];

  for (j=0; j<n; j++) len[j] = (PAPT->colptr)[j];
  
  for (j=0; j<n; j++) 
  {
    for (ip = (A->colptr)[j]; ip < (A->colptr)[j+1]; ip++) 
      {
      /*i   = (A->rowind)[ip] - (A->indshift);*/
      i   = (A->rowind)[ip];
      AIJ = (A->values)[ip];

      I = invperm[i];
      J = invperm[j];

      if (I < J) 
        {
            int T = I; 
            I = J;
            J = T;
      }

      /*(PAPT->rowind)[ len[J] ] = I + (PAPT->indshift);*/
      (PAPT->rowind)[ len[J] ] = I;
      (PAPT->values)[ len[J] ] = AIJ;

      len[J] ++;
    }
  }

  if (len) {FREE(len); len = NULL;}

  return PAPT;
}

void taucs_vec_permute(int n, double v[], double pv[], int p[])
{
  int i;
  for (i=0; i<n; i++) pv[i] = v[p[i]];
} 

void taucs_vec_ipermute(int n, double pv[], double v[], int invp[])
{
  int i;
  for (i=0; i<n; i++) v[invp[i]] = pv[i];
} 


/*************************************************************/
/* create and free the factor object                         */
/*************************************************************/

static supernodal_factor_matrix*
multifrontal_supernodal_create(void)
{
  supernodal_factor_matrix* L;
  
  L = (supernodal_factor_matrix*) MALLOC(sizeof(supernodal_factor_matrix));
  if (!L) return NULL;
  L->uplo      = 'l';
  L->n         = -1; /* unused */

  L->sn_struct   = NULL;
  L->sn_size     = NULL;
  L->sn_up_size  = NULL;
  L->parent      = NULL;
  L->first_child = NULL;
  L->next_child  = NULL;
  L->sn_blocks_ld  = NULL;
  L->sn_blocks     = NULL;
  L->up_blocks_ld  = NULL;
  L->up_blocks     = NULL;

  return L;
}

void taucs_supernodal_factor_free(void* vL)
{
  supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
  int sn;
  
  FREE(L->parent);
  FREE(L->first_child);
  FREE(L->next_child);

  FREE(L->sn_size);
  FREE(L->sn_up_size);
  FREE(L->sn_blocks_ld);
  FREE(L->up_blocks_ld);
  if (L->sn_struct)   
    for (sn=0; sn<L->n_sn; sn++)
      FREE(L->sn_struct[sn]);

  if (L->sn_blocks)   
    for (sn=0; sn<L->n_sn; sn++)
      FREE(L->sn_blocks[sn]);

  if (L->up_blocks)   
    for (sn=0; sn<L->n_sn; sn++)
      FREE(L->up_blocks[sn]);
  /*
  for (sn=0; sn<L->n_sn; sn++) 
  {
    FREE(L->sn_struct[sn]);
    FREE(L->sn_blocks[sn]);
    FREE(L->up_blocks[sn]);
      }*/

  FREE(L->sn_struct);
  FREE(L->sn_blocks);
  FREE(L->up_blocks);

  FREE(L);
}

/*************************************************************/
/* create and free frontal matrices                          */
/*************************************************************/

static supernodal_frontal_matrix* 
supernodal_frontal_create(int* firstcol_in_supernode,
                    int sn_size,
                    int n, 
                    int* rowind)
{
  supernodal_frontal_matrix* tmp;
  int iDontCheckf1 = 0;
  int iDontCheckf2 = 0;
  int iDontChecku = 0;

  tmp = (supernodal_frontal_matrix*)MALLOC(sizeof(supernodal_frontal_matrix));
  if(tmp==NULL) return NULL;

  tmp->sn_size = sn_size;
  tmp->n = n;

  tmp->rowind = rowind;

  tmp->n = n;
  tmp->sn_size = sn_size;
  tmp->up_size = n-sn_size;

  tmp->sn_vertices = rowind;
  tmp->up_vertices = rowind + sn_size;

  if (tmp->sn_size > 0)
  {
        tmp->f1 = (double*)CALLOC((tmp->sn_size)*(tmp->sn_size),sizeof(double));
  }
  else
  {
      iDontCheckf1 = 1;
      tmp->f1 = NULL;
  }

  if (tmp->sn_size  > 0 && tmp->up_size > 0)
  {
        tmp->f2 = (double*)CALLOC((tmp->up_size)*(tmp->sn_size),sizeof(double));
  }
  else
  {
      iDontCheckf2 = 1;
      tmp->f2 = NULL;
  }
  
  if (tmp->up_size > 0)
  {
        tmp->u  = (double*)CALLOC((tmp->up_size)*(tmp->up_size),sizeof(double));
  }
  else
  {
      iDontChecku = 1;
      tmp->u = NULL;
  }

  /*check allocation only if size is > 0 */
  if( (tmp->f1 == NULL && iDontCheckf1 == 0) || 
        (tmp->f2 == NULL && iDontCheckf2 == 0) || 
        (tmp->u  == NULL && iDontChecku  == 0)) 
  {
        FREE(tmp->u);
        FREE(tmp->f1);
        FREE(tmp->f2);
        FREE(tmp);
        return NULL;
  }

  assert(tmp);
  return tmp;
}

static void supernodal_frontal_free(supernodal_frontal_matrix* to_del)
{
  /* f1 and f2 are moved to the factor */
  FREE(to_del->u);
  FREE(to_del);
}


/*************************************************************/
/* factor a frontal matrix                                   */
/*************************************************************/

static int
multifrontal_supernodal_front_factor(int sn,
                             int* firstcol_in_supernode,
                             int sn_size,
                             taucs_ccs_matrix* A,
                             supernodal_frontal_matrix* mtr,
                             int* bitmap,
                             supernodal_factor_matrix* snL)
{
  int i,j;
  int* ind;
  double* re;
  int INFO;
  double done      =  1.0;
  double dminusone = -1.0;

  /* creating transform for real indices */
  for(i=0;i<mtr->sn_size;i++) bitmap[mtr->sn_vertices[i]] = i;
  for(i=0;i<mtr->up_size;i++) bitmap[mtr->up_vertices[i]] = mtr->sn_size + i;

  /* adding sn_size column of A to first sn_size column of frontal matrix */
  for(j=0;j<(mtr->sn_size);j++) 
  {
    ind = &(A->rowind[A->colptr[*(firstcol_in_supernode+j)]]);
    re  = &(A->values[A->colptr[*(firstcol_in_supernode+j)]]); 
    for(i=0;i < A->colptr[*(firstcol_in_supernode+j)+1]- A->colptr[*(firstcol_in_supernode+j)]; i++) 
      {
      if (bitmap[ind[i]] < mtr->sn_size)
        {
            mtr->f1[ (mtr->sn_size)*j + bitmap[ind[i]]] += re[i];
        }
      else
        {
            mtr->f2[ (mtr->up_size)*j + bitmap[ind[i]] - mtr->sn_size] += re[i];
        }
    }
  }

  /* we use the BLAS through the Fortran interface */

  /* solving of lower triangular system for L */
  if (mtr->sn_size)
  {
    F2C(dpotrf)("LOWER",
            &(mtr->sn_size),
            mtr->f1,&(mtr->sn_size),
            &INFO);
  }

  if (INFO) 
  {
      sciprint(_("    CC^T Factorization: Matrix is not positive definite.\n"));
    sciprint(_("                        nonpositive pivot in column %d\n"),mtr->sn_vertices[INFO-1]);
    return -1;
  }

  /* getting completion for found columns of L */
  if (mtr->up_size && mtr->sn_size)
  {
    F2C(dtrsm)("Right",
             "Lower",
             "Transpose",
             "No unit diagonal",
             &(mtr->up_size),&(mtr->sn_size),
             &done,
             mtr->f1,&(mtr->sn_size),
             mtr->f2,&(mtr->up_size));
  }

  (snL->sn_blocks   )[sn] = mtr->f1;
  (snL->sn_blocks_ld)[sn] = mtr->sn_size;

  (snL->up_blocks   )[sn] = mtr->f2;
  (snL->up_blocks_ld)[sn] = mtr->up_size;

  /* computation of updated part of frontal matrix */
  if (mtr->up_size && mtr->sn_size)
  {
    F2C(dsyrk)("Lower",
             "No Transpose",
             &(mtr->up_size),&(mtr->sn_size),
             &dminusone,
             mtr->f2,&(mtr->up_size),
             &done,
             mtr->u, &(mtr->up_size));
  }

  return 0;
 }

/*************************************************************/
/* extend-add                                                */
/*************************************************************/

static void 
multifrontal_supernodal_front_extend_add(
                               supernodal_frontal_matrix* parent_mtr,
                               supernodal_frontal_matrix* my_mtr,
                               int* bitmap)
{
  int j,i,parent_i,parent_j;
  double v;

  for(i=0;i<parent_mtr->sn_size;i++) bitmap[parent_mtr->sn_vertices[i]] = i;
  for(i=0;i<parent_mtr->up_size;i++) bitmap[parent_mtr->up_vertices[i]] = (parent_mtr->sn_size)+i;

  /* extend add operation for update matrix */
  for(j=0;j<my_mtr->up_size;j++) 
  {
    for(i=j;i<my_mtr->up_size;i++) 
      {
      parent_j = bitmap[ my_mtr->up_vertices[j] ];
      parent_i = bitmap[ my_mtr->up_vertices[i] ];
      /* we could skip this if indices were sorted */
      if (parent_j>parent_i) 
        {
            int tmp = parent_j;
            parent_j = parent_i;
            parent_i = tmp;
      }

      v = (my_mtr->u)[(my_mtr->up_size)*j+i];

      if (parent_j < parent_mtr->sn_size) 
        {
            if (parent_i < parent_mtr->sn_size) 
            {
                  (parent_mtr->f1)[ (parent_mtr->sn_size)*parent_j + parent_i] += v;
            } 
            else 
            {
                  (parent_mtr->f2)[ (parent_mtr->up_size)*parent_j + (parent_i-parent_mtr->sn_size)] += v;
            }
      } 
        else 
        {
            (parent_mtr->u)[ (parent_mtr->up_size)*(parent_j-parent_mtr->sn_size) + (parent_i-parent_mtr->sn_size)] += v;
      }
    }
  }
}

/*************************************************************/
/* symbolic elimination                                      */
/*************************************************************/

/* UNION FIND ROUTINES */

static int uf_makeset(int* uf, int i)        { uf[i] = i; return i; }
static int uf_find   (int* uf, int i)        
{ 
      if (uf[i] != i) uf[i] = uf_find(uf,uf[i]); 
      return uf[i]; 
}

static int uf_union  (int* uf, int s, int t) {
  
  if (uf_find(uf,s) < uf_find(uf,t)) 
  {
    uf[uf_find(uf,s)] = uf_find(uf,t); 
    return (uf_find(uf,t)); 
  }
  else
  {
    uf[uf_find(uf,s)] = uf_find(uf,t); 
    return (uf_find(uf,t)); 
  }
}

static
void recursive_postorder(int  j,
                   int  first_child[],
                   int  next_child[],
                   int  postorder[],
                   int  ipostorder[],
                   int* next)
{
  int c;
  for (c=first_child[j]; c != -1; c = next_child[c]) 
  {
    recursive_postorder(c,first_child,next_child,
                  postorder,ipostorder,next);
  }

  if (postorder)  postorder [*next] = j;
  if (ipostorder) ipostorder[j] = *next;
  (*next)++;
}

#define GILBERT_NG_PEYTON_ANALYSIS_SUP

/* in a few tests the supernodal version seemed slower */
#undef GILBERT_NG_PEYTON_ANALYSIS_SUP

static int ordered_uf_makeset(int* uf, int i)
{ 
  uf[i] = i; 
  return i; 
}
static int ordered_uf_find   (int* uf, int i) 
{ 
  if (uf[i] != i) uf[i] = uf_find(uf,uf[i]); 
  return uf[i]; 
}
static int ordered_uf_union  (int* uf, int s, int t) 
{
  assert(uf[t] == t);
  assert(uf[s] == s);
  assert(t > s);
  if (t > s) 
  {
    uf[s] = t; 
    return t; 
  }
  else
  {
    uf[t] = s;
  }
      return s; 
}

static void 
tree_level(int j,
         int isroot, 
         int first_child[],
         int next_child[],
         int level[],
         int level_j)
{
  int c;
  if (!isroot) level[j] = level_j;
  for (c=first_child[j]; c != -1; c = next_child[c]) 
  {
    tree_level(c,
             FALSE,
             first_child,
             next_child,
             level,
             level_j+1);
  }
}

static void
tree_first_descendant(int j,
                  int isroot, 
                  int first_child[],
                  int next_child[],
                  int ipostorder[],
                  int first_descendant[])
{
  int c;
  int fd = ipostorder[j];
  for (c=first_child[j]; c != -1; c = next_child[c]) 
  {
    tree_first_descendant(c,
                    FALSE,
                    first_child,
                    next_child,
                    ipostorder,
                    first_descendant);
    if (first_descendant[c] < fd) fd = first_descendant[c]; 
  }
  if (!isroot) first_descendant[j] = fd;
}

int taucs_ccs_etree(taucs_ccs_matrix* A,
            int* parent,
            int* l_colcount,
            int* l_rowcount,
            int* l_nnz)
{
  int* prev_p;
  int* level;
  int* l_cc;
  int* l_rc;
  int* wt;

  int  n = A->n;
  int  u,pprime;
  int  ju;
  int* postorder;
  int* ipostorder;
  int  *first_child,*next_child;

  int i,j,k,ip,jp,kp;
  int nnz,jnnz;
  int* uf;
  int* rowptr;
  int* colind;
  int* rowcount;
  int* realroot;

  /* we need the row structures for the lower triangle */

  nnz = (A->colptr)[n];
  
  uf       = (int*)MALLOC(n     * sizeof(int));
  rowcount = (int*)MALLOC((n+1) * sizeof(int));
  rowptr   = (int*)MALLOC((n+1) * sizeof(int));
  colind   = (int*)MALLOC(nnz   * sizeof(int));

  for (i=0; i <=n; i++) rowcount[i] = 0;
  for (j=0; j < n; j++) {
    jnnz = (A->colptr)[j+1] - (A->colptr)[j];
    for (ip=0; ip<jnnz; ip++) {
      i = (A->rowind)[(A->colptr)[j] + ip];
      if (j < i) rowcount[i]++;
    }
  }

  ip = 0;
  for (i=0; i <= n; i++) {
    int next_ip = ip + rowcount[i];
    rowcount[i] = ip;
    rowptr  [i] = ip;
    ip = next_ip;
  }

  for (j=0; j < n; j++) {
    jnnz = (A->colptr)[j+1] - (A->colptr)[j];
    for (ip=0; ip<jnnz; ip++) {
      i = (A->rowind)[(A->colptr)[j] + ip];
      if (i==j) continue;
      assert( rowcount[i] < rowptr[i+1] );
      colind[ rowcount[i] ] = j;
      rowcount[i]++;
    }
  }

  /* now compute the etree */

  {
    int u2,t,vroot;
    realroot = rowcount; /* reuse space */

    for (i=0; i<n; i++) {
      uf_makeset(uf,i);
      realroot[i] = i;
      parent[i] = n;
      vroot = i;
      for (kp=rowptr[i]; kp<rowptr[i+1]; kp++) {
      k=colind[kp];
      u2 = uf_find(uf,k);
      t = realroot[u2];
      if (parent[t] == n && t != i) {
        parent[t] = i;
        vroot = uf_union(uf,vroot,u2);
        realroot[vroot] = i;
      }
      }
    }
  }

  FREE(colind);
  FREE(rowptr);
  FREE(rowcount);
  
  /* compute column counts */

  if (l_colcount || l_rowcount || l_nnz) {
    int* l_nz;
    int  tmp;
    int  u2,p,q;

    first_child = (int*)MALLOC((n+1)     * sizeof(int));
    next_child  = (int*)MALLOC((n+1)     * sizeof(int));
    postorder   = (int*)MALLOC(n     * sizeof(int));
    ipostorder  = (int*)MALLOC(n     * sizeof(int));
    wt  = (int*)MALLOC(n     * sizeof(int));
    level = (int*)MALLOC(n     * sizeof(int));
    prev_p  = (int*)MALLOC(n     * sizeof(int));

#ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
    prev_nbr  = MALLOC(n     * sizeof(int));
    first_descendant  = MALLOC(n     * sizeof(int));
#endif


    /* compute the postorder */
    
    for (j=0; j<=n; j++) first_child[j] = -1;
    for (j=n-1; j>=0; j--) {
      next_child[j] = first_child[parent[j]];
      first_child[parent[j]] = j;
    }
    
    {
      int next = 0;
      recursive_postorder(n,first_child,next_child,
                    postorder,
                    ipostorder,&next);
    }
    
    /* we allocate scratch vectors to avoid conditionals */
    /* in the inner loop.                                */

    if (l_colcount) l_cc = l_colcount;
    else            l_cc = (int*) MALLOC(n*sizeof(int));
    if (l_rowcount) l_rc = l_rowcount;
    else            l_rc = (int*) MALLOC(n*sizeof(int));
    if (l_nnz)      l_nz = l_nnz;
    else            l_nz = &tmp;

    /* sort by postorder of etree */
    /* compute level, fst_desc */

    tree_level(n,TRUE,first_child,next_child,
             level,-1);
    
    for (u=0; u < n; u++) prev_p  [u] = -1;
    for (u=0; u < n; u++) l_rc    [u] =  1;
    for (u=0; u < n; u++) ordered_uf_makeset(uf,u);
    for (u=0; u < n; u++) {
      if (first_child[u] == -1)
      wt[u] = 1; /* leaves     */
      else
      wt[u] =  0; /* nonleaves */
    }

#ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
    for (u=0; u < n; u++) prev_nbr[u] = -1;

    tree_first_descendant(n,TRUE,
                    first_child,next_child,ipostorder,
                    first_descendant);
#endif

    FREE(first_child);
    FREE(next_child);

    for (p=0; p<n; p++) {
      jp = postorder[p];
      if (parent[jp] != n) wt[parent[jp]] --;
      for (ip = (A->colptr)[jp]; ip < (A->colptr)[jp+1]; ip++) {
      ju = (A->rowind)[ip];
      u  = ipostorder[ju];
      if (ju==jp) continue; /* we only want proper neighbors */
#ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
      if (first_descendant[jp] > prev_nbr[u]) {
#else
      if (1) {
#endif
        wt[jp] ++;
        pprime = prev_p[ju];
        if (pprime == -1) 
          l_rc[ju] += level[jp] - level[ju];
        else {
          q = ordered_uf_find(uf,pprime);
          l_rc[ju] += level[jp] - level[q];
          wt[q] --;
        }
        prev_p[ju] = jp;
      }
#ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
      prev_nbr[u] = p;
#endif
      }
      if (parent[jp] != n) {
      if (!(ipostorder[parent[jp]] > ipostorder[jp])) {
        sciprint("jp %d parent %d (ipo_j %d ipo_parent %d\n\r",
               jp,parent[jp],ipostorder[jp],ipostorder[parent[jp]]);
      }
      assert(ipostorder[parent[jp]] > ipostorder[jp]);
      ordered_uf_union(uf,jp,parent[jp]);
      }
    }

    *l_nz = 0;
    for (u2=0; u2<n; u2++) {
      l_cc[u2] = wt[u2];
      *l_nz += wt[u2];
    }
    for (u2=0; u2<n; u2++) {
      if (parent[u2] != n) {
      l_cc[parent[u2]] += l_cc[u2];
      *l_nz += l_cc[u2];
      }
    }

    /* free scrtach vectors                              */

    if (!l_colcount) FREE(l_cc);
    if (!l_rowcount) FREE(l_rc);

    /* free other data structures */

    FREE(postorder);
    FREE(ipostorder);
    FREE(wt);
    FREE(level);
    FREE(prev_p);
    
#ifdef GILBERT_NG_PEYTON_ANALYSIS_SUP
    FREE(prev_nbr);
    FREE(first_descendant);
#endif
  }

  FREE(uf);
  return 0;
}


int 
taucs_ccs_etree_liu(taucs_ccs_matrix* A,
                int* parent,
                int* l_colcount,
                int* l_rowcount,
                int* l_nnz)
{
  int n = A->n;
  int i,j,k,ip,kp;
  int nnz,jnnz;

  int* uf;
  int* rowptr;
  int* colind;

  int* rowcount;
  int* marker;
  int* realroot;

  int* l_cc;
  int* l_rc;

  /* we need the row structures for the lower triangle */

  nnz = (A->colptr)[n];
  
  uf       = (int*)MALLOC(n     * sizeof(int));
  rowcount = (int*)MALLOC((n+1) * sizeof(int));
  rowptr   = (int*)MALLOC((n+1) * sizeof(int));
  colind   = (int*)MALLOC(nnz   * sizeof(int));

  for (i=0; i <=n; i++) rowcount[i] = 0;

  for (j=0; j < n; j++) {
    
    jnnz = (A->colptr)[j+1] - (A->colptr)[j];

    for (ip=0; ip<jnnz; ip++) {
      i = (A->rowind)[(A->colptr)[j] + ip];
      if (j < i) rowcount[i]++;
    }
  }

  ip = 0;
  for (i=0; i <= n; i++) {
    int next_ip = ip + rowcount[i];
    rowcount[i] = ip;
    rowptr  [i] = ip;
    ip = next_ip;
  }

  for (j=0; j < n; j++) {
    jnnz = (A->colptr)[j+1] - (A->colptr)[j];

    for (ip=0; ip<jnnz; ip++) {
      i = (A->rowind)[(A->colptr)[j] + ip];
      if (i==j) continue;
      assert( rowcount[i] < rowptr[i+1] );
      colind[ rowcount[i] ] = j;
      rowcount[i]++;
    }
  }

  /* now compute the etree */

  {
    int u,t,vroot;
    realroot = rowcount; /* reuse space */

    for (i=0; i<n; i++) {
      uf_makeset(uf,i);
      realroot[i] = i;
      parent[i] = n;
      vroot = i;
      for (kp=rowptr[i]; kp<rowptr[i+1]; kp++) {
      k=colind[kp];
      u = uf_find(uf,k);
      t = realroot[u];
      if (parent[t] == n && t != i) {
        parent[t] = i;
        vroot = uf_union(uf,vroot,u);
        realroot[vroot] = i;
      }
      }
    }
  }

  /* compute column counts */

  if (l_colcount || l_rowcount || l_nnz) {
    int* l_nz;
    int  tmp;

    /* we allocate scratch vectors to avoid conditionals */
    /* in the inner loop.                                */

    if (l_colcount) l_cc = l_colcount;
    else            l_cc = (int*) MALLOC(n*sizeof(int));
    if (l_rowcount) l_rc = l_rowcount;
    else            l_rc = (int*) MALLOC(n*sizeof(int));
    if (l_nnz)      l_nz = l_nnz;
    else            l_nz = &tmp;

    marker = rowcount; /* we reuse the space */
    
    for (j=0; j < n; j++) l_cc[j] = 1;
    *l_nz = n;
    
    for (i=0; i<n; i++) marker[i] = n; /* clear the array */
    
    for (i=0; i<n; i++) {
      l_rc[i] = 1;
      marker[ i ] = i;
      for (kp=rowptr[i]; kp<rowptr[i+1]; kp++) {
      k=colind[kp];
      j=k;
      while (marker[j] != i) {
        l_cc[j]++;
        l_rc[i]++;
        (*l_nz)++;
        marker[j] = i;
        j = parent[j];
      }
      }
    }

    /* free scrtach vectors                              */

    if (!l_colcount) FREE(l_cc);
    if (!l_rowcount) FREE(l_rc);
  }

  FREE(colind);
  FREE(rowptr);
  FREE(rowcount);
  FREE(uf);
  return 0;
}

static void
recursive_symbolic_elimination(int            j,
                         taucs_ccs_matrix* A,
                         int            first_child[],
                         int            next_child[],
                         int*           n_sn,
                         int            sn_size[],
                         int            sn_up_size[],
                         int*           sn_rowind[],
                         int            sn_first_child[], 
                         int            sn_next_child[], 
                         int            rowind[],
                         int            column_to_sn_map[],
                         int            map[],
                         int            do_order,
                         int            ipostorder[]
                         )
{
  int  i,ip,c,c_sn;
  int  in_previous_sn;
  int  nnz;
  
  for (c=first_child[j]; c != -1; c = next_child[c]) {
    recursive_symbolic_elimination(c,A,
                           first_child,next_child,
                           n_sn,
                           sn_size,sn_up_size,sn_rowind,
                           sn_first_child,sn_next_child,
                           rowind, /* temporary */
                           column_to_sn_map,
                           map,
                           do_order,ipostorder
                           );
  }

  in_previous_sn = 1;
  if (j == A->n) 
    in_previous_sn = 0; /* this is not a real column */
  else if (first_child[j] == -1) 
    in_previous_sn = 0; /* this is a leaf */
  else if (next_child[first_child[j]] != -1) 
    in_previous_sn = 0; /* more than 1 child */
  else { 
    /* check that the structure is nested */
    /* map contains child markers         */

    c=first_child[j];
    for (ip=(A->colptr)[j]; ip<(A->colptr)[j+1]; ip++) {
      i = (A->rowind)[ip];
      in_previous_sn = in_previous_sn && (map[i] == c);
    }
  }

  if (in_previous_sn) {
    c = first_child[j];
    c_sn = column_to_sn_map[c];
    column_to_sn_map[j] = c_sn;

    /* swap row indices so j is at the end of the */
    /* supernode, not in the update indices       */
    for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) 
      if (sn_rowind[c_sn][ip] == j) break;
    assert(ip<sn_up_size[c_sn]);
    sn_rowind[c_sn][ip] = sn_rowind[c_sn][sn_size[c_sn]];
    sn_rowind[c_sn][sn_size[c_sn]] = j;

    /* mark the nonzeros in the map */
    for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) 
      map[ sn_rowind[c_sn][ip] ] = j;

    sn_size   [c_sn]++;

    return;
  }

  /* we are in a new supernode */

  if (j < A->n) {
    nnz = 1;
    rowind[0] = j;
    map[j]    = j;
    
    for (c=first_child[j]; c != -1; c = next_child[c]) {
      c_sn = column_to_sn_map[c];
      for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) {
      i = sn_rowind[c_sn][ip];
      if (i > j && map[i] != j) { /* new row index */
        map[i] = j;
        rowind[nnz] = i;
        nnz++;
      }
      }
    }
    
    for (ip=(A->colptr)[j]; ip<(A->colptr)[j+1]; ip++) {
      i = (A->rowind)[ip];
      if (map[i] != j) { /* new row index */
      map[i] = j;
      rowind[nnz] = i;
      nnz++;
      }
    }
  }
    
  /*printf("children of sn %d: ",*n_sn);*/
  for (c=first_child[j]; c != -1; c = next_child[c]) {
    c_sn = column_to_sn_map[c];
    /*printf("%d ",c_sn);*/
    if (c==first_child[j])
      sn_first_child[*n_sn] = c_sn;
    else {
      sn_next_child[ c_sn ] = sn_first_child[*n_sn];
      sn_first_child[*n_sn] = c_sn;
    }
  }
  /*printf("\n");*/

  if (j < A->n) {
    column_to_sn_map[j] = *n_sn;
    sn_size   [*n_sn] = 1;
    sn_up_size[*n_sn] = nnz;
    sn_rowind [*n_sn] = (int*) MALLOC(nnz * sizeof(int));
    for (ip=0; ip<nnz; ip++) sn_rowind[*n_sn][ip] = rowind[ip];
    if (do_order) {
      /* Sivan and Vladimir: we think that we can sort in */
      /* column order, not only in etree postorder.       */
      /*
      radix_sort(sn_rowind [*n_sn],nnz);
      qsort(sn_rowind [*n_sn],nnz,sizeof(int),compare_ints);
      */
      compare_indirect_map = ipostorder;
      qsort(sn_rowind [*n_sn],nnz,sizeof(int),compare_indirect_ints);
    }
    assert(sn_rowind [*n_sn][0] == j);
    (*n_sn)++;
  }

  return;
}

/* count zeros and nonzeros in a supernode to compute the */
/* utility of merging fundamental supernodes.             */

01323 typedef struct {
  double zeros;
  double nonzeros;
} znz;

static znz
recursive_amalgamate_supernodes(int           sn,
                        int*           n_sn,
                        int            sn_size[],
                        int            sn_up_size[],
                        int*           sn_rowind[],
                        int            sn_first_child[], 
                        int            sn_next_child[], 
                        int            rowind[],
                        int            column_to_sn_map[],
                        int            map[],
                        int            do_order,
                        int            ipostorder[]
                        )
{
  int  i,ip,c_sn,gc_sn;
  int  nnz;
  int  nchildren; /* number of children, child index */
  znz* c_znz;
  znz  sn_znz, merged_znz;

  int new_sn_size, new_sn_up_size;

  sn_znz.zeros    = 0.0;
  sn_znz.nonzeros = (double) (((sn_up_size[sn] - sn_size[sn]) * sn_size[sn]) 
                              + (sn_size[sn] * (sn_size[sn] + 1))/2);

  if (sn_first_child[sn] == -1) { /* leaf */
    return sn_znz;
  }

  nchildren = 0;
  for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn])
    nchildren++;

  c_znz = (znz*) alloca(nchildren * sizeof(znz));

  //printf("supernode %d out of %d\n",sn,*n_sn);

  /* merge the supernode with its children! */

  i = 0;
  for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
    c_znz[i] = 
      recursive_amalgamate_supernodes(c_sn,
                              n_sn,
                              sn_size,sn_up_size,sn_rowind,
                              sn_first_child,sn_next_child,
                              rowind, /* temporary */
                              column_to_sn_map,
                              map,
                              do_order,ipostorder
                              );
    assert(c_znz[i].zeros + c_znz[i].nonzeros ==
         (double) (((sn_up_size[c_sn] - sn_size[c_sn]) * sn_size[c_sn]) 
                 + (sn_size[c_sn] * (sn_size[c_sn] + 1))/2 ));
    i++;
  }

  merged_znz.nonzeros = sn_znz.nonzeros;
  merged_znz.zeros    = sn_znz.zeros;
                   
  for (i=0; i<nchildren; i++) {
    merged_znz.nonzeros += (c_znz[i]).nonzeros;
    merged_znz.zeros    += (c_znz[i]).zeros;
  }

  /*  printf("supernode %d out of %d (continuing)\n",sn,*n_sn);*/

  /* should we merge the supernode with its children? */

  nnz = 0;
  for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
    for (ip=0; ip<sn_size[c_sn]; ip++) {
      i = sn_rowind[c_sn][ip];
      assert( map[i] != sn );
      map[i] = sn;
      rowind[nnz] = i;
      nnz++;
    }
  }

  for (ip=0; ip<sn_size[sn]; ip++) {
    i = sn_rowind[sn][ip];
    assert( map[i] != sn );
    map[i] = sn;
    rowind[nnz] = i;
    nnz++;
  }

  new_sn_size = nnz;

  for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
    for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) {
      i = sn_rowind[c_sn][ip];
      if (map[i] != sn) { /* new row index */
      map[i] = sn;
      rowind[nnz] = i;
      nnz++;
      }
    }
  }

  for (ip=sn_size[sn]; ip<sn_up_size[sn]; ip++) {
    i = sn_rowind[sn][ip];
    if (map[i] != sn) { /* new row index */
      map[i] = sn;
      rowind[nnz] = i;
      nnz++;
    }
  }
  
  new_sn_up_size = nnz;

  if (do_order) {
    compare_indirect_map = ipostorder;
    qsort(rowind,nnz,sizeof(int),compare_indirect_ints);
  }

  /* determine whether we should merge the supernode and its children */

  {
    int n;
    double* zcount;

    n = 0;
    for (ip=0; ip<nnz; ip++) {
      i = rowind[ip];
      if (i >= n) n = i+1;
    }

    zcount = (double*) alloca(n * sizeof(double));
    assert(zcount);
    
    for (ip=0; ip<new_sn_size; ip++) {
      i = rowind[ip]; assert(i<n);
      zcount[i] = (double) (ip+1);
    }
    for (ip=new_sn_size; ip<new_sn_up_size; ip++) {
      i = rowind[ip]; assert(i<n);
      zcount[i] = (double) new_sn_size;
    }

    /*
    for (ip=0; ip<new_sn_up_size; ip++) 
      printf("row %d zcount = %.0f\n",rowind[ip],zcount[rowind[ip]]);
    */
    
    for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
      for (ip=0; ip<sn_size[c_sn]; ip++) {
      i = sn_rowind[c_sn][ip]; assert(i<n);
      zcount[i] -= (double) (ip+1);
      }
      for (ip=sn_size[c_sn]; ip<sn_up_size[c_sn]; ip++) {
      i = sn_rowind[c_sn][ip]; assert(i<n);
      zcount[i] -= (double) sn_size[c_sn];
      }
    }

    for (ip=0; ip<sn_size[sn]; ip++) {
      i = sn_rowind[sn][ip]; assert(i<n);
      zcount[i] -= (double) (ip+1);
    }
    for (ip=sn_size[sn]; ip<sn_up_size[sn]; ip++) {
      i = sn_rowind[sn][ip]; assert(i<n);
      zcount[i] -= (double) sn_size[sn];
    }

    /*
    for (ip=0; ip<new_sn_up_size; ip++) 
      printf("ROW %d zcount = %.0f\n",rowind[ip],zcount[rowind[ip]]);
    printf("zeros before merging %.0f\n",merged_znz.zeros);
    */
    
    for (ip=0; ip<new_sn_up_size; ip++) {
      i = rowind[ip]; assert(i<n);
      assert(zcount[i] >= 0.0);
      merged_znz.zeros += zcount[i];
    }

    /*printf("zeros after merging %.0f\n",merged_znz.zeros);*/

    /* voodoo constants (need some kind of a utility function */
    if ((new_sn_size < 16)
      ||
      ((sn_size[sn] < 50) && (merged_znz.zeros < 0.5 * merged_znz.nonzeros))
      ||
      ((sn_size[sn] < 250) && (merged_znz.zeros < 0.25 * merged_znz.nonzeros))
      ||
      ((sn_size[sn] < 500) && (merged_znz.zeros < 0.10 * merged_znz.nonzeros))
      ||
      (merged_znz.zeros < 0.05 * merged_znz.nonzeros)
      ) {
      /*
      taucs_printf("merging sn %d, zeros (%f) vs nonzeros (%f)\n",
               sn,merged_znz.zeros,merged_znz.nonzeros);
      */
    } else {
      /*
      taucs_printf("sn %d, too many zeros (%f) vs nonzeros (%f)\n",
               sn,merged_znz.zeros,merged_znz.nonzeros);
      printf("returning without merging\n");
      */
      return sn_znz;
    }
  }

  /* now merge the children lists */

  sn_size[sn]    = new_sn_size;
  sn_up_size[sn] = new_sn_up_size;
  sn_rowind[sn]  = (int*) REALLOC(sn_rowind[sn], 
                          new_sn_up_size * sizeof(int));
  for (ip=0; ip<new_sn_up_size; ip++) sn_rowind[sn][ip] = rowind[ip];

  //  printf("supernode %d out of %d (merging)\n",sn,*n_sn);

  nchildren = 0;
  for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
    for (ip=0; ip<sn_size[c_sn]; ip++) {
      i = (sn_rowind[c_sn])[ip];
      assert(column_to_sn_map[i] == c_sn);
      column_to_sn_map[i] = sn;
    }

    for (gc_sn=sn_first_child[c_sn]; gc_sn != -1; gc_sn = sn_next_child[gc_sn]) {
      rowind[nchildren] = gc_sn;
      nchildren++;
    }
  }

  /* free the children's rowind vectors */
  for (c_sn=sn_first_child[sn]; c_sn != -1; c_sn = sn_next_child[c_sn]) {
    FREE( sn_rowind[c_sn] );
    sn_rowind[c_sn]  = NULL;
    sn_size[c_sn]    = 0;
    sn_up_size[c_sn] = 0;
  }

  sn_first_child[sn] = -1;
  for (i=0; i<nchildren; i++) {
    sn_next_child[ rowind[i] ] = sn_first_child[sn];
    sn_first_child[sn] = rowind[i];
  }    

  /*
  printf("supernode %d out of %d (done)\n",sn,*n_sn);
  printf("returning, merging\n");
  */
  return merged_znz;
}

int      
taucs_ccs_symbolic_elimination(taucs_ccs_matrix* A,
                         void* vL,
                         int do_order
                         )
{
  supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
  int* first_child;
  int* next_child;
  int j;
  int* column_to_sn_map;
  int* map;
  int* rowind;
  int* parent;
  int* ipostorder;

  L->n         = A->n;
  L->sn_struct = (int**)MALLOC((A->n  )*sizeof(int*));
  L->sn_size   = (int*) MALLOC((A->n+1)*sizeof(int));
  
  L->sn_up_size   = (int*) MALLOC((A->n+1)*sizeof(int));
  L->first_child = (int*) MALLOC((A->n+1)*sizeof(int));
  L->next_child  = (int*) MALLOC((A->n+1)*sizeof(int));
  
  column_to_sn_map = (int*)MALLOC((A->n+1)*sizeof(int));
  map              = (int*) MALLOC((A->n+1)*sizeof(int));

  first_child = (int*) MALLOC(((A->n)+1)*sizeof(int));
  next_child  = (int*) MALLOC(((A->n)+1)*sizeof(int));
    
  rowind      = (int*) MALLOC((A->n)*sizeof(int));

  /* compute the vertex elimination tree */
  parent      = (int*)MALLOC((A->n+1)*sizeof(int));

  taucs_ccs_etree(A,parent,NULL,NULL,NULL);

  if (0) {
    int *cc1,*cc2,*rc1,*rc2;
    int *p1;
    int nnz1,nnz2;

    cc1=(int*)MALLOC((A->n)*sizeof(int));
    cc2=(int*)MALLOC((A->n)*sizeof(int));
    rc1=(int*)MALLOC((A->n)*sizeof(int));
    rc2=(int*)MALLOC((A->n)*sizeof(int));
    p1 =(int*)MALLOC((A->n)*sizeof(int));

    taucs_ccs_etree_liu(A,parent,cc1,rc1,&nnz1);

    taucs_ccs_etree(A,p1,cc2,rc2,&nnz2);

    for (j=0; j<(A->n); j++) assert(parent[j]==p1[j]);
    for (j=0; j<(A->n); j++) {
      if (cc1[j]!=cc2[j]) sciprint("j=%d cc1=%d cc2=%d\n",j,cc1[j],cc2[j]);
      assert(cc1[j]==cc2[j]);
    }

    for (j=0; j<(A->n); j++) {
      if (rc1[j]!=rc2[j]) sciprint("j=%d rc1=%d rc2=%d\n",j,rc1[j],rc2[j]);
      assert(rc1[j]==rc2[j]);
    }

    if (nnz1!=nnz2) sciprint("nnz1=%d nnz2=%d\n",nnz1,nnz2);
    
    FREE(cc1); FREE(cc2); FREE(rc1); FREE(rc2);
  }

  for (j=0; j <= (A->n); j++) first_child[j] = -1;
  for (j = (A->n)-1; j >= 0; j--) {
    int p = parent[j];
    next_child[j] = first_child[p];
    first_child[p] = j;
  }
  FREE(parent);

  ipostorder = (int*)MALLOC((A->n+1)*sizeof(int));
  { 
    int next = 0;
    /*int* postorder = (int*)MALLOC((A->n+1)*sizeof(int));*/
    recursive_postorder(A->n,first_child,next_child,
                  NULL,
                  ipostorder,&next);
    /*
    printf("ipostorder ");
    for (j=0; j <= (A->n); j++) printf("%d ",ipostorder[j]);
    printf("\n");
    printf(" postorder ");
    for (j=0; j <= (A->n); j++) printf("%d ",postorder[j]);
    printf("\n");
    */
  }

  L->n_sn = 0;
  for (j=0; j < (A->n); j++) map[j] = -1;
  for (j=0; j <= (A->n); j++) (L->first_child)[j] = (L->next_child)[j] = -1;
  
  recursive_symbolic_elimination(A->n,
                         A,
                         first_child,next_child,
                         &(L->n_sn),
                         L->sn_size,L->sn_up_size,L->sn_struct,
                         L->first_child,L->next_child,
                         rowind,
                         column_to_sn_map,
                         map,
                         do_order,ipostorder
                         );

  {
    double nnz   = 0.0;
    double flops = 0.0;
    int sn,i,colnnz;
    for (sn=0; sn<(L->n_sn); sn++) {
      for (i=0, colnnz = (L->sn_up_size)[sn]; 
         i<(L->sn_size)[sn]; 
         i++, colnnz--) {
      flops += ((double)(colnnz) - 1.0) * ((double)(colnnz) + 2.0) / 2.0;
      nnz   += (double) (colnnz);
      }
    }
    /*sciprint("\t\tSymbolic Analysis of CC^T: %.2e nonzeros, %.2e flops\n",
      nnz, flops); */
  }

  for (j=0; j < (A->n); j++) map[j] = -1;

  (void) recursive_amalgamate_supernodes((L->n_sn) - 1,
                               &(L->n_sn),
                               L->sn_size,L->sn_up_size,L->sn_struct,
                               L->first_child,L->next_child,
                               rowind,
                               column_to_sn_map,
                               map,
                               do_order,ipostorder
                               );


  {
    double nnz   = 0.0;
    double flops = 0.0;
    int sn,i,colnnz;
    for (sn=0; sn<(L->n_sn); sn++) {
      for (i=0, colnnz = (L->sn_up_size)[sn]; 
         i<(L->sn_size)[sn]; 
         i++, colnnz--) {
      flops += ((double)(colnnz) - 1.0) * ((double)(colnnz) + 2.0) / 2.0;
      nnz   += (double) (colnnz);
      }
    }
    /*sciprint("\t\tRelaxed  Analysis of LL^T: %.2e nonzeros, %.2e flops\n",
      nnz, flops); */
  }

  /*
  {
    int i;
    printf("c2sn: ");
    for (i=0; i<A->n; i++) printf("%d ",column_to_sn_map[i]);
    printf("\n");
  }
  */
  


  L->sn_blocks_ld  = (int*)MALLOC((L->n_sn) * sizeof(int));
  L->sn_blocks     = (double**)CALLOC((L->n_sn), sizeof(double*)); /* so we can free before allocation */
  
  L->up_blocks_ld  = (int*)MALLOC((L->n_sn) * sizeof(int));
  L->up_blocks     = (double**)CALLOC((L->n_sn), sizeof(double*));

  FREE(rowind);
  FREE(map);
  FREE(column_to_sn_map);
  FREE(next_child);
  FREE(first_child);
  FREE(ipostorder);

  return 0;
}


/*************************************************************/
/* factor routines                                           */
/*************************************************************/

static supernodal_frontal_matrix*
recursive_multifrontal_supernodal_factor_llt(int sn,       /* this supernode */
                                   int is_root,  /* is v the root? */
                                   int* bitmap,
                                   taucs_ccs_matrix* A,
                                   supernodal_factor_matrix* snL,
                                   int* fail)
{
  supernodal_frontal_matrix* my_matrix=NULL;
  supernodal_frontal_matrix* child_matrix=NULL;
  int child;
  int* v; 
  int  sn_size;
  int* first_child   = snL->first_child;
  int* next_child    = snL->next_child;

  v = &( snL->sn_struct[sn][0] );

  if (!is_root)
    sn_size = snL->sn_size[sn];
  else
    sn_size = -1;

  for (child = first_child[sn]; child != -1; child = next_child[child]) {
    child_matrix = 
      recursive_multifrontal_supernodal_factor_llt(child,
                                       FALSE,
                                       bitmap,
                                       A,snL,fail);
    if (*fail) { 
      if (child_matrix) supernodal_frontal_free(child_matrix);
      return NULL;
    }

    if (!is_root) {
      if (!my_matrix) {
      my_matrix =  supernodal_frontal_create(v,sn_size,
                                     snL->sn_up_size[sn],
                                     snL->sn_struct[sn]);
      if (!my_matrix) {
        *fail = TRUE;
        supernodal_frontal_free(child_matrix);
        return NULL;
      }
      }

      multifrontal_supernodal_front_extend_add(my_matrix,child_matrix,bitmap);
      supernodal_frontal_free(child_matrix);
    }
  }

  /* in case we have no children, we allocate now */
  if (!is_root && !my_matrix) 
    my_matrix =  supernodal_frontal_create(v,sn_size,
                                 snL->sn_up_size[sn],
                                 snL->sn_struct[sn]);

  if(!is_root) {
    if (multifrontal_supernodal_front_factor(sn,
                                   v,sn_size,
                                   A,
                                   my_matrix,
                                   bitmap,
                                   snL)) { 
      /* nonpositive pivot */
      *fail = TRUE;
      supernodal_frontal_free(my_matrix);
      return NULL;
    }
  }
  return my_matrix;
}

void* taucs_ccs_factor_llt_mf(taucs_ccs_matrix* A)
{
  supernodal_factor_matrix* L;
  int* map;
  int fail;

  L = multifrontal_supernodal_create();

  taucs_ccs_symbolic_elimination(A,L,
                         FALSE /* don't sort row indices */ );

  map = (int*)MALLOC((A->n+1)*sizeof(int));

  fail = FALSE;
  recursive_multifrontal_supernodal_factor_llt((L->n_sn),  
                                     TRUE, 
                                     map,
                                     A,L,&fail);

  FREE(map);

  if (fail) {
    taucs_supernodal_factor_free(L);
    return NULL;
  }

  return (void*) L;
}


/*************************************************************/
/* supernodal solve routines                                 */
/*************************************************************/

static void 
recursive_supernodal_solve_l(int sn,       /* this supernode */
                       int is_root,  /* is v the root? */
                       int* first_child, int* next_child,
                       int** sn_struct, int* sn_sizes, int* sn_up_sizes,
                       int* sn_blocks_ld,double* sn_blocks[],
                       int* up_blocks_ld,double* up_blocks[],
                       double x[], double b[],
                       double t[])
{
  int child;
  int  sn_size; /* number of rows/columns in the supernode    */
  int  up_size; /* number of rows that this supernode updates */
  int    ione = 1;
  double done = 1.0;
  double dzero = 0.0;
  double* xdense;
  double* bdense;
  double  flops;
  int i,j,ip,jp;

  for (child = first_child[sn]; child != -1; child = next_child[child]) {
    recursive_supernodal_solve_l(child,
                         FALSE,
                         first_child,next_child,
                         sn_struct,sn_sizes,sn_up_sizes,
                         sn_blocks_ld,sn_blocks,
                         up_blocks_ld,up_blocks,
                         x,b,t);
  }

  if(!is_root) {

    sn_size = sn_sizes[sn];
    up_size = sn_up_sizes[sn] - sn_sizes[sn];

    flops = ((double)sn_size)*((double)sn_size) 
      + 2.0*((double)sn_size)*((double)up_size);

    if (flops > BLAS_FLOPS_CUTOFF) {
      xdense = t;
      bdense = t + sn_size;
      
      for (i=0; i<sn_size; i++)
      xdense[i] = b[ sn_struct[ sn ][ i ] ];
      for (i=0; i<up_size; i++)
      bdense[i] = 0.0;
      
      F2C(dtrsm)("Left",
             "Lower",
             "No Transpose",
             "No unit diagonal",
             &sn_size,&ione,
             &done,
             sn_blocks[sn],&(sn_blocks_ld[sn]),
             xdense       ,&sn_size);
      
      if ((up_size > 0) & (sn_size > 0)) {
      F2C(dgemm)("No Transpose","No Transpose",
               &up_size, &ione, &sn_size,
               &done,
               up_blocks[sn],&(up_blocks_ld[sn]),
               xdense       ,&sn_size,
               &dzero,
               bdense       ,&up_size);
      }
      
      for (i=0; i<sn_size; i++)
      x[ sn_struct[ sn][ i ] ]  = xdense[i];
      for (i=0; i<up_size; i++)
      b[ sn_struct[ sn ][ sn_size + i ] ] -= bdense[i];

    } else if (sn_size > SOLVE_DENSE_CUTOFF) {

      xdense = t;
      bdense = t + sn_size;
      
      for (i=0; i<sn_size; i++)
      xdense[i] = b[ sn_struct[ sn ][ i ] ];
      for (i=0; i<up_size; i++)
      bdense[i] = 0.0;
      
      for (jp=0; jp<sn_size; jp++) {
      xdense[jp] = xdense[jp] / sn_blocks[sn][ sn_blocks_ld[sn]*jp + jp];

      for (ip=jp+1; ip<sn_size; ip++) {
        xdense[ip] -= xdense[jp] * sn_blocks[sn][ sn_blocks_ld[sn]*jp + ip];
      }
      }

      for (jp=0; jp<sn_size; jp++) {
      for (ip=0; ip<up_size; ip++) {
        bdense[ip] += xdense[jp] * up_blocks[sn][ up_blocks_ld[sn]*jp + ip];
      }
      }

      for (i=0; i<sn_size; i++)
      x[ sn_struct[ sn][ i ] ]  = xdense[i];
      for (i=0; i<up_size; i++)
      b[ sn_struct[ sn ][ sn_size + i ] ] -= bdense[i];
      
    } else {

      for (jp=0; jp<sn_size; jp++) {
      j = sn_struct[sn][jp];
      x[j] = b[j] / sn_blocks[sn][ sn_blocks_ld[sn]*jp + jp];
      for (ip=jp+1; ip<sn_size; ip++) {
        i = sn_struct[sn][ip];
        b[i] -= x[j] * sn_blocks[sn][ sn_blocks_ld[sn]*jp + ip];
      }

      for (ip=0; ip<up_size; ip++) {
        i = sn_struct[sn][sn_size + ip];
        b[i] -= x[j] * up_blocks[sn][ up_blocks_ld[sn]*jp + ip];
      }
      }

    }
  }
}

static void 
recursive_supernodal_solve_lt(int sn,       /* this supernode */
                        int is_root,  /* is v the root? */
                        int* first_child, int* next_child,
                        int** sn_struct, int* sn_sizes, int* sn_up_sizes,
                        int* sn_blocks_ld,double* sn_blocks[],
                        int* up_blocks_ld,double* up_blocks[],
                        double x[], double b[],
                        double t[])
{
  int child;
  int  sn_size; /* number of rows/columns in the supernode    */
  int  up_size; /* number of rows that this supernode updates */
  int    ione = 1;
  double done = 1.0;
  double dminusone = -1.0;
  double dzero = 0.0;
  double* xdense;
  double* bdense;
  double  flops;
  int i,j,ip,jp;

  if(!is_root) {

    sn_size = sn_sizes[sn];
    up_size = sn_up_sizes[sn]-sn_sizes[sn];
    
    flops = ((double)sn_size)*((double)sn_size) 
      + 2.0*((double)sn_size)*((double)up_size);

    if (flops > BLAS_FLOPS_CUTOFF) {

      bdense = t;
      xdense = t + sn_size;
      
      for (i=0; i<sn_size; i++)
      bdense[i] = b[ sn_struct[ sn][ i ] ];
      for (i=0; i<up_size; i++)
      xdense[i] = x[ sn_struct[sn][sn_size+i] ];
      
      if ((up_size > 0) & (sn_size > 0))
      F2C(dgemm)("Transpose","No Transpose",
               &sn_size, &ione, &up_size,
               &dminusone,
               up_blocks[sn],&(up_blocks_ld[sn]),
               xdense       ,&up_size,
               &done,
               bdense       ,&sn_size);
      
      F2C(dtrsm)("Left",
             "Lower",
             "Transpose",
             "No unit diagonal",
             &sn_size,&ione,
             &done,
             sn_blocks[sn],&(sn_blocks_ld[sn]),
             bdense       ,&sn_size);
      
      for (i=0; i<sn_size; i++)
      x[ sn_struct[ sn][ i ] ]  = bdense[i];
    
    } else if (sn_size > SOLVE_DENSE_CUTOFF) {

      bdense = t;
      xdense = t + sn_size;
      
      for (i=0; i<sn_size; i++)
      bdense[i] = b[ sn_struct[ sn][ i ] ];
      for (i=0; i<up_size; i++)
      xdense[i] = x[ sn_struct[sn][sn_size+i] ];
      
      for (ip=sn_size-1; ip>=0; ip--) {
      for (jp=0; jp<up_size; jp++) {
        bdense[ip] -= xdense[jp] * up_blocks[sn][ up_blocks_ld[sn]*ip + jp];
      }
      }

      for (ip=sn_size-1; ip>=0; ip--) {
      for (jp=sn_size-1; jp>ip; jp--) {
        bdense[ip] -= bdense[jp] * sn_blocks[sn][ sn_blocks_ld[sn]*ip + jp];
      }
      bdense[ip] = bdense[ip] / sn_blocks[sn][ sn_blocks_ld[sn]*ip + ip];
      }

      for (i=0; i<sn_size; i++)
      x[ sn_struct[ sn][ i ] ]  = bdense[i];
    
    } else {

      for (ip=sn_size-1; ip>=0; ip--) {
      i = sn_struct[sn][ip];

      for (jp=0; jp<up_size; jp++) {
        j = sn_struct[sn][sn_size + jp];
        b[i] -= x[j] * up_blocks[sn][ up_blocks_ld[sn]*ip + jp];
      }

      for (jp=sn_size-1; jp>ip; jp--) {
        j = sn_struct[sn][jp];
        b[i] -= x[j] * sn_blocks[sn][ sn_blocks_ld[sn]*ip + jp];
      }
      x[i] = b[i] / sn_blocks[sn][ sn_blocks_ld[sn]*ip + ip];

      }

    }
  }

  for (child = first_child[sn]; child != -1; child = next_child[child]) {
    recursive_supernodal_solve_lt(child,
                          FALSE,
                          first_child,next_child,
                          sn_struct,sn_sizes,sn_up_sizes,
                          sn_blocks_ld,sn_blocks,
                          up_blocks_ld,up_blocks,
                          x,b,t);
  }
}

int taucs_supernodal_solve_llt(void* vL,
                         double* x, double* b)
{
  supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
  double* y;
  double* t; /* temporary vector */
  int     i;
  
  y = MALLOC((L->n) * sizeof(double));
  t = MALLOC((L->n) * sizeof(double));
  if (!y || !t) {
    FREE(y);
    FREE(t);
      sciprint(_("%s: No more memory.\n"),"multifrontal_supernodal_solve_llt");
    return -1;
  }

  for (i=0; i<L->n; i++) x[i] = b[i];

  recursive_supernodal_solve_l (L->n_sn,
                        TRUE,  /* this is the root */
                        L->first_child, L->next_child,
                        L->sn_struct,L->sn_size,L->sn_up_size,
                        L->sn_blocks_ld, L->sn_blocks,
                        L->up_blocks_ld, L->up_blocks,
                        y, x, t);

  recursive_supernodal_solve_lt(L->n_sn,
                        TRUE,  /* this is the root */
                        L->first_child, L->next_child,
                        L->sn_struct,L->sn_size,L->sn_up_size,
                        L->sn_blocks_ld, L->sn_blocks,
                        L->up_blocks_ld, L->up_blocks,
                        x, y, t);

  FREE(y);
  FREE(t);
    
  return 0;
}

/*****************************************************
 *  some utility routines                            *
 *****************************************************/

taucs_ccs_matrix*
taucs_supernodal_factor_to_ccs(void* vL)
{
  supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
  taucs_ccs_matrix* C;
  int n,nnz;
  int i,j,ip,jp,sn,next;
  double v;
  int* len;

  n = L->n;

  len = (int*) MALLOC(n*sizeof(int));
  if (!len) return NULL;

  nnz = 0;

  for (sn=0; sn<L->n_sn; sn++) {
    for (jp=0; jp<(L->sn_size)[sn]; jp++) {
      j = (L->sn_struct)[sn][jp];
      len[j] = 0;

      for (ip=jp; ip<(L->sn_size)[sn]; ip++) {
      i = (L->sn_struct)[sn][ ip ];
      v = (L->sn_blocks)[sn][ jp*(L->sn_blocks_ld)[sn] + ip ];

      if (v) { 
        len[j] ++;
        nnz ++;
      }
      }
      for (ip=(L->sn_size)[sn]; ip<(L->sn_up_size)[sn]; ip++) {
      i = (L->sn_struct)[sn][ ip ];
      v = (L->up_blocks)[sn][ jp*(L->up_blocks_ld)[sn] + (ip-(L->sn_size)[sn]) ];

      if (v) { 
        len[j] ++;
        nnz ++;
      }
      }
    }
  }


  C = taucs_ccs_create(n,n,nnz);
  if (!C) {
    FREE(len);
    return NULL;
  }
  C->flags = TAUCS_TRIANGULAR | TAUCS_LOWER;

  (C->colptr)[0] = 0;
  for (j=1; j<=n; j++) (C->colptr)[j] = (C->colptr)[j-1] + len[j-1];

  FREE(len);

  for (sn=0; sn<L->n_sn; sn++) {
    for (jp=0; jp<(L->sn_size)[sn]; jp++) {
      j = (L->sn_struct)[sn][jp];

      next = (C->colptr)[j];

      for (ip=jp; ip<(L->sn_size)[sn]; ip++) {
      i = (L->sn_struct)[sn][ ip ];
      v = (L->sn_blocks)[sn][ jp*(L->sn_blocks_ld)[sn] + ip ];

      if (v == 0.0) continue;

      (C->rowind)[next] = i;
      (C->values)[next] = v;
      next++;
      }
      for (ip=(L->sn_size)[sn]; ip<(L->sn_up_size)[sn]; ip++) {
      i = (L->sn_struct)[sn][ ip ];
      v = (L->up_blocks)[sn][ jp*(L->up_blocks_ld)[sn] + (ip-(L->sn_size)[sn]) ];

      if (v == 0.0) continue;

      (C->rowind)[next] = i;
      (C->values)[next] = v;
      next++;
      }
    }
  }

  return C;
}

/* 
 *   from the preeeding routine here is a small one to get only
 *   the number of non zero elements of the factor (Bruno le 29/11/2001)
 *   In fact the supernodal struct keeps on some exact zeros and so the
 *   nnz get here is representative of the memory used by the supernodal
 *   struc.
 */

int taucs_get_nnz_from_supernodal_factor(void* vL)
{
  supernodal_factor_matrix* L = (supernodal_factor_matrix*) vL;
  int n,nnz;
  int jp,sn;

  n = L->n;

  nnz = 0;

  for (sn=0; sn<L->n_sn; sn++)
    for (jp=0; jp<(L->sn_size)[sn]; jp++) 
      nnz += (L->sn_up_size)[sn] - jp;

  return ( nnz );
}




Generated by  Doxygen 1.6.0   Back to index