#include "OpenNL_psm.h" /* * Copyright (c) 2004-2010, Bruno Levy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the ALICE Project-Team nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * If you modify this software, you should include a notice giving the * name of the person performing the modification, the date of modification, * and the reason for such modification. * * Contact: Bruno Levy * * levy@loria.fr * * ALICE Project * LORIA, INRIA Lorraine, * Campus Scientifique, BP 239 * 54506 VANDOEUVRE LES NANCY CEDEX * FRANCE * */ /* * This file is a PSM (pluggable software module) * generated from the distribution of Geogram. 
 *
 * See Geogram documentation on:
 * http://alice.loria.fr/software/geogram/doc/html/index.html
 *
 * See documentation of the functions bundled in this PSM on:
 * http://alice.loria.fr/software/geogram/doc/html/nl_8h.html
 */

/******* extracted from nl_private.h *******/

#ifndef OPENNL_PRIVATE_H
#define OPENNL_PRIVATE_H

/* standard headers used by this PSM (memory, I/O, string and math routines) */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#if defined(__APPLE__) && defined(__MACH__)
#define NL_OS_APPLE
#endif

#if defined(__linux__) || defined(__ANDROID__) || defined(NL_OS_APPLE)
#define NL_OS_UNIX
#endif

#if defined(WIN32) || defined(_WIN64)
#define NL_OS_WINDOWS
#endif

#define nl_arg_used(x) (void)x

#if defined(__clang__) || defined(__GNUC__)
#define NL_NORETURN __attribute__((noreturn))
#else
#define NL_NORETURN
#endif

#if defined(_MSC_VER)
#define NL_NORETURN_DECL __declspec(noreturn)
#else
#define NL_NORETURN_DECL
#endif

NL_NORETURN_DECL void nl_assertion_failed(
    const char* cond, const char* file, int line
) NL_NORETURN;

NL_NORETURN_DECL void nl_range_assertion_failed(
    double x, double min_val, double max_val, const char* file, int line
) NL_NORETURN;

NL_NORETURN_DECL void nl_should_not_have_reached(
    const char* file, int line
) NL_NORETURN;

#define nl_assert(x) {                                   \
    if(!(x)) {                                           \
        nl_assertion_failed(#x,__FILE__, __LINE__) ;     \
    }                                                    \
}

#define nl_range_assert(x,min_val,max_val) {             \
    if(((x) < (min_val)) || ((x) > (max_val))) {         \
        nl_range_assertion_failed(x, min_val, max_val,   \
            __FILE__, __LINE__                           \
        ) ;                                              \
    }                                                    \
}

#define nl_assert_not_reached {                          \
    nl_should_not_have_reached(__FILE__, __LINE__) ;     \
}

#ifdef NL_DEBUG
#define nl_debug_assert(x) nl_assert(x)
#define nl_debug_range_assert(x,min_val,max_val)         \
    nl_range_assert(x,min_val,max_val)
#else
#define nl_debug_assert(x)
#define nl_debug_range_assert(x,min_val,max_val)
#endif

#ifdef NL_PARANOID
#define nl_parano_assert(x) nl_assert(x)
#define nl_parano_range_assert(x,min_val,max_val)        \
    nl_range_assert(x,min_val,max_val)
#else
#define nl_parano_assert(x)
#define nl_parano_range_assert(x,min_val,max_val)
#endif

void nlError(const char* function, const char* message) ;
void nlWarning(const char* function, const char* message) ;

NLdouble nlCurrentTime(void);

typedef void* NLdll;

#define NL_LINK_NOW          1
#define NL_LINK_LAZY         2
#define NL_LINK_GLOBAL       4
#define NL_LINK_QUIET        8
#define NL_LINK_USE_FALLBACK 16

NLdll nlOpenDLL(const char* filename, NLenum flags);
void nlCloseDLL(NLdll handle);
NLfunc nlFindFunction(NLdll handle, const char* funcname);

/* classic macros */

#ifndef MIN
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
#endif

#ifndef MAX
#define MAX(x,y) (((x) > (y)) ?
(x) : (y)) #endif #define NL_NEW(T) (T*)(calloc(1, sizeof(T))) #define NL_NEW_ARRAY(T,NB) (T*)(calloc((size_t)(NB),sizeof(T))) #define NL_RENEW_ARRAY(T,x,NB) (T*)(realloc(x,(size_t)(NB)*sizeof(T))) #define NL_DELETE(x) free(x); x = NULL #define NL_DELETE_ARRAY(x) free(x); x = NULL #define NL_CLEAR(T, x) memset(x, 0, sizeof(T)) #define NL_CLEAR_ARRAY(T,x,NB) memset(x, 0, (size_t)(NB)*sizeof(T)) #define NL_UINT_MAX 0xffffffff #define NL_USHORT_MAX 0xffff extern NLprintfFunc nl_printf; extern NLfprintfFunc nl_fprintf; #endif /******* extracted from nl_blas.h *******/ #ifndef OPENNL_BLAS_H #define OPENNL_BLAS_H struct NLBlas; typedef struct NLBlas* NLBlas_t; typedef enum { NoTranspose=0, Transpose=1, ConjugateTranspose=2 } MatrixTranspose ; typedef enum { UpperTriangle=0, LowerTriangle=1 } MatrixTriangle ; typedef enum { UnitTriangular=0, NotUnitTriangular=1 } MatrixUnitTriangular ; typedef enum { NL_HOST_MEMORY, NL_DEVICE_MEMORY } NLmemoryType; typedef void* (*FUNPTR_malloc)( NLBlas_t blas, NLmemoryType type, size_t size ); typedef void (*FUNPTR_free)( NLBlas_t blas, NLmemoryType type, size_t size, void* ptr ); typedef void (*FUNPTR_memcpy)( NLBlas_t blas, void* to, NLmemoryType to_type, void* from, NLmemoryType from_type, size_t size ); typedef void (*FUNPTR_dcopy)( NLBlas_t blas, int n, const double *x, int incx, double *y, int incy ); typedef void (*FUNPTR_dscal)( NLBlas_t blas, int n, double a, double *x, int incx ); typedef double (*FUNPTR_ddot)( NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy ); typedef double (*FUNPTR_dnrm2)(NLBlas_t blas, int n, const double *x, int incx); typedef void (*FUNPTR_daxpy)( NLBlas_t blas, int n, double a, const double *x, int incx, double *y, int incy ); typedef void (*FUNPTR_dgemv)( NLBlas_t blas, MatrixTranspose trans, int m, int n, double alpha, const double *A, int ldA, const double *x, int incx, double beta, double *y, int incy ); typedef void (*FUNPTR_dtpsv)( NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans, MatrixUnitTriangular diag, int n, const double *AP, double *x, int incx ); struct NLBlas { FUNPTR_malloc Malloc; FUNPTR_free Free; FUNPTR_memcpy Memcpy; FUNPTR_dcopy Dcopy; FUNPTR_dscal Dscal; FUNPTR_ddot Ddot; FUNPTR_dnrm2 Dnrm2; FUNPTR_daxpy Daxpy; FUNPTR_dgemv Dgemv; FUNPTR_dtpsv Dtpsv; NLboolean has_unified_memory; double start_time; NLulong flops; NLulong used_ram[2]; NLulong max_used_ram[2]; /* * Used for stats of the linear solver * (a bit ugly, should not be here, but * more convenient for now...) 
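 *
 * sq_bnorm holds the squared norm of the right-hand side and sq_rnorm
 * the squared norm of the final residual, filled in by the iterative
 * solver so that the relative error sqrt(sq_rnorm/sq_bnorm) can be
 * reported without an extra matrix-vector product.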
*/ double sq_rnorm; double sq_bnorm; }; NLboolean nlBlasHasUnifiedMemory(NLBlas_t blas); void nlBlasResetStats(NLBlas_t blas); double nlBlasGFlops(NLBlas_t blas); NLulong nlBlasUsedRam(NLBlas_t blas, NLmemoryType type); NLulong nlBlasMaxUsedRam(NLBlas_t blas, NLmemoryType type); NLBlas_t nlHostBlas(void); #define NL_NEW_VECTOR(blas, memtype, dim) \ (double*)blas->Malloc(blas,memtype,(size_t)(dim)*sizeof(double)) #define NL_DELETE_VECTOR(blas, memtype, dim, ptr) \ blas->Free(blas,memtype,(size_t)(dim)*sizeof(double),ptr) #endif /******* extracted from nl_matrix.h *******/ #ifndef OPENNL_MATRIX_H #define OPENNL_MATRIX_H #ifdef __cplusplus extern "C" { #endif /* Abstract matrix interface */ struct NLMatrixStruct; typedef struct NLMatrixStruct* NLMatrix; typedef void(*NLDestroyMatrixFunc)(NLMatrix M); typedef void(*NLMultMatrixVectorFunc)(NLMatrix M, const double* x, double* y); #define NL_MATRIX_SPARSE_DYNAMIC 0x1001 #define NL_MATRIX_CRS 0x1002 #define NL_MATRIX_SUPERLU_EXT 0x1003 #define NL_MATRIX_CHOLMOD_EXT 0x1004 #define NL_MATRIX_FUNCTION 0x1005 #define NL_MATRIX_OTHER 0x1006 struct NLMatrixStruct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; }; NLAPI void NLAPIENTRY nlDeleteMatrix(NLMatrix M); NLAPI void NLAPIENTRY nlMultMatrixVector( NLMatrix M, const double* x, double* y ); /* Dynamic arrays for sparse row/columns */ typedef struct { NLuint index; NLdouble value; } NLCoeff; typedef struct { NLuint size; NLuint capacity; NLCoeff* coeff; } NLRowColumn; NLAPI void NLAPIENTRY nlRowColumnConstruct(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnDestroy(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnGrow(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnAdd( NLRowColumn* c, NLuint index, NLdouble value ); NLAPI void NLAPIENTRY nlRowColumnAppend( NLRowColumn* c, NLuint index, NLdouble value ); NLAPI void NLAPIENTRY nlRowColumnZero(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnClear(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnSort(NLRowColumn* c); /* Compressed Row Storage */ typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLdouble* val; NLuint* rowptr; NLuint* colind; NLuint nslices; NLuint* sliceptr; NLboolean symmetric_storage; } NLCRSMatrix; NLAPI void NLAPIENTRY nlCRSMatrixConstruct( NLCRSMatrix* M, NLuint m, NLuint n, NLuint nnz, NLuint nslices ); NLAPI void NLAPIENTRY nlCRSMatrixConstructSymmetric( NLCRSMatrix* M, NLuint n, NLuint nnz ); NLAPI NLboolean NLAPIENTRY nlCRSMatrixLoad( NLCRSMatrix* M, const char* filename ); NLAPI NLboolean NLAPIENTRY nlCRSMatrixSave( NLCRSMatrix* M, const char* filename ); NLAPI NLuint NLAPIENTRY nlCRSMatrixNNZ(NLCRSMatrix* M); /* SparseMatrix data structure */ #define NL_MATRIX_STORE_ROWS 1 #define NL_MATRIX_STORE_COLUMNS 2 #define NL_MATRIX_STORE_SYMMETRIC 4 typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLuint diag_size; NLuint diag_capacity; NLenum storage; NLRowColumn* row; NLRowColumn* column; NLdouble* diag; NLuint row_capacity; NLuint column_capacity; } NLSparseMatrix; NLAPI NLMatrix NLAPIENTRY nlSparseMatrixNew( NLuint m, NLuint n, NLenum storage ); NLAPI void NLAPIENTRY nlSparseMatrixConstruct( NLSparseMatrix* M, NLuint m, NLuint n, NLenum storage ); NLAPI void NLAPIENTRY nlSparseMatrixDestroy(NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixMult( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ); NLAPI void NLAPIENTRY 
nlSparseMatrixAdd( NLSparseMatrix* M, NLuint i, NLuint j, NLdouble value ); NLAPI void NLAPIENTRY nlSparseMatrixAddMatrix( NLSparseMatrix* M, double mul, const NLMatrix N ); NLAPI void NLAPIENTRY nlSparseMatrixZero( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixClear( NLSparseMatrix* M); NLAPI NLuint NLAPIENTRY nlSparseMatrixNNZ( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixSort( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixAddRow( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixAddColumn( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixMAddRow( NLSparseMatrix* M, NLuint i1, double s, NLuint i2 ); NLAPI void NLAPIENTRY nlSparseMatrixScaleRow( NLSparseMatrix* M, NLuint i, double s ); NLAPI void NLAPIENTRY nlSparseMatrixZeroRow( NLSparseMatrix* M, NLuint i ); NLAPI NLMatrix NLAPIENTRY nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix* M); NLAPI NLMatrix NLAPIENTRY nlCRSMatrixNewFromSparseMatrixSymmetric( NLSparseMatrix* M ); NLAPI void NLAPIENTRY nlMatrixCompress(NLMatrix* M); NLAPI NLuint NLAPIENTRY nlMatrixNNZ(NLMatrix M); NLAPI NLMatrix NLAPIENTRY nlMatrixFactorize(NLMatrix M, NLenum solver); typedef void(*NLMatrixFunc)(const double* x, double* y); NLAPI NLMatrix NLAPIENTRY nlMatrixNewFromFunction( NLuint m, NLuint n, NLMatrixFunc func ); NLAPI NLMatrixFunc NLAPIENTRY nlMatrixGetFunction(NLMatrix M); NLAPI NLMatrix NLAPIENTRY nlMatrixNewFromProduct( NLMatrix M, NLboolean product_owns_M, NLMatrix N, NLboolean product_owns_N ); #ifdef __cplusplus } #endif #endif /******* extracted from nl_context.h *******/ #ifndef OPENNL_CONTEXT_H #define OPENNL_CONTEXT_H /* NLContext data structure */ typedef NLboolean(*NLSolverFunc)(void); typedef void(*NLProgressFunc)( NLuint cur_iter, NLuint max_iter, double cur_err, double max_err ); #define NL_STATE_INITIAL 0 #define NL_STATE_SYSTEM 1 #define NL_STATE_MATRIX 2 #define NL_STATE_ROW 3 #define NL_STATE_MATRIX_CONSTRUCTED 4 #define NL_STATE_SYSTEM_CONSTRUCTED 5 #define NL_STATE_SOLVED 6 typedef struct { void* base_address; NLuint stride; } NLBufferBinding; #define NL_BUFFER_ITEM(B,i) \ *(double*)((void*)((char*)((B).base_address)+((i)*(B).stride))) typedef struct { NLenum state; NLboolean user_variable_buffers; NLBufferBinding* variable_buffer; NLdouble* variable_value; NLboolean* variable_is_locked; NLuint* variable_index; NLuint n; NLenum matrix_mode; NLMatrix M; NLMatrix P; NLMatrix B; NLRowColumn af; NLRowColumn al; NLdouble* x; NLdouble* b; NLdouble* right_hand_side; NLdouble row_scaling; NLenum solver; NLenum preconditioner; NLboolean preconditioner_defined; NLuint nb_variables; NLuint nb_systems; NLboolean ij_coefficient_called; NLuint current_row; NLboolean least_squares; NLboolean symmetric; NLuint max_iterations; NLboolean max_iterations_defined; NLuint inner_iterations; NLdouble threshold; NLboolean threshold_defined; NLdouble omega; NLboolean normalize_rows; NLuint used_iterations; NLdouble error; NLdouble start_time; NLdouble elapsed_time; NLSolverFunc solver_func; NLProgressFunc progress_func; NLboolean verbose; NLulong flops; NLenum eigen_solver; NLdouble eigen_shift; NLboolean eigen_shift_invert; NLdouble* eigen_value; NLdouble* temp_eigen_value; } NLContextStruct; extern NLContextStruct* nlCurrentContext; void nlCheckState(NLenum state); void nlTransition(NLenum from_state, NLenum to_state); NLboolean nlDefaultSolver(void); #endif /******* extracted from nl_iterative_solvers.h *******/ #ifndef OPENNL_ITERATIVE_SOLVERS_H #define OPENNL_ITERATIVE_SOLVERS_H NLAPI NLuint NLAPIENTRY 
nlSolveSystemIterative(
    NLBlas_t blas,
    NLMatrix M, NLMatrix P, NLdouble* b, NLdouble* x,
    NLenum solver, double eps, NLuint max_iter, NLuint inner_iter
);

#endif

/******* extracted from nl_preconditioners.h *******/

#ifndef OPENNL_PRECONDITIONERS_H
#define OPENNL_PRECONDITIONERS_H

/* preconditioners */

NLMatrix nlNewJacobiPreconditioner(NLMatrix M);
NLMatrix nlNewSSORPreconditioner(NLMatrix M, double omega);

#endif

/******* extracted from nl_superlu.h *******/

#ifndef OPENNL_SUPERLU_H
#define OPENNL_SUPERLU_H

NLAPI NLMatrix NLAPIENTRY nlMatrixFactorize_SUPERLU(
    NLMatrix M, NLenum solver
);

NLboolean nlInitExtension_SUPERLU(void);
NLboolean nlExtensionIsInitialized_SUPERLU(void);

#endif

/******* extracted from nl_cholmod.h *******/

#ifndef OPENNL_CHOLMOD_H
#define OPENNL_CHOLMOD_H

NLAPI NLMatrix NLAPIENTRY nlMatrixFactorize_CHOLMOD(
    NLMatrix M, NLenum solver
);

NLboolean nlInitExtension_CHOLMOD(void);
NLboolean nlExtensionIsInitialized_CHOLMOD(void);

#endif

/******* extracted from nl_arpack.h *******/

#ifndef OPENNL_ARPACK_H
#define OPENNL_ARPACK_H

NLboolean nlInitExtension_ARPACK(void);
NLboolean nlExtensionIsInitialized_ARPACK(void);
void nlEigenSolve_ARPACK(void);

#endif

/******* extracted from nl_mkl.h *******/

#ifndef OPENNL_MKL_H
#define OPENNL_MKL_H

NLboolean nlInitExtension_MKL(void);
NLboolean nlExtensionIsInitialized_MKL(void);
extern NLMultMatrixVectorFunc NLMultMatrixVector_MKL;

#endif

/******* extracted from nl_cuda.h *******/

#ifndef OPENNL_CUDA_EXT_H
#define OPENNL_CUDA_EXT_H

NLboolean nlInitExtension_CUDA(void);
NLboolean nlExtensionIsInitialized_CUDA(void);
NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M);
NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M);
NLBlas_t nlCUDABlas(void);

#endif

/******* extracted from nl_os.c *******/

#if (defined (WIN32) || defined(_WIN64))
#include <windows.h>   /* GetTickCount(), LoadLibrary(), GetProcAddress() */
#else
#include <sys/types.h> /* clock_t */
#include <sys/times.h> /* times(), struct tms */
#endif

#if defined(GEO_DYNAMIC_LIBS) && defined(NL_OS_UNIX)
#include <dlfcn.h>     /* dlopen(), dlsym(), dlclose(), dlerror() */
#endif

/* Assertions */

void nl_assertion_failed(const char* cond, const char* file, int line) {
    nl_fprintf(
        stderr,
        "OpenNL assertion failed: %s, file:%s, line:%d\n",
        cond,file,line
    ) ;
    abort() ;
}

void nl_range_assertion_failed(
    double x, double min_val, double max_val, const char* file, int line
) {
    nl_fprintf(
        stderr,
        "OpenNL range assertion failed: "
        "%f in [ %f ...
%f ], file:%s, line:%d\n", x, min_val, max_val, file,line ) ; abort() ; } void nl_should_not_have_reached(const char* file, int line) { nl_fprintf( stderr, "OpenNL should not have reached this point: file:%s, line:%d\n", file,line ) ; abort() ; } /* Timing */ #ifdef WIN32 NLdouble nlCurrentTime() { return (NLdouble)GetTickCount() / 1000.0 ; } #else double nlCurrentTime() { clock_t user_clock ; struct tms user_tms ; user_clock = times(&user_tms) ; return (NLdouble)user_clock / 100.0 ; } #endif /* DLLs/shared objects/dylibs */ #if defined(GEO_DYNAMIC_LIBS) # if defined(NL_OS_UNIX) NLdll nlOpenDLL(const char* name, NLenum flags_in) { void* result = NULL; int flags = 0; if((flags_in & NL_LINK_NOW) != 0) { flags |= RTLD_NOW; } if((flags_in & NL_LINK_LAZY) != 0) { flags |= RTLD_LAZY; } if((flags_in & NL_LINK_GLOBAL) != 0) { flags |= RTLD_GLOBAL; } if((flags_in & NL_LINK_QUIET) == 0) { nl_fprintf(stdout,"Trying to load %s\n", name); } result = dlopen(name, flags); if(result == NULL) { if((flags_in & NL_LINK_QUIET) == 0) { nl_fprintf(stderr,"Did not find %s,\n", name); nl_fprintf(stderr,"Retrying with libgeogram_num_3rdparty.so\n"); } if((flags_in & NL_LINK_USE_FALLBACK) != 0) { result=dlopen("libgeogram_num_3rdparty.so", flags); if(result == NULL) { if((flags_in & NL_LINK_QUIET) == 0) { nlError("nlOpenDLL/dlopen",dlerror()); } } } } if((flags_in & NL_LINK_QUIET) == 0 && result != NULL) { nl_fprintf(stdout,"Loaded %s\n", name); } return result; } void nlCloseDLL(void* handle) { dlclose(handle); } NLfunc nlFindFunction(void* handle, const char* name) { /* * It is not legal in modern C to cast a void* * pointer into a function pointer, thus requiring this * (quite dirty) function that uses a union. */ union { void* ptr; NLfunc fptr; } u; u.ptr = dlsym(handle, name); return u.fptr; } # elif defined(NL_OS_WINDOWS) NLdll nlOpenDLL(const char* name, NLenum flags) { /* Note: NL_LINK_LAZY and NL_LINK_GLOBAL are ignored. 
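       On Windows, LoadLibrary() always resolves the DLL's imports at load
       time and GetProcAddress() only searches the given module, so there is
       no direct equivalent of RTLD_LAZY / RTLD_GLOBAL to map these flags to.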
*/ void* result = LoadLibrary(name); if(result == NULL && ((flags & NL_LINK_USE_FALLBACK) != 0)) { if((flags & NL_LINK_QUIET) == 0) { nl_fprintf(stderr,"Did not find %s,\n", name); nl_fprintf(stderr,"Retrying with geogram_num_3rdparty\n"); } result=LoadLibrary("geogram_num_3rdparty.dll"); } return result; } void nlCloseDLL(void* handle) { FreeLibrary((HMODULE)handle); } NLfunc nlFindFunction(void* handle, const char* name) { return (NLfunc)GetProcAddress((HMODULE)handle, name); } # endif #else NLdll nlOpenDLL(const char* name, NLenum flags) { nl_arg_used(name); nl_arg_used(flags); #ifdef NL_OS_UNIX nlError("nlOpenDLL","Was not compiled with dynamic linking enabled"); nlError("nlOpenDLL","(see VORPALINE_BUILD_DYNAMIC in CMakeLists.txt)"); #else nlError("nlOpenDLL","Not implemented"); #endif return NULL; } void nlCloseDLL(void* handle) { nl_arg_used(handle); nlError("nlCloseDLL","Not implemented"); } NLfunc nlFindFunction(void* handle, const char* name) { nl_arg_used(handle); nl_arg_used(name); nlError("nlFindFunction","Not implemented"); return NULL; } #endif /* Error-reporting functions */ NLprintfFunc nl_printf = printf; NLfprintfFunc nl_fprintf = fprintf; void nlError(const char* function, const char* message) { nl_fprintf(stderr, "OpenNL error in %s(): %s\n", function, message) ; } void nlWarning(const char* function, const char* message) { nl_fprintf(stderr, "OpenNL warning in %s(): %s\n", function, message) ; } void nlPrintfFuncs(NLprintfFunc f1, NLfprintfFunc f2) { nl_printf = f1; nl_fprintf = f2; } /******* extracted from nl_matrix.c *******/ /* Some warnings about const cast in callback for qsort() function. */ #ifdef __clang__ #pragma GCC diagnostic ignored "-Wcast-qual" #endif void nlDeleteMatrix(NLMatrix M) { if(M == NULL) { return; } M->destroy_func(M); NL_DELETE(M); } void nlMultMatrixVector( NLMatrix M, const double* x, double* y ) { M->mult_func(M,x,y); } void nlRowColumnConstruct(NLRowColumn* c) { c->size = 0; c->capacity = 0; c->coeff = NULL; } void nlRowColumnDestroy(NLRowColumn* c) { NL_DELETE_ARRAY(c->coeff); c->size = 0; c->capacity = 0; } void nlRowColumnGrow(NLRowColumn* c) { if(c->capacity != 0) { c->capacity = 2 * c->capacity; c->coeff = NL_RENEW_ARRAY(NLCoeff, c->coeff, c->capacity); } else { c->capacity = 4; c->coeff = NL_NEW_ARRAY(NLCoeff, c->capacity); } } void nlRowColumnAdd(NLRowColumn* c, NLuint index, NLdouble value) { NLuint i; for(i=0; isize; i++) { if(c->coeff[i].index == index) { c->coeff[i].value += value; return; } } if(c->size == c->capacity) { nlRowColumnGrow(c); } c->coeff[c->size].index = index; c->coeff[c->size].value = value; c->size++; } /* Does not check whether the index already exists */ void nlRowColumnAppend(NLRowColumn* c, NLuint index, NLdouble value) { if(c->size == c->capacity) { nlRowColumnGrow(c); } c->coeff[c->size].index = index; c->coeff[c->size].value = value; c->size++; } void nlRowColumnZero(NLRowColumn* c) { c->size = 0; } void nlRowColumnClear(NLRowColumn* c) { c->size = 0; c->capacity = 0; NL_DELETE_ARRAY(c->coeff); } static int nlCoeffCompare(const void* p1, const void* p2) { return (((NLCoeff*)(p2))->index < ((NLCoeff*)(p1))->index); } void nlRowColumnSort(NLRowColumn* c) { qsort(c->coeff, c->size, sizeof(NLCoeff), nlCoeffCompare); } /* CRSMatrix data structure */ static void nlCRSMatrixDestroy(NLCRSMatrix* M) { NL_DELETE_ARRAY(M->val); NL_DELETE_ARRAY(M->rowptr); NL_DELETE_ARRAY(M->colind); NL_DELETE_ARRAY(M->sliceptr); M->m = 0; M->n = 0; M->nslices = 0; } NLboolean nlCRSMatrixSave(NLCRSMatrix* M, const char* filename) { 
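    /*
     * Binary dump of the CRS arrays, in the order expected by
     * nlCRSMatrixLoad() below: m, n and nnz as NLuint, then rowptr
     * (m+1 NLuint), colind (nnz NLuint) and val (nnz double).
     * Raw fwrite() is used, so the file is only portable between
     * machines with the same endianness and type sizes.
     */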
    NLuint nnz = M->rowptr[M->m];
    FILE* f = fopen(filename, "wb");
    if(f == NULL) {
        nlError("nlCRSMatrixSave", "Could not open file");
        return NL_FALSE;
    }
    fwrite(&M->m, sizeof(NLuint), 1, f);
    fwrite(&M->n, sizeof(NLuint), 1, f);
    fwrite(&nnz, sizeof(NLuint), 1, f);
    fwrite(M->rowptr, sizeof(NLuint), M->m+1, f);
    fwrite(M->colind, sizeof(NLuint), nnz, f);
    fwrite(M->val, sizeof(double), nnz, f);
    fclose(f);
    return NL_TRUE;
}

NLboolean nlCRSMatrixLoad(NLCRSMatrix* M, const char* filename) {
    NLuint nnz = 0;
    FILE* f = fopen(filename, "rb");
    NLboolean truncated = NL_FALSE;
    if(f == NULL) {
        nlError("nlCRSMatrixLoad", "Could not open file");
        return NL_FALSE;
    }
    truncated = truncated || (
        fread(&M->m, sizeof(NLuint), 1, f) != 1 ||
        fread(&M->n, sizeof(NLuint), 1, f) != 1 ||
        fread(&nnz, sizeof(NLuint), 1, f) != 1
    );
    if(truncated) {
        M->rowptr = NULL;
        M->colind = NULL;
        M->val = NULL;
    } else {
        M->rowptr = NL_NEW_ARRAY(NLuint, M->m+1);
        M->colind = NL_NEW_ARRAY(NLuint, nnz);
        M->val = NL_NEW_ARRAY(double, nnz);
        truncated = truncated || (
            fread(M->rowptr, sizeof(NLuint), M->m+1, f) != M->m+1 ||
            fread(M->colind, sizeof(NLuint), nnz, f) != nnz ||
            fread(M->val, sizeof(double), nnz, f) != nnz
        );
    }
    if(truncated) {
        nlError("nlCRSMatrixLoad", "File appears to be truncated");
        NL_DELETE_ARRAY(M->rowptr);
        NL_DELETE_ARRAY(M->colind);
        NL_DELETE_ARRAY(M->val);
        fclose(f);
        return NL_FALSE;
    } else {
        M->nslices = 1;
        M->sliceptr = NL_NEW_ARRAY(NLuint, M->nslices+1);
        M->sliceptr[0] = 0;
        M->sliceptr[1] = M->m;
    }
    fclose(f);
    return NL_TRUE;
}

NLuint nlCRSMatrixNNZ(NLCRSMatrix* M) {
    return M->rowptr[M->m];
}

static void nlCRSMatrixMultSlice(
    NLCRSMatrix* M, const double* x, double* y,
    NLuint Ibegin, NLuint Iend
) {
    NLuint i,j;
    for(i=Ibegin; i<Iend; ++i) {
        NLdouble sum = 0.0;
        for(j=M->rowptr[i]; j<M->rowptr[i+1]; ++j) {
            sum += M->val[j] * x[M->colind[j]];
        }
        y[i] = sum;
    }
}

static void nlCRSMatrixMult(
    NLCRSMatrix* M, const double* x, double* y
) {
    int slice;
    int nslices = (int)(M->nslices);
    NLuint i,j,jj;
    NLdouble a;
    if(M->symmetric_storage) {
        for(i=0; i<M->m; ++i) {
            y[i] = 0.0;
        }
        for(i=0; i<M->m; ++i) {
            for(jj=M->rowptr[i]; jj<M->rowptr[i+1]; ++jj) {
                a = M->val[jj];
                j = M->colind[jj];
                y[i] += a * x[j];
                if(j != i) {
                    y[j] += a * x[i];
                }
            }
        }
    } else {
#if defined(_OPENMP)
#pragma omp parallel for private(slice)
#endif
        for(slice=0; slice<nslices; ++slice) {
            nlCRSMatrixMultSlice(
                M,x,y,M->sliceptr[slice],M->sliceptr[slice+1]
            );
        }
    }
    nlHostBlas()->flops += (NLulong)(2*nlCRSMatrixNNZ(M));
}

void nlCRSMatrixConstruct(
    NLCRSMatrix* M, NLuint m, NLuint n, NLuint nnz, NLuint nslices
) {
    M->m = m;
    M->n = n;
    M->type = NL_MATRIX_CRS;
    M->destroy_func = (NLDestroyMatrixFunc)nlCRSMatrixDestroy;
    if(NLMultMatrixVector_MKL != NULL) {
        M->mult_func = (NLMultMatrixVectorFunc)NLMultMatrixVector_MKL;
    } else {
        M->mult_func = (NLMultMatrixVectorFunc)nlCRSMatrixMult;
    }
    M->nslices = nslices;
    M->val = NL_NEW_ARRAY(double, nnz);
    M->rowptr = NL_NEW_ARRAY(NLuint, m+1);
    M->colind = NL_NEW_ARRAY(NLuint, nnz);
    M->sliceptr = NL_NEW_ARRAY(NLuint, nslices+1);
    M->symmetric_storage = NL_FALSE;
}

void nlCRSMatrixConstructSymmetric(
    NLCRSMatrix* M, NLuint n, NLuint nnz
) {
    M->m = n;
    M->n = n;
    M->type = NL_MATRIX_CRS;
    M->destroy_func = (NLDestroyMatrixFunc)nlCRSMatrixDestroy;
    M->mult_func = (NLMultMatrixVectorFunc)nlCRSMatrixMult;
    M->nslices = 0;
    M->val = NL_NEW_ARRAY(double, nnz);
    M->rowptr = NL_NEW_ARRAY(NLuint, n+1);
    M->colind = NL_NEW_ARRAY(NLuint, nnz);
    M->sliceptr = NULL;
    M->symmetric_storage = NL_TRUE;
}

/* SparseMatrix data structure */

static void nlSparseMatrixDestroyRowColumns(NLSparseMatrix* M) {
    NLuint i;
    if(M->storage & NL_MATRIX_STORE_ROWS) {
        for(i=0; i<M->m; i++) {
            nlRowColumnDestroy(&(M->row[i]));
        }
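        /*
         * The individual row containers are destroyed by the loop above;
         * the array of rows itself is released next and the ROWS bit is
         * then cleared from the storage flags (columns are handled the
         * same way just below).
         */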
NL_DELETE_ARRAY(M->row); } M->storage = (NLenum)((int)(M->storage) & ~NL_MATRIX_STORE_ROWS); if(M->storage & NL_MATRIX_STORE_COLUMNS) { for(i=0; in; i++) { nlRowColumnDestroy(&(M->column[i])); } NL_DELETE_ARRAY(M->column); } M->storage = (NLenum)((int)(M->storage) & ~NL_MATRIX_STORE_COLUMNS); } void nlSparseMatrixDestroy(NLSparseMatrix* M) { nl_assert(M->type == NL_MATRIX_SPARSE_DYNAMIC); nlSparseMatrixDestroyRowColumns(M); NL_DELETE_ARRAY(M->diag); #ifdef NL_PARANOID NL_CLEAR(NLSparseMatrix,M); #endif } void nlSparseMatrixAdd(NLSparseMatrix* M, NLuint i, NLuint j, NLdouble value) { nl_parano_range_assert(i, 0, M->m - 1); nl_parano_range_assert(j, 0, M->n - 1); if((M->storage & NL_MATRIX_STORE_SYMMETRIC) && (j > i)) { return; } if(i == j) { M->diag[i] += value; } if(M->storage & NL_MATRIX_STORE_ROWS) { nlRowColumnAdd(&(M->row[i]), j, value); } if(M->storage & NL_MATRIX_STORE_COLUMNS) { nlRowColumnAdd(&(M->column[j]), i, value); } } static void nlSparseMatrixAddSparseMatrix( NLSparseMatrix* M, double mul, const NLSparseMatrix* N ) { NLuint i,j,ii,jj; nl_assert(M->m == N->m); nl_assert(M->n == N->n); if(N->storage & NL_MATRIX_STORE_SYMMETRIC) { nl_assert(M->storage & NL_MATRIX_STORE_SYMMETRIC); } if(N->storage & NL_MATRIX_STORE_ROWS) { for(i=0; im; ++i) { for(jj=0; jjrow[i].size; ++jj) { nlSparseMatrixAdd( M, i, N->row[i].coeff[jj].index, mul*N->row[i].coeff[jj].value ); } } } else { nl_assert(N->storage & NL_MATRIX_STORE_COLUMNS); for(j=0; jn; ++j) { for(ii=0; iicolumn[j].size; ++ii) { nlSparseMatrixAdd( M, N->column[j].coeff[ii].index, j, mul*N->column[j].coeff[ii].value ); } } } } static void nlSparseMatrixAddCRSMatrix( NLSparseMatrix* M, double mul, const NLCRSMatrix* N ) { NLuint i,jj; nl_assert(M->m == N->m); nl_assert(M->n == N->n); for(i=0; im; ++i) { for(jj=N->rowptr[i]; jjrowptr[i+1]; ++jj) { nlSparseMatrixAdd( M, i, N->colind[jj], mul*N->val[jj] ); } } } void nlSparseMatrixAddMatrix( NLSparseMatrix* M, double mul, const NLMatrix N ) { nl_assert(M->m == N->m); nl_assert(M->n == N->n); if(N->type == NL_MATRIX_SPARSE_DYNAMIC) { nlSparseMatrixAddSparseMatrix(M, mul, (const NLSparseMatrix*)N); } else if(N->type == NL_MATRIX_CRS) { nlSparseMatrixAddCRSMatrix(M, mul, (const NLCRSMatrix*)N); } else { nl_assert_not_reached; } } void nlSparseMatrixZero( NLSparseMatrix* M) { NLuint i; if(M->storage & NL_MATRIX_STORE_ROWS) { for(i=0; im; i++) { nlRowColumnZero(&(M->row[i])); } } if(M->storage & NL_MATRIX_STORE_COLUMNS) { for(i=0; in; i++) { nlRowColumnZero(&(M->column[i])); } } NL_CLEAR_ARRAY(NLdouble, M->diag, M->diag_size); } void nlSparseMatrixClear( NLSparseMatrix* M) { NLuint i; if(M->storage & NL_MATRIX_STORE_ROWS) { for(i=0; im; i++) { nlRowColumnClear(&(M->row[i])); } } if(M->storage & NL_MATRIX_STORE_COLUMNS) { for(i=0; in; i++) { nlRowColumnClear(&(M->column[i])); } } NL_CLEAR_ARRAY(NLdouble, M->diag, M->diag_size); } /* Returns the number of non-zero coefficients */ NLuint nlSparseMatrixNNZ( NLSparseMatrix* M) { NLuint nnz = 0; NLuint i; if(M->storage & NL_MATRIX_STORE_ROWS) { for(i = 0; im; i++) { nnz += M->row[i].size; } } else if (M->storage & NL_MATRIX_STORE_COLUMNS) { for(i = 0; in; i++) { nnz += M->column[i].size; } } else { nl_assert_not_reached; } return nnz; } void nlSparseMatrixSort( NLSparseMatrix* M) { NLuint i; if(M->storage & NL_MATRIX_STORE_ROWS) { for(i = 0; im; i++) { nlRowColumnSort(&(M->row[i])); } } if (M->storage & NL_MATRIX_STORE_COLUMNS) { for(i = 0; in; i++) { nlRowColumnSort(&(M->column[i])); } } } void nlSparseMatrixMAddRow( NLSparseMatrix* M, NLuint i1, 
double s, NLuint i2 ) { NLuint jj; NLRowColumn* Ri2 = &(M->row[i2]); NLCoeff* c = NULL; nl_debug_assert(i1 < M->m); nl_debug_assert(i2 < M->m); for(jj=0; jjsize; ++jj) { c = &(Ri2->coeff[jj]); nlSparseMatrixAdd(M, i1, c->index, s*c->value); } } void nlSparseMatrixScaleRow( NLSparseMatrix* M, NLuint i, double s ) { NLuint jj; NLRowColumn* Ri = &(M->row[i]); NLCoeff* c = NULL; nl_assert(M->storage & NL_MATRIX_STORE_ROWS); nl_assert(!(M->storage & NL_MATRIX_STORE_COLUMNS)); nl_debug_assert(i < M->m); for(jj=0; jjsize; ++jj) { c = &(Ri->coeff[jj]); c->value *= s; } if(i < M->diag_size) { M->diag[i] *= s; } } void nlSparseMatrixZeroRow( NLSparseMatrix* M, NLuint i ) { NLRowColumn* Ri = &(M->row[i]); nl_debug_assert(i < M->m); Ri->size = 0; if(i < M->diag_size) { M->diag[i] = 0.0; } } /* SparseMatrix x Vector routines, internal helper routines */ static void nlSparseMatrix_mult_rows_symmetric( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { NLuint m = A->m; NLuint i,ij; NLCoeff* c = NULL; for(i=0; irow[i]); y[i] = 0; for(ij=0; ijsize; ++ij) { c = &(Ri->coeff[ij]); y[i] += c->value * x[c->index]; if(i != c->index) { y[c->index] += c->value * x[i]; } } } } static void nlSparseMatrix_mult_rows( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { /* * Note: OpenMP does not like unsigned ints * (causes some floating point exceptions), * therefore I use here signed ints for all * indices. */ int m = (int)(A->m); int i,ij; NLCoeff* c = NULL; NLRowColumn* Ri = NULL; #if defined(_OPENMP) #pragma omp parallel for private(i,ij,c,Ri) #endif for(i=0; irow[i]); y[i] = 0; for(ij=0; ij<(int)(Ri->size); ij++) { c = &(Ri->coeff[ij]); y[i] += c->value * x[c->index]; } } } static void nlSparseMatrix_mult_cols_symmetric( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { NLuint n = A->n; NLuint j,ii; NLCoeff* c = NULL; for(j=0; jcolumn[j]); y[j] = 0; for(ii=0; iisize; ii++) { c = &(Cj->coeff[ii]); y[c->index] += c->value * x[j]; if(j != c->index) { y[j] += c->value * x[c->index]; } } } } static void nlSparseMatrix_mult_cols( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { NLuint n = A->n; NLuint j,ii; NLCoeff* c = NULL; NL_CLEAR_ARRAY(NLdouble, y, A->m); for(j=0; jcolumn[j]); for(ii=0; iisize; ii++) { c = &(Cj->coeff[ii]); y[c->index] += c->value * x[j]; } } } void nlSparseMatrixMult( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { nl_assert(A->type == NL_MATRIX_SPARSE_DYNAMIC); if(A->storage & NL_MATRIX_STORE_ROWS) { if(A->storage & NL_MATRIX_STORE_SYMMETRIC) { nlSparseMatrix_mult_rows_symmetric(A, x, y); } else { nlSparseMatrix_mult_rows(A, x, y); } } else { if(A->storage & NL_MATRIX_STORE_SYMMETRIC) { nlSparseMatrix_mult_cols_symmetric(A, x, y); } else { nlSparseMatrix_mult_cols(A, x, y); } } nlHostBlas()->flops += (NLulong)(2*nlSparseMatrixNNZ(A)); } NLMatrix nlSparseMatrixNew( NLuint m, NLuint n, NLenum storage ) { NLSparseMatrix* result = NL_NEW(NLSparseMatrix); nlSparseMatrixConstruct(result, m, n, storage); return (NLMatrix)result; } void nlSparseMatrixConstruct( NLSparseMatrix* M, NLuint m, NLuint n, NLenum storage ) { NLuint i; M->m = m; M->n = n; M->type = NL_MATRIX_SPARSE_DYNAMIC; M->destroy_func = (NLDestroyMatrixFunc)nlSparseMatrixDestroy; M->mult_func = (NLMultMatrixVectorFunc)nlSparseMatrixMult; M->storage = storage; if(storage & NL_MATRIX_STORE_ROWS) { M->row = NL_NEW_ARRAY(NLRowColumn, m); M->row_capacity = m; for(i=0; irow[i])); } } else { M->row = NULL; M->row_capacity = 0; } if(storage & NL_MATRIX_STORE_COLUMNS) { M->column = NL_NEW_ARRAY(NLRowColumn, n); 
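        /* column storage mirrors the row storage set up above:
           one dynamic NLRowColumn per column of the matrix */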
M->column_capacity = n; for(i=0; icolumn[i])); } } else { M->column = NULL; M->column_capacity = 0; } M->diag_size = MIN(m,n); M->diag_capacity = M->diag_size; M->diag = NL_NEW_ARRAY(NLdouble, M->diag_size); } static void adjust_diag(NLSparseMatrix* M) { NLuint new_diag_size = MIN(M->m, M->n); NLuint i; if(new_diag_size > M->diag_size) { if(new_diag_size > M->diag_capacity) { M->diag_capacity *= 2; if(M->diag_capacity == 0) { M->diag_capacity = 16; } M->diag = NL_RENEW_ARRAY(double, M->diag, M->diag_capacity); for(i=M->diag_size; idiag[i] = 0.0; } } M->diag_size= new_diag_size; } } void nlSparseMatrixAddRow( NLSparseMatrix* M) { ++M->m; if(M->storage & NL_MATRIX_STORE_ROWS) { if(M->m > M->row_capacity) { M->row_capacity *= 2; if(M->row_capacity == 0) { M->row_capacity = 16; } M->row = NL_RENEW_ARRAY( NLRowColumn, M->row, M->row_capacity ); } nlRowColumnConstruct(&(M->row[M->m-1])); } adjust_diag(M); } void nlSparseMatrixAddColumn( NLSparseMatrix* M) { ++M->n; if(M->storage & NL_MATRIX_STORE_COLUMNS) { if(M->n > M->column_capacity) { M->column_capacity *= 2; if(M->column_capacity == 0) { M->column_capacity = 16; } M->column = NL_RENEW_ARRAY( NLRowColumn, M->column, M->column_capacity ); } nlRowColumnConstruct(&(M->column[M->n-1])); } adjust_diag(M); } NLMatrix nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix* M) { NLuint nnz = nlSparseMatrixNNZ(M); NLuint nslices = 8; /* TODO: get number of cores */ NLuint slice, cur_bound, cur_NNZ, cur_row; NLuint i,ij,k; NLuint slice_size = nnz / nslices; NLCRSMatrix* CRS = NL_NEW(NLCRSMatrix); nl_assert(M->storage & NL_MATRIX_STORE_ROWS); if(M->storage & NL_MATRIX_STORE_SYMMETRIC) { nl_assert(M->m == M->n); nlCRSMatrixConstructSymmetric(CRS, M->n, nnz); } else { nlCRSMatrixConstruct(CRS, M->m, M->n, nnz, nslices); } nlSparseMatrixSort(M); /* Convert matrix to CRS format */ k=0; for(i=0; im; ++i) { NLRowColumn* Ri = &(M->row[i]); CRS->rowptr[i] = k; for(ij=0; ijsize; ij++) { NLCoeff* c = &(Ri->coeff[ij]); CRS->val[k] = c->value; CRS->colind[k] = c->index; ++k; } } CRS->rowptr[M->m] = k; /* Create "slices" to be used by parallel sparse matrix vector product */ if(CRS->sliceptr != NULL) { cur_bound = slice_size; cur_NNZ = 0; cur_row = 0; CRS->sliceptr[0]=0; for(slice=1; slicem) { ++cur_row; cur_NNZ += CRS->rowptr[cur_row+1] - CRS->rowptr[cur_row]; } CRS->sliceptr[slice] = cur_row; cur_bound += slice_size; } CRS->sliceptr[nslices]=M->m; } return (NLMatrix)CRS; } NLMatrix nlCRSMatrixNewFromSparseMatrixSymmetric(NLSparseMatrix* M) { NLuint nnz; NLuint i,j,jj,k; NLCRSMatrix* CRS = NL_NEW(NLCRSMatrix); nl_assert(M->storage & NL_MATRIX_STORE_ROWS); nl_assert(M->m == M->n); nlSparseMatrixSort(M); if(M->storage & NL_MATRIX_STORE_SYMMETRIC) { nnz = nlSparseMatrixNNZ(M); } else { nnz = 0; for(i=0; in; ++i) { NLRowColumn* Ri = &M->row[i]; for(jj=0; jjsize; ++jj) { j = Ri->coeff[jj].index; if(j <= i) { ++nnz; } } } } nlCRSMatrixConstructSymmetric(CRS, M->n, nnz); k=0; for(i=0; im; ++i) { NLRowColumn* Ri = &(M->row[i]); CRS->rowptr[i] = k; for(jj=0; jjsize; ++jj) { j = Ri->coeff[jj].index; if((M->storage & NL_MATRIX_STORE_SYMMETRIC)) { nl_debug_assert(j <= i); } if(j <= i) { CRS->val[k] = Ri->coeff[jj].value; CRS->colind[k] = j; ++k; } } } CRS->rowptr[M->m] = k; return (NLMatrix)CRS; } void nlMatrixCompress(NLMatrix* M) { NLMatrix CRS = NULL; if((*M)->type != NL_MATRIX_SPARSE_DYNAMIC) { return; } CRS = nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)*M); nlDeleteMatrix(*M); *M = CRS; } NLuint nlMatrixNNZ(NLMatrix M) { if(M->type == NL_MATRIX_SPARSE_DYNAMIC) { return 
nlSparseMatrixNNZ((NLSparseMatrix*)M); } else if(M->type == NL_MATRIX_CRS) { return nlCRSMatrixNNZ((NLCRSMatrix*)M); } return M->m * M->n; } NLMatrix nlMatrixFactorize(NLMatrix M, NLenum solver) { NLMatrix result = NULL; switch(solver) { case NL_SUPERLU_EXT: case NL_PERM_SUPERLU_EXT: case NL_SYMMETRIC_SUPERLU_EXT: result = nlMatrixFactorize_SUPERLU(M,solver); break; case NL_CHOLMOD_EXT: result = nlMatrixFactorize_CHOLMOD(M,solver); break; default: nlError("nlMatrixFactorize","unknown solver"); } return result; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLMatrixFunc matrix_func; } NLFunctionMatrix; static void nlFunctionMatrixDestroy(NLFunctionMatrix* M) { (void)M; /* to avoid 'unused parameter' warning */ /* * Nothing special to do, * there is no dynamic allocated mem. */ } static void nlFunctionMatrixMult( NLFunctionMatrix* M, const NLdouble* x, NLdouble* y ) { M->matrix_func(x,y); } NLMatrix nlMatrixNewFromFunction(NLuint m, NLuint n, NLMatrixFunc func) { NLFunctionMatrix* result = NL_NEW(NLFunctionMatrix); result->m = m; result->n = n; result->type = NL_MATRIX_FUNCTION; result->destroy_func = (NLDestroyMatrixFunc)nlFunctionMatrixDestroy; result->mult_func = (NLMultMatrixVectorFunc)nlFunctionMatrixMult; result->matrix_func = func; return (NLMatrix)result; } NLMatrixFunc nlMatrixGetFunction(NLMatrix M) { if(M == NULL) { return NULL; } if(M->type != NL_MATRIX_FUNCTION) { return NULL; } return ((NLFunctionMatrix*)M)->matrix_func; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLMatrixFunc matrix_func; NLMatrix M; NLboolean owns_M; NLMatrix N; NLboolean owns_N; NLdouble* work; } NLMatrixProduct; static void nlMatrixProductDestroy(NLMatrixProduct* P) { NL_DELETE_ARRAY(P->work); if(P->owns_M) { nlDeleteMatrix(P->M); P->M = NULL; } if(P->owns_N) { nlDeleteMatrix(P->N); P->N = NULL; } } static void nlMatrixProductMult( NLMatrixProduct* P, const NLdouble* x, NLdouble* y ) { nlMultMatrixVector(P->N, x, P->work); nlMultMatrixVector(P->M, P->work, y); } NLMatrix nlMatrixNewFromProduct( NLMatrix M, NLboolean owns_M, NLMatrix N, NLboolean owns_N ) { NLMatrixProduct* result = NL_NEW(NLMatrixProduct); nl_assert(M->n == N->m); result->m = M->m; result->n = N->n; result->type = NL_MATRIX_OTHER; result->work = NL_NEW_ARRAY(NLdouble,N->m); result->destroy_func = (NLDestroyMatrixFunc)nlMatrixProductDestroy; result->mult_func = (NLMultMatrixVectorFunc)nlMatrixProductMult; result->M = M; result->owns_M = owns_M; result->N = N; result->owns_N = owns_N; return (NLMatrix)result; } /******* extracted from nl_context.c *******/ NLContextStruct* nlCurrentContext = NULL; NLContext nlNewContext() { NLContextStruct* result = NL_NEW(NLContextStruct); result->state = NL_STATE_INITIAL; result->solver = NL_SOLVER_DEFAULT; result->max_iterations = 100; result->threshold = 1e-6; result->omega = 1.5; result->row_scaling = 1.0; result->inner_iterations = 5; result->solver_func = nlDefaultSolver; result->progress_func = NULL; result->verbose = NL_FALSE; result->nb_systems = 1; result->matrix_mode = NL_STIFFNESS_MATRIX; nlMakeCurrent(result); return result; } void nlDeleteContext(NLContext context_in) { NLContextStruct* context = (NLContextStruct*)(context_in); if(nlCurrentContext == context) { nlCurrentContext = NULL; } nlDeleteMatrix(context->M); context->M = NULL; nlDeleteMatrix(context->P); context->P = NULL; nlDeleteMatrix(context->B); context->B = NULL; 
nlRowColumnDestroy(&context->af); nlRowColumnDestroy(&context->al); NL_DELETE_ARRAY(context->variable_value); NL_DELETE_ARRAY(context->variable_buffer); NL_DELETE_ARRAY(context->variable_is_locked); NL_DELETE_ARRAY(context->variable_index); NL_DELETE_ARRAY(context->x); NL_DELETE_ARRAY(context->b); NL_DELETE_ARRAY(context->right_hand_side); NL_DELETE_ARRAY(context->eigen_value); #ifdef NL_PARANOID NL_CLEAR(NLContextStruct, context); #endif NL_DELETE(context); } void nlMakeCurrent(NLContext context) { nlCurrentContext = (NLContextStruct*)(context); } NLContext nlGetCurrent() { return nlCurrentContext; } /* Finite state automaton */ void nlCheckState(NLenum state) { nl_assert(nlCurrentContext->state == state); } void nlTransition(NLenum from_state, NLenum to_state) { nlCheckState(from_state); nlCurrentContext->state = to_state; } /* Preconditioner setup and default solver */ static void nlSetupPreconditioner() { /* Check compatibility between solver and preconditioner */ if( nlCurrentContext->solver == NL_BICGSTAB && nlCurrentContext->preconditioner == NL_PRECOND_SSOR ) { nlWarning( "nlSolve", "cannot use SSOR preconditioner with non-symmetric matrix, " "switching to Jacobi" ); nlCurrentContext->preconditioner = NL_PRECOND_JACOBI; } if( nlCurrentContext->solver == NL_GMRES && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning("nlSolve", "Preconditioner not implemented yet for GMRES"); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } if( nlCurrentContext->solver == NL_SUPERLU_EXT && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning("nlSolve", "Preconditioner not implemented yet for SUPERLU"); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } if( nlCurrentContext->solver == NL_CHOLMOD_EXT && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning("nlSolve", "Preconditioner not implemented yet for CHOLMOD"); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } if( nlCurrentContext->solver == NL_PERM_SUPERLU_EXT && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning( "nlSolve", "Preconditioner not implemented yet for PERMSUPERLU" ); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } if( nlCurrentContext->solver == NL_SYMMETRIC_SUPERLU_EXT && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning( "nlSolve", "Preconditioner not implemented yet for PERMSUPERLU" ); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } nlDeleteMatrix(nlCurrentContext->P); nlCurrentContext->P = NULL; switch(nlCurrentContext->preconditioner) { case NL_PRECOND_NONE: break; case NL_PRECOND_JACOBI: nlCurrentContext->P = nlNewJacobiPreconditioner(nlCurrentContext->M); break; case NL_PRECOND_SSOR: nlCurrentContext->P = nlNewSSORPreconditioner( nlCurrentContext->M,nlCurrentContext->omega ); break; case NL_PRECOND_USER: break; default: nl_assert_not_reached; } if(nlCurrentContext->preconditioner != NL_PRECOND_SSOR) { if(getenv("NL_LOW_MEM") == NULL) { nlMatrixCompress(&nlCurrentContext->M); } } } static NLboolean nlSolveDirect() { NLdouble* b = nlCurrentContext->b; NLdouble* x = nlCurrentContext->x; NLuint n = nlCurrentContext->n; NLuint k; NLMatrix F = nlMatrixFactorize( nlCurrentContext->M, nlCurrentContext->solver ); if(F == NULL) { return NL_FALSE; } for(k=0; knb_systems; ++k) { nlMultMatrixVector(F, b, x); b += n; x += n; } nlDeleteMatrix(F); return NL_TRUE; } static NLboolean nlSolveIterative() { NLboolean use_CUDA = NL_FALSE; NLdouble* b = nlCurrentContext->b; NLdouble* x = nlCurrentContext->x; NLuint n = nlCurrentContext->n; NLuint k; 
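    /*
     * Solves the nb_systems right-hand sides stored contiguously in b
     * into the matching slices of x, using the iterative solver and
     * preconditioner selected in the current context. The host BLAS,
     * matrix and preconditioner chosen below may be replaced by their
     * CUDA counterparts when the CUDA extension is initialized.
     */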
NLBlas_t blas = nlHostBlas(); NLMatrix M = nlCurrentContext->M; NLMatrix P = nlCurrentContext->P; /* * For CUDA: it is implemented for * all iterative solvers except GMRES * Jacobi preconditioner */ if(nlExtensionIsInitialized_CUDA() && (nlCurrentContext->solver != NL_GMRES) && (nlCurrentContext->preconditioner == NL_PRECOND_NONE || nlCurrentContext->preconditioner == NL_PRECOND_JACOBI) ) { if(nlCurrentContext->verbose) { nl_printf("Using CUDA\n"); } use_CUDA = NL_TRUE; blas = nlCUDABlas(); if(nlCurrentContext->preconditioner == NL_PRECOND_JACOBI) { P = nlCUDAJacobiPreconditionerNewFromCRSMatrix(M); } M = nlCUDAMatrixNewFromCRSMatrix(M); } /* * We do not count CUDA transfers and CUDA matrix construction * when estimating GFlops */ nlCurrentContext->start_time = nlCurrentTime(); nlBlasResetStats(blas); for(k=0; knb_systems; ++k) { nlSolveSystemIterative( blas, M, P, b, x, nlCurrentContext->solver, nlCurrentContext->threshold, nlCurrentContext->max_iterations, nlCurrentContext->inner_iterations ); b += n; x += n; } nlCurrentContext->flops += blas->flops; if(use_CUDA) { nlDeleteMatrix(M); nlDeleteMatrix(P); } return NL_TRUE; } NLboolean nlDefaultSolver() { NLboolean result = NL_TRUE; nlSetupPreconditioner(); switch(nlCurrentContext->solver) { case NL_CG: case NL_BICGSTAB: case NL_GMRES: { result = nlSolveIterative(); } break; case NL_SUPERLU_EXT: case NL_PERM_SUPERLU_EXT: case NL_SYMMETRIC_SUPERLU_EXT: case NL_CHOLMOD_EXT: { result = nlSolveDirect(); } break; default: nl_assert_not_reached; } return result; } /******* extracted from nl_blas.c *******/ /* Many warnings about const double* converted to double* when calling BLAS functions that do not have the const qualifier in their prototypes. */ #ifdef __clang__ #pragma GCC diagnostic ignored "-Wcast-qual" #endif #ifndef NL_FORTRAN_WRAP #define NL_FORTRAN_WRAP(x) x##_ #endif #ifdef NL_USE_ATLAS int NL_FORTRAN_WRAP(xerbla)(char *srname, int *info) { nl_printf(stderr, "** On entry to %6s, parameter number %2d had an illegal value\n", srname, *info ); return 0; } #ifndef NL_USE_BLAS #define NL_USE_BLAS #endif #endif #ifdef NL_USE_SUPERLU #ifndef NL_USE_BLAS #define NL_USE_BLAS /* * The BLAS included in SuperLU does not have DTPSV, * we use the DTPSV embedded in OpenNL. */ #define NEEDS_DTPSV #endif #endif #ifndef NL_USE_BLAS #define NEEDS_DTPSV #endif /* BLAS routines */ /* copy-pasted from CBLAS (i.e. generated from f2c) */ /* * lsame * xerbla * daxpy * ddot * dscal * dnrm2 * dcopy * dgemv * dtpsv */ typedef NLint integer ; typedef NLdouble doublereal ; typedef NLboolean logical ; typedef NLint ftnlen ; #ifndef max #define max(x,y) ((x) > (y) ? (x) : (y)) #endif #ifndef NL_USE_BLAS static int NL_FORTRAN_WRAP(lsame)(const char *ca, const char *cb) { /* -- LAPACK auxiliary routine (version 2.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University September 30, 1994 Purpose ======= LSAME returns .TRUE. if CA is the same letter as CB regardless of case. Arguments ========= CA (input) CHARACTER*1 CB (input) CHARACTER*1 CA and CB specify the single characters to be compared. ===================================================================== */ /* System generated locals */ int ret_val; /* Local variables */ int inta, intb, zcode; ret_val = *(unsigned char *)ca == *(unsigned char *)cb; if (ret_val) { return ret_val; } /* Now test for equivalence if both characters are alphabetic. 
*/ zcode = 'Z'; /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime machines, on which ICHAR returns a value with bit 8 set. ICHAR('A') on Prime machines returns 193 which is the same as ICHAR('A') on an EBCDIC machine. */ inta = *(unsigned char *)ca; intb = *(unsigned char *)cb; if (zcode == 90 || zcode == 122) { /* ASCII is assumed - ZCODE is the ASCII code of either lower or upper case 'Z'. */ if (inta >= 97 && inta <= 122) inta += -32; if (intb >= 97 && intb <= 122) intb += -32; } else if (zcode == 233 || zcode == 169) { /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or upper case 'Z'. */ if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169) ) inta += 64; if ( (intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169) ) intb += 64; } else if (zcode == 218 || zcode == 250) { /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code plus 128 of either lower or upper case 'Z'. */ if (inta >= 225 && inta <= 250) inta += -32; if (intb >= 225 && intb <= 250) intb += -32; } ret_val = inta == intb; return ret_val; } /* lsame_ */ /* Subroutine */ static int NL_FORTRAN_WRAP(xerbla)(const char *srname, int *info) { /* -- LAPACK auxiliary routine (version 2.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University September 30, 1994 Purpose ======= XERBLA is an error handler for the LAPACK routines. It is called by an LAPACK routine if an input parameter has an invalid value. A message is printed and execution stops. Installers may consider modifying the STOP statement in order to call system-specific exception-handling facilities. Arguments ========= SRNAME (input) CHARACTER*6 The name of the routine which called XERBLA. INFO (input) INT The position of the invalid parameter in the parameter list of the calling routine. ===================================================================== */ nl_fprintf(stderr, "** On entry to %6s, parameter number %2d had an illegal value\n", srname, *info); /* End of XERBLA */ return 0; } /* xerbla_ */ /* Subroutine */ static int NL_FORTRAN_WRAP(daxpy)(integer *n, doublereal *da, doublereal *dx, integer *incx, doublereal *dy, integer *incy) { /* System generated locals */ integer i__1; /* Local variables */ static integer i, m, ix, iy, mp1; /* constant times a vector plus a vector. uses unrolled loops for increments equal to one. jack dongarra, linpack, 3/11/78. modified 12/3/93, array(1) declarations changed to array(*) Parameter adjustments Function Body */ #define DY(I) dy[(I)-1] #define DX(I) dx[(I)-1] if (*n <= 0) { return 0; } if (*da == 0.) 
{ return 0; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i = 1; i <= *n; ++i) { DY(iy) += *da * DX(ix); ix += *incx; iy += *incy; /* L10: */ } return 0; /* code for both increments equal to 1 clean-up loop */ L20: m = *n % 4; if (m == 0) { goto L40; } i__1 = m; for (i = 1; i <= m; ++i) { DY(i) += *da * DX(i); /* L30: */ } if (*n < 4) { return 0; } L40: mp1 = m + 1; i__1 = *n; for (i = mp1; i <= *n; i += 4) { DY(i) += *da * DX(i); DY(i + 1) += *da * DX(i + 1); DY(i + 2) += *da * DX(i + 2); DY(i + 3) += *da * DX(i + 3); /* L50: */ } nl_arg_used(i__1); return 0; } /* daxpy_ */ #undef DY #undef DX static doublereal NL_FORTRAN_WRAP(ddot)(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy) { /* System generated locals */ integer i__1; doublereal ret_val; /* Local variables */ static integer i, m; static doublereal dtemp; static integer ix, iy, mp1; /* forms the dot product of two vectors. uses unrolled loops for increments equal to one. jack dongarra, linpack, 3/11/78. modified 12/3/93, array(1) declarations changed to array(*) Parameter adjustments Function Body */ #define DY(I) dy[(I)-1] #define DX(I) dx[(I)-1] ret_val = 0.; dtemp = 0.; if (*n <= 0) { return ret_val; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i = 1; i <= *n; ++i) { dtemp += DX(ix) * DY(iy); ix += *incx; iy += *incy; /* L10: */ } ret_val = dtemp; return ret_val; /* code for both increments equal to 1 clean-up loop */ L20: m = *n % 5; if (m == 0) { goto L40; } i__1 = m; for (i = 1; i <= m; ++i) { dtemp += DX(i) * DY(i); /* L30: */ } if (*n < 5) { goto L60; } L40: mp1 = m + 1; i__1 = *n; for (i = mp1; i <= *n; i += 5) { dtemp = dtemp + DX(i) * DY(i) + DX(i + 1) * DY(i + 1) + DX(i + 2) * DY(i + 2) + DX(i + 3) * DY(i + 3) + DX(i + 4) * DY(i + 4); /* L50: */ } L60: ret_val = dtemp; nl_arg_used(i__1); return ret_val; } /* ddot_ */ #undef DY #undef DX /* Subroutine */ static int NL_FORTRAN_WRAP(dscal)(integer *n, doublereal *da, doublereal *dx, integer *incx) { /* System generated locals */ integer i__1, i__2; /* Local variables */ static integer i, m, nincx, mp1; /* scales a vector by a constant. uses unrolled loops for increment equal to one. jack dongarra, linpack, 3/11/78. modified 3/93 to return if incx .le. 0. modified 12/3/93, array(1) declarations changed to array(*) Parameter adjustments Function Body */ #ifdef DX #undef DX #endif #define DX(I) dx[(I)-1] if (*n <= 0 || *incx <= 0) { return 0; } if (*incx == 1) { goto L20; } /* code for increment not equal to 1 */ nincx = *n * *incx; i__1 = nincx; i__2 = *incx; for (i = 1; *incx < 0 ? 
i >= nincx : i <= nincx; i += *incx) { DX(i) = *da * DX(i); /* L10: */ } return 0; /* code for increment equal to 1 clean-up loop */ L20: m = *n % 5; if (m == 0) { goto L40; } i__2 = m; for (i = 1; i <= m; ++i) { DX(i) = *da * DX(i); /* L30: */ } if (*n < 5) { return 0; } L40: mp1 = m + 1; i__2 = *n; for (i = mp1; i <= *n; i += 5) { DX(i) = *da * DX(i); DX(i + 1) = *da * DX(i + 1); DX(i + 2) = *da * DX(i + 2); DX(i + 3) = *da * DX(i + 3); DX(i + 4) = *da * DX(i + 4); /* L50: */ } nl_arg_used(i__1); nl_arg_used(i__2); return 0; } /* dscal_ */ #undef DX static doublereal NL_FORTRAN_WRAP(dnrm2)(integer *n, doublereal *x, integer *incx) { /* System generated locals */ integer i__1, i__2; doublereal ret_val, d__1; /* Builtin functions */ /* BL: already declared in the included , we do not need it here. */ /*double sqrt(doublereal); */ /* Local variables */ static doublereal norm, scale, absxi; static integer ix; static doublereal ssq; /* DNRM2 returns the euclidean norm of a vector via the function name, so that DNRM2 := sqrt( x'*x ) -- This version written on 25-October-1982. Modified on 14-October-1993 to inline the call to DLASSQ. Sven Hammarling, Nag Ltd. Parameter adjustments Function Body */ #ifdef X #undef X #endif #define X(I) x[(I)-1] if (*n < 1 || *incx < 1) { norm = 0.; } else if (*n == 1) { norm = fabs(X(1)); } else { scale = 0.; ssq = 1.; /* The following loop is equivalent to this call to the LAPACK auxiliary routine: CALL DLASSQ( N, X, INCX, SCALE, SSQ ) */ i__1 = (*n - 1) * *incx + 1; i__2 = *incx; for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) { if (X(ix) != 0.) { absxi = (d__1 = X(ix), fabs(d__1)); if (scale < absxi) { /* Computing 2nd power */ d__1 = scale / absxi; ssq = ssq * (d__1 * d__1) + 1.; scale = absxi; } else { /* Computing 2nd power */ d__1 = absxi / scale; ssq += d__1 * d__1; } } /* L10: */ } norm = scale * sqrt(ssq); } ret_val = norm; nl_arg_used(i__1); nl_arg_used(i__2); return ret_val; /* End of DNRM2. */ } /* dnrm2_ */ #undef X /* Subroutine */ static int NL_FORTRAN_WRAP(dcopy)(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy) { /* System generated locals */ integer i__1; /* Local variables */ static integer i, m, ix, iy, mp1; /* copies a vector, x, to a vector, y. uses unrolled loops for increments equal to one. jack dongarra, linpack, 3/11/78. 
modified 12/3/93, array(1) declarations changed to array(*) Parameter adjustments Function Body */ #define DY(I) dy[(I)-1] #define DX(I) dx[(I)-1] if (*n <= 0) { return 0; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i = 1; i <= *n; ++i) { DY(iy) = DX(ix); ix += *incx; iy += *incy; /* L10: */ } return 0; /* code for both increments equal to 1 clean-up loop */ L20: m = *n % 7; if (m == 0) { goto L40; } i__1 = m; for (i = 1; i <= m; ++i) { DY(i) = DX(i); /* L30: */ } if (*n < 7) { return 0; } L40: mp1 = m + 1; i__1 = *n; for (i = mp1; i <= *n; i += 7) { DY(i) = DX(i); DY(i + 1) = DX(i + 1); DY(i + 2) = DX(i + 2); DY(i + 3) = DX(i + 3); DY(i + 4) = DX(i + 4); DY(i + 5) = DX(i + 5); DY(i + 6) = DX(i + 6); /* L50: */ } nl_arg_used(i__1); return 0; } /* dcopy_ */ #undef DX #undef DY /* Subroutine */ static int NL_FORTRAN_WRAP(dgemv)(const char *trans, integer *m, integer *n, doublereal * alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) { /* System generated locals */ /* integer a_dim1, a_offset ; */ integer i__1, i__2; /* Local variables */ static integer info; static doublereal temp; static integer lenx, leny, i, j; /* extern logical lsame_(char *, char *); */ static integer ix, iy, jx, jy, kx, ky; /* extern int xerbla_(char *, integer *); */ /* Purpose ======= DGEMV performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, where alpha and beta are scalars, x and y are vectors and A is an m by n matrix. Parameters ========== TRANS - CHARACTER*1. On entry, TRANS specifies the operation to be performed as follows: TRANS = 'N' or 'n' y := alpha*A*x + beta*y. TRANS = 'T' or 't' y := alpha*A'*x + beta*y. TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. Unchanged on exit. M - INTEGER. On entry, M specifies the number of rows of the matrix A. M must be at least zero. Unchanged on exit. N - INTEGER. On entry, N specifies the number of columns of the matrix A. N must be at least zero. Unchanged on exit. ALPHA - DOUBLE PRECISION. On entry, ALPHA specifies the scalar alpha. Unchanged on exit. A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). Before entry, the leading m by n part of the array A must contain the matrix of coefficients. Unchanged on exit. LDA - INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. LDA must be at least max( 1, m ). Unchanged on exit. X - DOUBLE PRECISION array of DIMENSION at least ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. Before entry, the incremented array X must contain the vector x. Unchanged on exit. INCX - INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. Unchanged on exit. BETA - DOUBLE PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. Unchanged on exit. Y - DOUBLE PRECISION array of DIMENSION at least ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. Before entry with BETA non-zero, the incremented array Y must contain the vector y. On exit, Y is overwritten by the updated vector y. INCY - INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. 
Unchanged on exit. Level 2 Blas routine. -- Written on 22-October-1986. Jack Dongarra, Argonne National Lab. Jeremy Du Croz, Nag Central Office. Sven Hammarling, Nag Central Office. Richard Hanson, Sandia National Labs. Test the input parameters. Parameter adjustments Function Body */ #define X(I) x[(I)-1] #define Y(I) y[(I)-1] #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] info = 0; if (! NL_FORTRAN_WRAP(lsame)(trans, "N") && ! NL_FORTRAN_WRAP(lsame)(trans, "T") && ! NL_FORTRAN_WRAP(lsame)(trans, "C")) { info = 1; } else if (*m < 0) { info = 2; } else if (*n < 0) { info = 3; } else if (*lda < max(1,*m)) { info = 6; } else if (*incx == 0) { info = 8; } else if (*incy == 0) { info = 11; } if (info != 0) { NL_FORTRAN_WRAP(xerbla)("DGEMV ", &info); return 0; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || (*alpha == 0. && *beta == 1.)) { return 0; } /* Set LENX and LENY, the lengths of the vectors x and y, and set up the start points in X and Y. */ if (NL_FORTRAN_WRAP(lsame)(trans, "N")) { lenx = *n; leny = *m; } else { lenx = *m; leny = *n; } if (*incx > 0) { kx = 1; } else { kx = 1 - (lenx - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (leny - 1) * *incy; } /* Start the operations. In this version the elements of A are accessed sequentially with one pass through A. First form y := beta*y. */ if (*beta != 1.) { if (*incy == 1) { if (*beta == 0.) { i__1 = leny; for (i = 1; i <= leny; ++i) { Y(i) = 0.; /* L10: */ } } else { i__1 = leny; for (i = 1; i <= leny; ++i) { Y(i) = *beta * Y(i); /* L20: */ } } } else { iy = ky; if (*beta == 0.) { i__1 = leny; for (i = 1; i <= leny; ++i) { Y(iy) = 0.; iy += *incy; /* L30: */ } } else { i__1 = leny; for (i = 1; i <= leny; ++i) { Y(iy) = *beta * Y(iy); iy += *incy; /* L40: */ } } } } if (*alpha == 0.) { return 0; } if (NL_FORTRAN_WRAP(lsame)(trans, "N")) { /* Form y := alpha*A*x + y. */ jx = kx; if (*incy == 1) { i__1 = *n; for (j = 1; j <= *n; ++j) { if (X(jx) != 0.) { temp = *alpha * X(jx); i__2 = *m; for (i = 1; i <= *m; ++i) { Y(i) += temp * A(i,j); /* L50: */ } } jx += *incx; /* L60: */ } } else { i__1 = *n; for (j = 1; j <= *n; ++j) { if (X(jx) != 0.) { temp = *alpha * X(jx); iy = ky; i__2 = *m; for (i = 1; i <= *m; ++i) { Y(iy) += temp * A(i,j); iy += *incy; /* L70: */ } } jx += *incx; /* L80: */ } } } else { /* Form y := alpha*A'*x + y. */ jy = ky; if (*incx == 1) { i__1 = *n; for (j = 1; j <= *n; ++j) { temp = 0.; i__2 = *m; for (i = 1; i <= *m; ++i) { temp += A(i,j) * X(i); /* L90: */ } Y(jy) += *alpha * temp; jy += *incy; /* L100: */ } } else { i__1 = *n; for (j = 1; j <= *n; ++j) { temp = 0.; ix = kx; i__2 = *m; for (i = 1; i <= *m; ++i) { temp += A(i,j) * X(ix); ix += *incx; /* L110: */ } Y(jy) += *alpha * temp; jy += *incy; /* L120: */ } } } nl_arg_used(i__1); nl_arg_used(i__2); return 0; /* End of DGEMV . 
*/ } /* dgemv_ */ #undef X #undef Y #undef A #else extern void NL_FORTRAN_WRAP(daxpy)( int *n, double *alpha, double *x, int *incx, double *y, int *incy ) ; extern double NL_FORTRAN_WRAP(dnrm2)( int *n, double *x, int *incx ) ; extern int NL_FORTRAN_WRAP(dcopy)(int* n, double* dx, int* incx, double* dy, int* incy) ; extern void NL_FORTRAN_WRAP(dscal)(int* n, double* alpha, double *x, int* incx) ; #ifndef NEEDS_DTPSV extern void NL_FORTRAN_WRAP(dtpsv)( char *uplo, char *trans, char *diag, int *n, double *AP, double *x, int *incx ) ; #endif extern void NL_FORTRAN_WRAP(dgemv)( char *trans, int *m, int *n, double *alpha, double *A, int *ldA, double *x, int *incx, double *beta, double *y, int *incy ) ; #endif #ifdef NEEDS_DTPSV /* DECK DTPSV */ /* Subroutine */ static int NL_FORTRAN_WRAP(dtpsv)( const char* uplo, const char* trans, const char* diag, integer* n, doublereal* ap, doublereal* x, integer* incx ) { /* System generated locals */ integer i__1, i__2; /* Local variables */ static integer info; static doublereal temp; static integer i__, j, k; /* extern logical lsame_(); */ static integer kk, ix, jx, kx; /* extern int xerbla_(); */ static logical nounit; /* ***BEGIN PROLOGUE DTPSV */ /* ***PURPOSE Solve one of the systems of equations. */ /* ***LIBRARY SLATEC (BLAS) */ /* ***CATEGORY D1B4 */ /* ***TYPE DOUBLE PRECISION (STPSV-S, DTPSV-D, CTPSV-C) */ /* ***KEYWORDS LEVEL 2 BLAS, LINEAR ALGEBRA */ /* ***AUTHOR Dongarra, J. J., (ANL) */ /* Du Croz, J., (NAG) */ /* Hammarling, S., (NAG) */ /* Hanson, R. J., (SNLA) */ /* ***DESCRIPTION */ /* DTPSV solves one of the systems of equations */ /* A*x = b, or A'*x = b, */ /* where b and x are n element vectors and A is an n by n unit, or */ /* non-unit, upper or lower triangular matrix, supplied in packed form. */ /* No test for singularity or near-singularity is included in this */ /* routine. Such tests must be performed before calling this routine. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the equations to be solved as */ /* follows: */ /* TRANS = 'N' or 'n' A*x = b. */ /* TRANS = 'T' or 't' A'*x = b. */ /* TRANS = 'C' or 'c' A'*x = b. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - DOUBLE PRECISION array of DIMENSION at least */ /* ( ( n*( n + 1))/2). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. 
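   (For illustration, not part of the original SLATEC prologue: with N = 3 and
   UPLO = 'L' the packed order is AP = { a(1,1), a(2,1), a(3,1), a(2,2),
   a(3,2), a(3,3) }, while with UPLO = 'U' it is AP = { a(1,1), a(1,2),
   a(2,2), a(1,3), a(2,3), a(3,3) }.)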
*/ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element right-hand side vector b. On exit, X is overwritten */ /* with the solution vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* ***REFERENCES Dongarra, J. J., Du Croz, J., Hammarling, S., and */ /* Hanson, R. J. An extended set of Fortran basic linear */ /* algebra subprograms. ACM TOMS, Vol. 14, No. 1, */ /* pp. 1-17, March 1988. */ /* ***ROUTINES CALLED LSAME, XERBLA */ /* ***REVISION HISTORY (YYMMDD) */ /* 861022 DATE WRITTEN */ /* 910605 Modified to meet SLATEC prologue standards. Only comment */ /* lines were modified. (BKS) */ /* ***END PROLOGUE DTPSV */ /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* ***FIRST EXECUTABLE STATEMENT DTPSV */ /* Test the input parameters. */ /* Parameter adjustments */ --x; --ap; /* Function Body */ info = 0; if (!NL_FORTRAN_WRAP(lsame)(uplo, "U") && !NL_FORTRAN_WRAP(lsame)(uplo, "L") ) { info = 1; } else if ( !NL_FORTRAN_WRAP(lsame)(trans, "N") && !NL_FORTRAN_WRAP(lsame)(trans, "T") && !NL_FORTRAN_WRAP(lsame)(trans, "C") ) { info = 2; } else if ( !NL_FORTRAN_WRAP(lsame)(diag, "U") && !NL_FORTRAN_WRAP(lsame)(diag, "N") ) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { NL_FORTRAN_WRAP(xerbla)("DTPSV ", &info); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } nounit = (logical)(NL_FORTRAN_WRAP(lsame)(diag, "N")); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (NL_FORTRAN_WRAP(lsame)(trans, "N")) { /* Form x := inv( A )*x. */ if (NL_FORTRAN_WRAP(lsame)(uplo, "U")) { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { if (x[j] != 0.) { if (nounit) { x[j] /= ap[kk]; } temp = x[j]; k = kk - 1; for (i__ = j - 1; i__ >= 1; --i__) { x[i__] -= temp * ap[k]; --k; /* L10: */ } } kk -= j; /* L20: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { if (x[jx] != 0.) { if (nounit) { x[jx] /= ap[kk]; } temp = x[jx]; ix = jx; i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; x[ix] -= temp * ap[k]; /* L30: */ } } jx -= *incx; kk -= j; /* L40: */ } } } else { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.) { if (nounit) { x[j] /= ap[kk]; } temp = x[j]; k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { x[i__] -= temp * ap[k]; ++k; /* L50: */ } } kk += *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.) { if (nounit) { x[jx] /= ap[kk]; } temp = x[jx]; ix = jx; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; x[ix] -= temp * ap[k]; /* L70: */ } } jx += *incx; kk += *n - j + 1; /* L80: */ } } } } else { /* Form x := inv( A' )*x. 
*/ if (NL_FORTRAN_WRAP(lsame)(uplo, "U")) { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[j]; k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { temp -= ap[k] * x[i__]; ++k; /* L90: */ } if (nounit) { temp /= ap[kk + j - 1]; } x[j] = temp; kk += j; /* L100: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[jx]; ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { temp -= ap[k] * x[ix]; ix += *incx; /* L110: */ } if (nounit) { temp /= ap[kk + j - 1]; } x[jx] = temp; jx += *incx; kk += j; /* L120: */ } } } else { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; k = kk; i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { temp -= ap[k] * x[i__]; --k; /* L130: */ } if (nounit) { temp /= ap[kk - *n + j]; } x[j] = temp; kk -= *n - j + 1; /* L140: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { temp = x[jx]; ix = kx; i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { temp -= ap[k] * x[ix]; ix -= *incx; /* L150: */ } if (nounit) { temp /= ap[kk - *n + j]; } x[jx] = temp; jx -= *incx; kk -= *n - j + 1; /* L160: */ } } } } return 0; /* End of DTPSV . */ } /* dtpsv_ */ #endif /* End of BLAS routines */ /* Abstract BLAS interface */ void nlBlasResetStats(NLBlas_t blas) { blas->start_time = nlCurrentTime(); blas->flops = 0; blas->used_ram[0] = 0; blas->used_ram[1] = 0; blas->max_used_ram[0] = 0; blas->max_used_ram[1] = 0; blas->sq_rnorm = 0.0; blas->sq_bnorm = 0.0; } double nlBlasGFlops(NLBlas_t blas) { double now = nlCurrentTime(); double elapsed_time = now - blas->start_time; return (NLdouble)(blas->flops) / (elapsed_time * 1e9); } NLulong nlBlasUsedRam(NLBlas_t blas, NLmemoryType type) { return blas->used_ram[type]; } NLulong nlBlasMaxUsedRam(NLBlas_t blas, NLmemoryType type) { return blas->max_used_ram[type]; } NLboolean nlBlasHasUnifiedMemory(NLBlas_t blas) { return blas->has_unified_memory; } static void* host_blas_malloc( NLBlas_t blas, NLmemoryType type, size_t size ) { nl_arg_used(type); blas->used_ram[type] += (NLulong)size; blas->max_used_ram[type] = MAX( blas->max_used_ram[type],blas->used_ram[type] ); return malloc(size); } static void host_blas_free( NLBlas_t blas, NLmemoryType type, size_t size, void* ptr ) { nl_arg_used(type); blas->used_ram[type] -= (NLulong)size; free(ptr); } static void host_blas_memcpy( NLBlas_t blas, void* to, NLmemoryType to_type, void* from, NLmemoryType from_type, size_t size ) { nl_arg_used(blas); nl_arg_used(to_type); nl_arg_used(from_type); memcpy(to,from,size); } static void host_blas_dcopy( NLBlas_t blas, int n, const double *x, int incx, double *y, int incy ) { nl_arg_used(blas); NL_FORTRAN_WRAP(dcopy)(&n,(double*)x,&incx,y,&incy); } static double host_blas_ddot( NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy ) { blas->flops += (NLulong)(2*n); return NL_FORTRAN_WRAP(ddot)(&n,(double*)x,&incx,(double*)y,&incy); } static double host_blas_dnrm2( NLBlas_t blas, int n, const double *x, int incx ) { blas->flops += (NLulong)(2*n); return NL_FORTRAN_WRAP(dnrm2)(&n,(double*)x,&incx); } static void host_blas_daxpy( NLBlas_t blas, int n, double a, const double *x, int incx, double *y, int incy ) { blas->flops += (NLulong)(2*n); NL_FORTRAN_WRAP(daxpy)(&n,&a,(double*)x,&incx,y,&incy); } static void host_blas_dscal( NLBlas_t blas, int n, double a, double *x, int incx ) { blas->flops += (NLulong)n; NL_FORTRAN_WRAP(dscal)(&n,&a,x,&incx); } static void host_blas_dgemv( NLBlas_t blas, MatrixTranspose trans, int m, 
int n, double alpha, const double *A, int ldA, const double *x, int incx, double beta, double *y, int incy ) { static const char *T[3] = { "N", "T", 0 }; nl_arg_used(blas); NL_FORTRAN_WRAP(dgemv)( T[(int)trans],&m,&n,&alpha,(double*)A,&ldA, (double*)x,&incx,&beta,y,&incy ); /* TODO: update flops */ } static void host_blas_dtpsv( NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans, MatrixUnitTriangular diag, int n, const double *AP, double *x, int incx ) { static const char *UL[2] = { "U", "L" }; static const char *T[3] = { "N", "T", 0 }; static const char *D[2] = { "U", "N" }; nl_arg_used(blas); NL_FORTRAN_WRAP(dtpsv)( UL[(int)uplo],T[(int)trans],D[(int)diag],&n,(double*)AP,x,&incx ); /* TODO: update flops */ } NLBlas_t nlHostBlas() { static NLboolean initialized = NL_FALSE; static struct NLBlas blas; if(!initialized) { memset(&blas, 0, sizeof(blas)); blas.has_unified_memory = NL_TRUE; blas.Malloc = host_blas_malloc; blas.Free = host_blas_free; blas.Memcpy = host_blas_memcpy; blas.Dcopy = host_blas_dcopy; blas.Ddot = host_blas_ddot; blas.Dnrm2 = host_blas_dnrm2; blas.Daxpy = host_blas_daxpy; blas.Dscal = host_blas_dscal; blas.Dgemv = host_blas_dgemv; blas.Dtpsv = host_blas_dtpsv; nlBlasResetStats(&blas); initialized = NL_TRUE; } return &blas; } /******* extracted from nl_iterative_solvers.c *******/ /* Solvers */ /* * The implementation of the solvers is inspired by * the lsolver library, by Christian Badura, available from: * http://www.mathematik.uni-freiburg.de * /IAM/Research/projectskr/lin_solver/ * * About the Conjugate Gradient, details can be found in: * Ashby, Manteuffel, Saylor * A taxononmy for conjugate gradient methods * SIAM J Numer Anal 27, 1542-1568 (1990) * * This version is completely abstract, the same code can be used for * CPU/GPU, dense matrix / sparse matrix etc... * Abstraction is realized through: * - Abstract blas interface (NLBlas_t), that can implement BLAS * operations on the CPU or on the GPU. * - Abstract matrix interface (NLMatrix), that can implement different * versions of matrix x vector product (CPU/GPU, sparse/dense ...) 
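 *
 * Usage sketch (illustration, not from the original sources; it only uses
 * functions and constants declared elsewhere in this file, and assumes a
 * hypothetical caller that owns a square dynamic sparse matrix M, as
 * required by nlNewJacobiPreconditioner(), and host vectors b and x of
 * size M->n; eps and the iteration counts are arbitrary example values).
 * The preconditioner is optional: passing NULL as P selects the
 * unpreconditioned solver.
 *
 *   NLBlas_t blas  = nlHostBlas();
 *   NLMatrix P     = nlNewJacobiPreconditioner(M);
 *   NLuint   iters = nlSolveSystemIterative(
 *       blas, M, P, b, x, NL_CG, 1e-6, 1000, 0
 *   );
 *   nlDeleteMatrix(P);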
*/ static NLuint nlSolveSystem_CG( NLBlas_t blas, NLMatrix M, NLdouble* b, NLdouble* x, double eps, NLuint max_iter ) { NLint N = (NLint)M->m; NLdouble *g = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *p = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLuint its=0; NLdouble t, tau, sig, rho, gam; NLdouble b_square=blas->Ddot(blas,N,b,1,b,1); NLdouble err=eps*eps*b_square; NLdouble curr_err; nlMultMatrixVector(M,x,g); blas->Daxpy(blas,N,-1.,b,1,g,1); blas->Dscal(blas,N,-1.,g,1); blas->Dcopy(blas,N,g,1,r,1); curr_err = blas->Ddot(blas,N,g,1,g,1); while ( curr_err >err && its < max_iter) { if(nlCurrentContext != NULL) { if(nlCurrentContext->progress_func != NULL) { nlCurrentContext->progress_func(its, max_iter, curr_err, err); } if(nlCurrentContext->verbose && !(its % 100)) { nl_printf ( "%d : %.10e -- %.10e\n", its, curr_err, err ); } } nlMultMatrixVector(M,r,p); rho=blas->Ddot(blas,N,p,1,p,1); sig=blas->Ddot(blas,N,r,1,p,1); tau=blas->Ddot(blas,N,g,1,r,1); t=tau/sig; blas->Daxpy(blas,N,t,r,1,x,1); blas->Daxpy(blas,N,-t,p,1,g,1); gam=(t*t*rho-tau)/tau; blas->Dscal(blas,N,gam,r,1); blas->Daxpy(blas,N,1.,g,1,r,1); ++its; curr_err = blas->Ddot(blas,N,g,1,g,1); } NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, g); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, p); blas->sq_bnorm = b_square; blas->sq_rnorm = curr_err; return its; } static NLuint nlSolveSystem_PRE_CG( NLBlas_t blas, NLMatrix M, NLMatrix P, NLdouble* b, NLdouble* x, double eps, NLuint max_iter ) { NLint N = (NLint)M->n; NLdouble* r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble* d = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble* h = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *Ad = h; NLuint its=0; NLdouble rh, alpha, beta; NLdouble b_square = blas->Ddot(blas,N,b,1,b,1); NLdouble err=eps*eps*b_square; NLdouble curr_err; nlMultMatrixVector(M,x,r); blas->Daxpy(blas,N,-1.,b,1,r,1); nlMultMatrixVector(P,r,d); blas->Dcopy(blas,N,d,1,h,1); rh=blas->Ddot(blas,N,r,1,h,1); curr_err = blas->Ddot(blas,N,r,1,r,1); while ( curr_err >err && its < max_iter) { if(nlCurrentContext != NULL) { if(nlCurrentContext->progress_func != NULL) { nlCurrentContext->progress_func(its, max_iter, curr_err, err); } if( nlCurrentContext->verbose && !(its % 100)) { nl_printf ( "%d : %.10e -- %.10e\n", its, curr_err, err ); } } nlMultMatrixVector(M,d,Ad); alpha=rh/blas->Ddot(blas,N,d,1,Ad,1); blas->Daxpy(blas,N,-alpha,d,1,x,1); blas->Daxpy(blas,N,-alpha,Ad,1,r,1); nlMultMatrixVector(P,r,h); beta=1./rh; rh=blas->Ddot(blas,N,r,1,h,1); beta*=rh; blas->Dscal(blas,N,beta,d,1); blas->Daxpy(blas,N,1.,h,1,d,1); ++its; curr_err = blas->Ddot(blas,N,r,1,r,1); } NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, d); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, h); blas->sq_bnorm = b_square; blas->sq_rnorm = curr_err; return its; } static NLuint nlSolveSystem_BICGSTAB( NLBlas_t blas, NLMatrix M, NLdouble* b, NLdouble* x, double eps, NLuint max_iter ) { NLint N = (NLint)M->n; NLdouble *rT = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *d = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *h = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *u = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *Ad = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *t = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *s = h; NLdouble rTh, rTAd, rTr, alpha, beta, omega, st, tt; NLuint its=0; NLdouble b_square = 
blas->Ddot(blas,N,b,1,b,1); NLdouble err=eps*eps*b_square; NLdouble *r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); nlMultMatrixVector(M,x,r); blas->Daxpy(blas,N,-1.,b,1,r,1); blas->Dcopy(blas,N,r,1,d,1); blas->Dcopy(blas,N,d,1,h,1); blas->Dcopy(blas,N,h,1,rT,1); nl_assert( blas->Ddot(blas,N,rT,1,rT,1)>1e-40 ); rTh=blas->Ddot(blas,N,rT,1,h,1); rTr=blas->Ddot(blas,N,r,1,r,1); while ( rTr>err && its < max_iter) { if(nlCurrentContext != NULL) { if(nlCurrentContext->progress_func != NULL) { nlCurrentContext->progress_func(its, max_iter, rTr, err); } if( (nlCurrentContext->verbose) && !(its % 100)) { nl_printf ( "%d : %.10e -- %.10e\n", its, rTr, err ); } } nlMultMatrixVector(M,d,Ad); rTAd=blas->Ddot(blas,N,rT,1,Ad,1); nl_assert( fabs(rTAd)>1e-40 ); alpha=rTh/rTAd; blas->Daxpy(blas,N,-alpha,Ad,1,r,1); blas->Dcopy(blas,N,h,1,s,1); blas->Daxpy(blas,N,-alpha,Ad,1,s,1); nlMultMatrixVector(M,s,t); blas->Daxpy(blas,N,1.,t,1,u,1); blas->Dscal(blas,N,alpha,u,1); st=blas->Ddot(blas,N,s,1,t,1); tt=blas->Ddot(blas,N,t,1,t,1); if ( fabs(st)<1e-40 || fabs(tt)<1e-40 ) { omega = 0.; } else { omega = st/tt; } blas->Daxpy(blas,N,-omega,t,1,r,1); blas->Daxpy(blas,N,-alpha,d,1,x,1); blas->Daxpy(blas,N,-omega,s,1,x,1); blas->Dcopy(blas,N,s,1,h,1); blas->Daxpy(blas,N,-omega,t,1,h,1); beta=(alpha/omega)/rTh; rTh=blas->Ddot(blas,N,rT,1,h,1); beta*=rTh; blas->Dscal(blas,N,beta,d,1); blas->Daxpy(blas,N,1.,h,1,d,1); blas->Daxpy(blas,N,-beta*omega,Ad,1,d,1); rTr=blas->Ddot(blas,N,r,1,r,1); ++its; } NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, rT); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, d); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, h); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, u); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, Ad); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, t); blas->sq_bnorm = b_square; blas->sq_rnorm = rTr; return its; } static NLuint nlSolveSystem_PRE_BICGSTAB( NLBlas_t blas, NLMatrix M, NLMatrix P, NLdouble* b, NLdouble* x, double eps, NLuint max_iter ) { NLint N = (NLint)M->n; NLdouble *rT = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *d = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *h = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *u = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *Sd = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *t = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *aux = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *s = h; NLdouble rTh, rTSd, rTr, alpha, beta, omega, st, tt; NLuint its=0; NLdouble b_square = blas->Ddot(blas,N,b,1,b,1); NLdouble err = eps*eps*b_square; NLdouble *r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); nlMultMatrixVector(M,x,r); blas->Daxpy(blas,N,-1.,b,1,r,1); nlMultMatrixVector(P,r,d); blas->Dcopy(blas,N,d,1,h,1); blas->Dcopy(blas,N,h,1,rT,1); nl_assert( blas->Ddot(blas,N,rT,1,rT,1)>1e-40 ); rTh=blas->Ddot(blas,N,rT,1,h,1); rTr=blas->Ddot(blas,N,r,1,r,1); while ( rTr>err && its < max_iter) { if(nlCurrentContext != NULL) { if(nlCurrentContext->progress_func != NULL) { nlCurrentContext->progress_func(its, max_iter, rTr, err); } if( (nlCurrentContext->verbose) && !(its % 100)) { nl_printf ( "%d : %.10e -- %.10e\n", its, rTr, err ); } } nlMultMatrixVector(M,d,aux); nlMultMatrixVector(P,aux,Sd); rTSd=blas->Ddot(blas,N,rT,1,Sd,1); nl_assert( fabs(rTSd)>1e-40 ); alpha=rTh/rTSd; blas->Daxpy(blas,N,-alpha,aux,1,r,1); blas->Dcopy(blas,N,h,1,s,1); blas->Daxpy(blas,N,-alpha,Sd,1,s,1); nlMultMatrixVector(M,s,aux); nlMultMatrixVector(P,aux,t); blas->Daxpy(blas,N,1.,t,1,u,1); 
blas->Dscal(blas,N,alpha,u,1); st=blas->Ddot(blas,N,s,1,t,1); tt=blas->Ddot(blas,N,t,1,t,1); if ( fabs(st)<1e-40 || fabs(tt)<1e-40 ) { omega = 0.; } else { omega = st/tt; } blas->Daxpy(blas,N,-omega,aux,1,r,1); blas->Daxpy(blas,N,-alpha,d,1,x,1); blas->Daxpy(blas,N,-omega,s,1,x,1); blas->Dcopy(blas,N,s,1,h,1); blas->Daxpy(blas,N,-omega,t,1,h,1); beta=(alpha/omega)/rTh; rTh=blas->Ddot(blas,N,rT,1,h,1); beta*=rTh; blas->Dscal(blas,N,beta,d,1); blas->Daxpy(blas,N,1.,h,1,d,1); blas->Daxpy(blas,N,-beta*omega,Sd,1,d,1); rTr=blas->Ddot(blas,N,r,1,r,1); ++its; } NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, rT); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, d); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, h); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, u); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, Sd); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, t); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, aux); blas->sq_bnorm = b_square; blas->sq_rnorm = rTr; return its; } /* * Note: this one cannot be executed on device (GPU) * because it directly manipulates the vectors. */ static NLuint nlSolveSystem_GMRES( NLBlas_t blas, NLMatrix M, NLdouble* b, NLdouble* x, double eps, NLuint max_iter, NLuint inner_iter ) { NLint n = (NLint)M->n; NLint m = (NLint)inner_iter; typedef NLdouble *NLdoubleP; NLdouble *V = NL_NEW_ARRAY(NLdouble, n*(m+1) ); NLdouble *U = NL_NEW_ARRAY(NLdouble, m*(m+1)/2 ); NLdouble *r = NL_NEW_ARRAY(NLdouble, n ); NLdouble *y = NL_NEW_ARRAY(NLdouble, m+1 ); NLdouble *c = NL_NEW_ARRAY(NLdouble, m ); NLdouble *s = NL_NEW_ARRAY(NLdouble, m ); NLdouble **v = NL_NEW_ARRAY(NLdoubleP, m+1 ); NLint i, j, io, uij, u0j; NLint its = -1; NLdouble beta, h, rd, dd, nrm2b; /* * The way it is written, this routine will not * work on the GPU since it directly modifies the * vectors. 
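 * It implements restarted GMRES(m): the inner loop builds an Arnoldi basis
 * of at most m = inner_iter vectors (the columns of V), maintains the
 * packed upper-triangular factor U together with the Givens rotation
 * coefficients c and s, and the outer loop restarts from the current
 * iterate until the residual estimate fabs(y[j]) falls below eps * ||b||.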
*/ nl_assert(nlBlasHasUnifiedMemory(blas)); for ( i=0; i<=m; ++i ){ v[i]=V+i*n; } nrm2b=blas->Dnrm2(blas,n,b,1); io=0; do { /* outer loop */ ++io; nlMultMatrixVector(M,x,r); blas->Daxpy(blas,n,-1.,b,1,r,1); beta=blas->Dnrm2(blas,n,r,1); blas->Dcopy(blas,n,r,1,v[0],1); blas->Dscal(blas,n,1./beta,v[0],1); y[0]=beta; j=0; uij=0; do { /* inner loop: j=0,...,m-1 */ u0j=uij; nlMultMatrixVector(M,v[j],v[j+1]); blas->Dgemv( blas,Transpose,n,j+1,1.,V,n,v[j+1],1,0.,U+u0j,1 ); blas->Dgemv( blas,NoTranspose,n,j+1,-1.,V,n,U+u0j,1,1.,v[j+1],1 ); h=blas->Dnrm2(blas,n,v[j+1],1); blas->Dscal(blas,n,1./h,v[j+1],1);
for (i=0; i<j; ++i) { /* apply the previous Givens rotations to the new column of U */ double tmp = c[i]*U[uij]-s[i]*U[uij+1]; U[uij+1] = s[i]*U[uij]+c[i]*U[uij+1]; U[uij] = tmp; ++uij; } { /* compute the new Givens rotation that eliminates h */ rd = U[uij]; dd = sqrt(rd*rd+h*h); c[j] = rd/dd; s[j] = -h/dd; U[uij] = dd; ++uij; } { /* rotate the right-hand side y (y[j+1] was zero) */ y[j+1] = s[j]*y[j]; y[j] = c[j]*y[j]; } ++j; } while ( j<m && fabs(y[j])>=eps*nrm2b );
{ /* minimize with respect to y */ blas->Dtpsv( blas, UpperTriangle, NoTranspose, NotUnitTriangular, j,U,y,1 ); /* correct X */ blas->Dgemv(blas,NoTranspose,n,j,-1.,V,n,y,1,1.,x,1); } } while ( fabs(y[j])>=eps*nrm2b && (m*(io-1)+j) < (NLint)max_iter); /* Count the inner iterations */ its = m*(io-1)+j; blas->sq_bnorm = nrm2b*nrm2b; blas->sq_rnorm = y[j]*y[j]; NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, V); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, U); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, y); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, c); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, s); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, v); return (NLuint)its; }
/* Main driver routine */ NLuint nlSolveSystemIterative( NLBlas_t blas, NLMatrix M, NLMatrix P, NLdouble* b_in, NLdouble* x_in, NLenum solver, double eps, NLuint max_iter, NLuint inner_iter ) { NLuint N = M->n; NLuint result=0; NLdouble rnorm=0.0; NLdouble bnorm=0.0; double* b = b_in; double* x = x_in; nl_assert(M->m == M->n); if(!nlBlasHasUnifiedMemory(blas)) { b = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n); blas->Memcpy( blas, b, NL_DEVICE_MEMORY, b_in, NL_HOST_MEMORY, (size_t)N*sizeof(double) ); x = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n); blas->Memcpy( blas, x, NL_DEVICE_MEMORY, x_in, NL_HOST_MEMORY, (size_t)N*sizeof(double) ); } switch(solver) { case NL_CG: if(P == NULL) { result = nlSolveSystem_CG(blas,M,b,x,eps,max_iter); } else { result = nlSolveSystem_PRE_CG(blas,M,P,b,x,eps,max_iter); } break; case NL_BICGSTAB: if(P == NULL) { result = nlSolveSystem_BICGSTAB(blas,M,b,x,eps,max_iter); } else { result = nlSolveSystem_PRE_BICGSTAB(blas,M,P,b,x,eps,max_iter); } break; case NL_GMRES: result = nlSolveSystem_GMRES(blas,M,b,x,eps,max_iter,inner_iter); break; default: nl_assert_not_reached; }
/* Get residual norm and rhs norm from BLAS context */ if(nlCurrentContext != NULL) { bnorm = sqrt(blas->sq_bnorm); rnorm = sqrt(blas->sq_rnorm); if(bnorm == 0.0) { nlCurrentContext->error = rnorm; if(nlCurrentContext->verbose) { nl_printf("in OpenNL : ||Ax-b|| = %e\n",nlCurrentContext->error); } } else { nlCurrentContext->error = rnorm/bnorm; if(nlCurrentContext->verbose) { nl_printf("in OpenNL : ||Ax-b||/||b|| = %e\n", nlCurrentContext->error ); } } nlCurrentContext->used_iterations = result; } if(!nlBlasHasUnifiedMemory(blas)) { blas->Memcpy( blas, x_in, NL_HOST_MEMORY, x, NL_DEVICE_MEMORY, (size_t)N*sizeof(double) ); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n, x); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n, b); } return result; }
/******* extracted from nl_preconditioners.c *******/ typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLdouble* diag_inv; } NLJacobiPreconditioner; static void nlJacobiPreconditionerDestroy(NLJacobiPreconditioner* M) { NL_DELETE_ARRAY(M->diag_inv); } static
void nlJacobiPreconditionerMult( NLJacobiPreconditioner* M, const double* x, double* y ) { NLuint i; for(i=0; i<M->n; ++i) { y[i] = x[i] * M->diag_inv[i]; } nlHostBlas()->flops += (NLulong)(M->n); } NLMatrix nlNewJacobiPreconditioner(NLMatrix M_in) { NLSparseMatrix* M = NULL; NLJacobiPreconditioner* result = NULL; NLuint i; nl_assert(M_in->type == NL_MATRIX_SPARSE_DYNAMIC); nl_assert(M_in->m == M_in->n); M = (NLSparseMatrix*)M_in; result = NL_NEW(NLJacobiPreconditioner); result->m = M->m; result->n = M->n; result->type = NL_MATRIX_OTHER; result->destroy_func = (NLDestroyMatrixFunc)nlJacobiPreconditionerDestroy; result->mult_func = (NLMultMatrixVectorFunc)nlJacobiPreconditionerMult; result->diag_inv = NL_NEW_ARRAY(double, M->n); for(i=0; i<M->n; ++i) { result->diag_inv[i] = (M->diag[i] == 0.0) ? 1.0 : 1.0/M->diag[i]; } return (NLMatrix)result; }
typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLSparseMatrix* M; double omega; NLdouble* work; } NLSSORPreconditioner; static void nlSSORPreconditionerDestroy(NLSSORPreconditioner* M) { NL_DELETE_ARRAY(M->work); }
static void nlSparseMatrixMultLowerInverse( NLSparseMatrix* A, const NLdouble* x, NLdouble* y, double omega ) { NLuint n = A->n; NLdouble* diag = A->diag; NLuint i; NLuint ij; NLCoeff* c = NULL; NLdouble S; nl_assert(A->storage & NL_MATRIX_STORE_SYMMETRIC); nl_assert(A->storage & NL_MATRIX_STORE_ROWS); for(i=0; i<n; i++) { NLRowColumn* Ri = &(A->row[i]); S = 0; for(ij=0; ij < Ri->size; ij++) { c = &(Ri->coeff[ij]); nl_parano_assert(c->index <= i); if(c->index != i) { S += c->value * y[c->index]; } } nlHostBlas()->flops += (NLulong)(2*Ri->size); y[i] = (x[i] - S) * omega / diag[i]; } nlHostBlas()->flops += (NLulong)(n*3); }
static void nlSparseMatrixMultUpperInverse( NLSparseMatrix* A, const NLdouble* x, NLdouble* y, NLdouble omega ) { NLuint n = A->n; NLdouble* diag = A->diag; NLint i; NLuint ij; NLCoeff* c = NULL; NLdouble S; nl_assert(A->storage & NL_MATRIX_STORE_SYMMETRIC); nl_assert(A->storage & NL_MATRIX_STORE_COLUMNS); for(i=(NLint)(n-1); i>=0; i--) { NLRowColumn* Ci = &(A->column[i]); S = 0; for(ij=0; ij < Ci->size; ij++) { c = &(Ci->coeff[ij]); nl_parano_assert(c->index >= i); if((NLint)(c->index) != i) { S += c->value * y[c->index]; } } nlHostBlas()->flops += (NLulong)(2*Ci->size); y[i] = (x[i] - S) * omega / diag[i]; } nlHostBlas()->flops += (NLulong)(n*3); }
static void nlSSORPreconditionerMult( NLSSORPreconditioner* P, const double* x, double* y ) { NLdouble* diag = P->M->diag; NLuint i; nlSparseMatrixMultLowerInverse( P->M, x, P->work, P->omega ); for(i=0; i<P->n; i++) { P->work[i] *= (diag[i] / P->omega); } nlHostBlas()->flops += (NLulong)(P->n); nlSparseMatrixMultUpperInverse( P->M, P->work, y, P->omega ); nlHostBlas()->Dscal(nlHostBlas(),(NLint)P->n, 2.0 - P->omega, y, 1); }
NLMatrix nlNewSSORPreconditioner(NLMatrix M_in, double omega) { NLSparseMatrix* M = NULL; NLSSORPreconditioner* result = NULL; nl_assert(M_in->type == NL_MATRIX_SPARSE_DYNAMIC); nl_assert(M_in->m == M_in->n); M = (NLSparseMatrix*)M_in; result = NL_NEW(NLSSORPreconditioner); result->m = M->m; result->n = M->n; result->type = NL_MATRIX_OTHER; result->destroy_func = (NLDestroyMatrixFunc)nlSSORPreconditionerDestroy; result->mult_func = (NLMultMatrixVectorFunc)nlSSORPreconditionerMult; result->M = M; result->work = NL_NEW_ARRAY(NLdouble, result->n); result->omega = omega; return (NLMatrix)result; } /******* extracted from nl_superlu.c *******/ #ifdef NL_OS_UNIX # ifdef NL_OS_APPLE # define SUPERLU_LIB_NAME "libsuperlu_5.dylib" #
else # define SUPERLU_LIB_NAME "libsuperlu.so" # endif #else # define SUPERLU_LIB_NAME "libsuperlu.xxx" #endif typedef enum { SLU_NC, /* column-wise, no supernode */ SLU_NCP, /* column-wise, column-permuted, no supernode (The consecutive columns of nonzeros, after permutation, may not be stored contiguously.) */ SLU_NR, /* row-wize, no supernode */ SLU_SC, /* column-wise, supernode */ SLU_SCP, /* supernode, column-wise, permuted */ SLU_SR, /* row-wise, supernode */ SLU_DN, /* Fortran style column-wise storage for dense matrix */ SLU_NR_loc /* distributed compressed row format */ } Stype_t; typedef enum { SLU_S, /* single */ SLU_D, /* double */ SLU_C, /* single complex */ SLU_Z /* double complex */ } Dtype_t; typedef enum { SLU_GE, /* general */ SLU_TRLU, /* lower triangular, unit diagonal */ SLU_TRUU, /* upper triangular, unit diagonal */ SLU_TRL, /* lower triangular */ SLU_TRU, /* upper triangular */ SLU_SYL, /* symmetric, store lower half */ SLU_SYU, /* symmetric, store upper half */ SLU_HEL, /* Hermitian, store lower half */ SLU_HEU /* Hermitian, store upper half */ } Mtype_t; typedef int int_t; typedef struct { int_t nnz; /* number of nonzeros in the matrix */ void *nzval; /* pointer to array of nonzero values, packed by raw */ int_t *colind; /* pointer to array of columns indices of the nonzeros */ int_t *rowptr; /* pointer to array of beginning of rows in nzval[] and colind[] */ /* Note: Zero-based indexing is used; rowptr[] has nrow+1 entries, the last one pointing beyond the last row, so that rowptr[nrow] = nnz. */ } NRformat; typedef struct { Stype_t Stype; /* Storage type: interprets the storage structure pointed to by *Store. */ Dtype_t Dtype; /* Data type. */ Mtype_t Mtype; /* Matrix type: describes the mathematical property of the matrix. 
*/ int_t nrow; /* number of rows */ int_t ncol; /* number of columns */ void *Store; /* pointer to the actual storage of the matrix */ } SuperMatrix; /* Stype == SLU_DN */ typedef struct { int_t lda; /* leading dimension */ void *nzval; /* array of size lda*ncol to represent a dense matrix */ } DNformat; typedef enum {NO, YES} yes_no_t; typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t; typedef enum {NOROWPERM, LargeDiag, MY_PERMR} rowperm_t; typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD, METIS_AT_PLUS_A, PARMETIS, ZOLTAN, MY_PERMC} colperm_t; typedef enum {NOTRANS, TRANS, CONJ} trans_t; typedef enum {NOEQUIL, ROW, COL, BOTH} DiagScale_t; typedef enum {NOREFINE, SLU_SINGLE=1, SLU_DOUBLE, SLU_EXTRA} IterRefine_t; typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; typedef enum {HEAD, TAIL} stack_end_t; typedef enum {SYSTEM, USER} LU_space_t; typedef enum {ONE_NORM, TWO_NORM, INF_NORM} norm_t; typedef enum {SILU, SMILU_1, SMILU_2, SMILU_3} milu_t; typedef struct { fact_t Fact; yes_no_t Equil; colperm_t ColPerm; trans_t Trans; IterRefine_t IterRefine; double DiagPivotThresh; yes_no_t SymmetricMode; yes_no_t PivotGrowth; yes_no_t ConditionNumber; rowperm_t RowPerm; int ILU_DropRule; double ILU_DropTol; /* threshold for dropping */ double ILU_FillFactor; /* gamma in the secondary dropping */ norm_t ILU_Norm; /* infinity-norm, 1-norm, or 2-norm */ double ILU_FillTol; /* threshold for zero pivot perturbation */ milu_t ILU_MILU; double ILU_MILU_Dim; /* Dimension of PDE (if available) */ yes_no_t ParSymbFact; yes_no_t ReplaceTinyPivot; /* used in SuperLU_DIST */ yes_no_t SolveInitialized; yes_no_t RefineInitialized; yes_no_t PrintStat; int nnzL, nnzU; /* used to store nnzs for now */ int num_lookaheads; /* num of levels in look-ahead */ yes_no_t lookahead_etree; /* use etree computed from the serial symbolic factorization */ yes_no_t SymPattern; /* symmetric factorization */ } superlu_options_t; typedef void* superlu_options_ptr; typedef float flops_t; typedef unsigned char Logical; typedef struct { int *panel_histo; /* histogram of panel size distribution */ double *utime; /* running time at various phases */ flops_t *ops; /* operation count at various phases */ int TinyPivots; /* number of tiny pivots */ int RefineSteps; /* number of iterative refinement steps */ int expansions; /* number of memory expansions (SuperLU4) */ } SuperLUStat_t; /*! 
\brief Headers for 4 types of dynamatically managed memory */ typedef struct e_node { int size; /* length of the memory that has been used */ void *mem; /* pointer to the new malloc'd store */ } ExpHeader; typedef struct { int size; int used; int top1; /* grow upward, relative to &array[0] */ int top2; /* grow downward */ void *array; } LU_stack_t; typedef struct { int *xsup; /* supernode and column mapping */ int *supno; int *lsub; /* compressed L subscripts */ int *xlsub; void *lusup; /* L supernodes */ int *xlusup; void *ucol; /* U columns */ int *usub; int *xusub; int nzlmax; /* current max size of lsub */ int nzumax; /* " " " ucol */ int nzlumax; /* " " " lusup */ int n; /* number of columns in the matrix */ LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ int num_expansions; ExpHeader *expanders; /* Array of pointers to 4 types of memory */ LU_stack_t stack; /* use user supplied memory */ } GlobalLU_t; typedef void (*FUNPTR_set_default_options)(superlu_options_ptr options); typedef void (*FUNPTR_ilu_set_default_options)(superlu_options_ptr options); typedef void (*FUNPTR_StatInit)(SuperLUStat_t *); typedef void (*FUNPTR_StatFree)(SuperLUStat_t *); typedef void (*FUNPTR_dCreate_CompCol_Matrix)( SuperMatrix *, int, int, int, const double *, const int *, const int *, Stype_t, Dtype_t, Mtype_t); typedef void (*FUNPTR_dCreate_Dense_Matrix)( SuperMatrix *, int, int, const double *, int, Stype_t, Dtype_t, Mtype_t); typedef void (*FUNPTR_Destroy_SuperNode_Matrix)(SuperMatrix *); typedef void (*FUNPTR_Destroy_CompCol_Matrix)(SuperMatrix *); typedef void (*FUNPTR_Destroy_CompCol_Permuted)(SuperMatrix *); typedef void (*FUNPTR_Destroy_SuperMatrix_Store)(SuperMatrix *); typedef void (*FUNPTR_dgssv)( superlu_options_ptr, SuperMatrix *, int *, int *, SuperMatrix *, SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int * ); typedef void (*FUNPTR_dgstrs)( trans_t, SuperMatrix *, SuperMatrix *, int *, int *, SuperMatrix *, SuperLUStat_t*, int * ); typedef void (*FUNPTR_get_perm_c)(int, SuperMatrix *, int *); typedef void (*FUNPTR_sp_preorder)( superlu_options_t *, SuperMatrix*, int*, int*, SuperMatrix* ); typedef int (*FUNPTR_sp_ienv)(int); typedef int (*FUNPTR_input_error)(const char *, int *); typedef void (*FUNPTR_dgstrf) (superlu_options_t *options, SuperMatrix *A, int relax, int panel_size, int *etree, void *work, int lwork, int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu, /* persistent to facilitate multiple factorizations */ SuperLUStat_t *stat, int *info ); typedef struct { FUNPTR_set_default_options set_default_options; FUNPTR_ilu_set_default_options ilu_set_default_options; FUNPTR_StatInit StatInit; FUNPTR_StatFree StatFree; FUNPTR_dCreate_CompCol_Matrix dCreate_CompCol_Matrix; FUNPTR_dCreate_Dense_Matrix dCreate_Dense_Matrix; FUNPTR_Destroy_SuperNode_Matrix Destroy_SuperNode_Matrix; FUNPTR_Destroy_CompCol_Matrix Destroy_CompCol_Matrix; FUNPTR_Destroy_CompCol_Permuted Destroy_CompCol_Permuted; FUNPTR_Destroy_SuperMatrix_Store Destroy_SuperMatrix_Store; FUNPTR_dgssv dgssv; FUNPTR_dgstrs dgstrs; FUNPTR_get_perm_c get_perm_c; FUNPTR_sp_preorder sp_preorder; FUNPTR_sp_ienv sp_ienv; FUNPTR_dgstrf dgstrf; FUNPTR_input_error input_error; NLdll DLL_handle; } SuperLUContext; static SuperLUContext* SuperLU() { static SuperLUContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_SUPERLU() { return SuperLU()->DLL_handle != NULL && 
SuperLU()->set_default_options != NULL && SuperLU()->ilu_set_default_options != NULL && SuperLU()->StatInit != NULL && SuperLU()->StatFree != NULL && SuperLU()->dCreate_CompCol_Matrix != NULL && SuperLU()->dCreate_Dense_Matrix != NULL && SuperLU()->Destroy_SuperNode_Matrix != NULL && SuperLU()->Destroy_CompCol_Matrix != NULL && SuperLU()->Destroy_CompCol_Permuted != NULL && SuperLU()->Destroy_SuperMatrix_Store != NULL && SuperLU()->dgssv != NULL && SuperLU()->dgstrs != NULL && SuperLU()->get_perm_c != NULL && SuperLU()->sp_preorder != NULL && SuperLU()->sp_ienv != NULL && SuperLU()->dgstrf != NULL && SuperLU()->input_error != NULL; } static void nlTerminateExtension_SUPERLU(void) { if(SuperLU()->DLL_handle != NULL) { nlCloseDLL(SuperLU()->DLL_handle); SuperLU()->DLL_handle = NULL; } } #define find_superlu_func(name) \ if( \ ( \ SuperLU()->name = \ (FUNPTR_##name)nlFindFunction(SuperLU()->DLL_handle,#name) \ ) == NULL \ ) { \ nlError("nlInitExtension_SUPERLU","function not found"); \ nlError("nlInitExtension_SUPERLU",#name); \ return NL_FALSE; \ } NLboolean nlInitExtension_SUPERLU(void) { NLenum flags = NL_LINK_NOW | NL_LINK_USE_FALLBACK; if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(SuperLU()->DLL_handle != NULL) { return nlExtensionIsInitialized_SUPERLU(); } SuperLU()->DLL_handle = nlOpenDLL(SUPERLU_LIB_NAME, flags); if(SuperLU()->DLL_handle == NULL) { return NL_FALSE; } find_superlu_func(set_default_options); find_superlu_func(ilu_set_default_options); find_superlu_func(StatInit); find_superlu_func(StatFree); find_superlu_func(dCreate_CompCol_Matrix); find_superlu_func(dCreate_Dense_Matrix); find_superlu_func(Destroy_SuperNode_Matrix); find_superlu_func(Destroy_CompCol_Matrix); find_superlu_func(Destroy_CompCol_Permuted); find_superlu_func(Destroy_SuperMatrix_Store); find_superlu_func(dgssv); find_superlu_func(dgstrs); find_superlu_func(get_perm_c); find_superlu_func(sp_preorder); find_superlu_func(sp_ienv); find_superlu_func(dgstrf); find_superlu_func(input_error); atexit(nlTerminateExtension_SUPERLU); return NL_TRUE; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; SuperMatrix L; SuperMatrix U; int* perm_r; int* perm_c; trans_t trans; } NLSuperLUFactorizedMatrix; static void nlSuperLUFactorizedMatrixDestroy(NLSuperLUFactorizedMatrix* M) { SuperLU()->Destroy_SuperNode_Matrix(&M->L); SuperLU()->Destroy_CompCol_Matrix(&M->U); NL_DELETE_ARRAY(M->perm_r); NL_DELETE_ARRAY(M->perm_c); } static void nlSuperLUFactorizedMatrixMult( NLSuperLUFactorizedMatrix* M, const double* x, double* y ) { SuperMatrix B; SuperLUStat_t stat; int info = 0; NLuint i; /* Create vector */ SuperLU()->dCreate_Dense_Matrix( &B, (int)(M->n), 1, y, (int)(M->n), SLU_DN, /* Fortran-type column-wise storage */ SLU_D, /* doubles */ SLU_GE /* general */ ); /* copy rhs onto y (superLU matrix-vector product expects it here */ for(i = 0; i < M->n; i++){ y[i] = x[i]; } /* Call SuperLU triangular solve */ SuperLU()->StatInit(&stat) ; SuperLU()->dgstrs( M->trans, &M->L, &M->U, M->perm_c, M->perm_r, &B, &stat, &info ); SuperLU()->StatFree(&stat) ; /* Only the "store" structure needs to be * deallocated (the array has been allocated * by client code). */ SuperLU()->Destroy_SuperMatrix_Store(&B) ; } /* * Copied from SUPERLU/dgssv.c, removed call to linear solve. 
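 * The factorization (column ordering, elimination tree, dgstrf) is done once
 * here; the triangular solves are performed later, one per right-hand side,
 * by nlSuperLUFactorizedMatrixMult() above, which calls dgstrs().
 *
 * Usage sketch (illustration only; M, b and x are hypothetical caller data,
 * and the SUPERLU extension must have been initialized successfully with
 * nlInitExtension_SUPERLU()): factorize once, then each call to
 * nlMultMatrixVector(LU, b, x) overwrites x with the solution of M x = b.
 *
 *   NLMatrix LU = nlMatrixFactorize_SUPERLU(M, NL_SUPERLU_EXT);
 *   if(LU != NULL) {
 *       nlMultMatrixVector(LU, b, x);
 *       nlDeleteMatrix(LU);
 *   }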
*/ static void dgssv_factorize_only( superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, SuperLUStat_t *stat, int *info, trans_t *trans ) { SuperMatrix *AA = NULL; /* A in SLU_NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int lwork = 0, *etree, i; GlobalLU_t Glu; /* Not needed on return. */ /* Set default values for some parameters */ int panel_size; /* panel size */ int relax; /* no of columns in a relaxed snodes */ int permc_spec; nl_assert(A->Stype == SLU_NR || A->Stype == SLU_NC); *trans = NOTRANS; if ( options->Fact != DOFACT ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; if ( *info != 0 ) { i = -(*info); SuperLU()->input_error("SUPERLU/OpenNL dgssv_factorize_only", &i); return; } /* Convert A to SLU_NC format when necessary. */ if ( A->Stype == SLU_NR ) { NRformat *Astore = (NRformat*)A->Store; AA = NL_NEW(SuperMatrix); SuperLU()->dCreate_CompCol_Matrix( AA, A->ncol, A->nrow, Astore->nnz, (double*)Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype ); *trans = TRANS; } else { if ( A->Stype == SLU_NC ) AA = A; } nl_assert(AA != NULL); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = COLAMD: approximate minimum degree column ordering * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) SuperLU()->get_perm_c(permc_spec, AA, perm_c); etree = NL_NEW_ARRAY(int,A->ncol); SuperLU()->sp_preorder(options, AA, perm_c, etree, &AC); panel_size = SuperLU()->sp_ienv(1); relax = SuperLU()->sp_ienv(2); SuperLU()->dgstrf(options, &AC, relax, panel_size, etree, NULL, lwork, perm_c, perm_r, L, U, &Glu, stat, info); NL_DELETE_ARRAY(etree); SuperLU()->Destroy_CompCol_Permuted(&AC); if ( A->Stype == SLU_NR ) { SuperLU()->Destroy_SuperMatrix_Store(AA); NL_DELETE(AA); } } NLMatrix nlMatrixFactorize_SUPERLU( NLMatrix M, NLenum solver ) { NLSuperLUFactorizedMatrix* LU = NULL; NLCRSMatrix* CRS = NULL; SuperMatrix superM; NLuint n = M->n; superlu_options_t options; SuperLUStat_t stat; NLint info = 0; /* status code */ nl_assert(M->m == M->n); if(M->type == NL_MATRIX_CRS) { CRS = (NLCRSMatrix*)M; } else if(M->type == NL_MATRIX_SPARSE_DYNAMIC) { CRS = (NLCRSMatrix*)nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)M); } nl_assert(!(CRS->symmetric_storage)); LU = NL_NEW(NLSuperLUFactorizedMatrix); LU->m = M->m; LU->n = M->n; LU->type = NL_MATRIX_OTHER; LU->destroy_func = (NLDestroyMatrixFunc)(nlSuperLUFactorizedMatrixDestroy); LU->mult_func = (NLMultMatrixVectorFunc)(nlSuperLUFactorizedMatrixMult); LU->perm_c = NL_NEW_ARRAY(int, n); LU->perm_r = NL_NEW_ARRAY(int, n); SuperLU()->dCreate_CompCol_Matrix( &superM, (int)n, (int)n, (int)nlCRSMatrixNNZ(CRS), CRS->val, (int*)CRS->colind, (int*)CRS->rowptr, SLU_NR, /* Row_wise, no supernode */ SLU_D, /* doubles */ CRS->symmetric_storage ? 
SLU_SYL : SLU_GE ); SuperLU()->set_default_options(&options); switch(solver) { case NL_SUPERLU_EXT: { options.ColPerm = NATURAL; } break; case NL_PERM_SUPERLU_EXT: { options.ColPerm = COLAMD; } break; case NL_SYMMETRIC_SUPERLU_EXT: { options.ColPerm = MMD_AT_PLUS_A; options.SymmetricMode = YES; } break; default: nl_assert_not_reached; } SuperLU()->StatInit(&stat); dgssv_factorize_only( &options, &superM, LU->perm_c, LU->perm_r, &LU->L, &LU->U, &stat, &info, &LU->trans ); SuperLU()->StatFree(&stat); /* * Only the "store" structure needs to be deallocated * (the arrays have been allocated by us, they are in CRS). */ SuperLU()->Destroy_SuperMatrix_Store(&superM); if((NLMatrix)CRS != M) { nlDeleteMatrix((NLMatrix)CRS); } if(info != 0) { NL_DELETE(LU); LU = NULL; } return (NLMatrix)LU; } /******* extracted from nl_cholmod.c *******/ #ifdef NL_OS_UNIX # ifdef NL_OS_APPLE # define CHOLMOD_LIB_NAME "libcholmod.dylib" # else # define CHOLMOD_LIB_NAME "libcholmod.so" # endif #else # define CHOLMOD_LIB_NAME "libcholmod.xxx" #endif /* Excerpt from cholmod_core.h */ /* A dense matrix in column-oriented form. It has no itype since it contains * no integers. Entry in row i and column j is located in x [i+j*d]. */ typedef struct cholmod_dense_struct { size_t nrow ; /* the matrix is nrow-by-ncol */ size_t ncol ; size_t nzmax ; /* maximum number of entries in the matrix */ size_t d ; /* leading dimension (d >= nrow must hold) */ void *x ; /* size nzmax or 2*nzmax, if present */ void *z ; /* size nzmax, if present */ int xtype ; /* pattern, real, complex, or zomplex */ int dtype ; /* x and z double or float */ } cholmod_dense ; /* A sparse matrix stored in compressed-column form. */ typedef struct cholmod_sparse_struct { size_t nrow ; /* the matrix is nrow-by-ncol */ size_t ncol ; size_t nzmax ; /* maximum number of entries in the matrix */ /* pointers to int or SuiteSparse_long: */ void *p ; /* p [0..ncol], the column pointers */ void *i ; /* i [0..nzmax-1], the row indices */ /* for unpacked matrices only: */ void *nz ; /* nz [0..ncol-1], the # of nonzeros in each col. In * packed form, the nonzero pattern of column j is in * A->i [A->p [j] ... A->p [j+1]-1]. In unpacked form, column j is in * A->i [A->p [j] ... A->p [j]+A->nz[j]-1] instead. In both cases, the * numerical values (if present) are in the corresponding locations in * the array x (or z if A->xtype is CHOLMOD_ZOMPLEX). */ /* pointers to double or float: */ void *x ; /* size nzmax or 2*nzmax, if present */ void *z ; /* size nzmax, if present */ int stype ; /* Describes what parts of the matrix are considered: * * 0: matrix is "unsymmetric": use both upper and lower triangular parts * (the matrix may actually be symmetric in pattern and value, but * both parts are explicitly stored and used). May be square or * rectangular. * >0: matrix is square and symmetric, use upper triangular part. * Entries in the lower triangular part are ignored. * <0: matrix is square and symmetric, use lower triangular part. * Entries in the upper triangular part are ignored. * * Note that stype>0 and stype<0 are different for cholmod_sparse and * cholmod_triplet. See the cholmod_triplet data structure for more * details. */ int itype ; /* CHOLMOD_INT: p, i, and nz are int. * CHOLMOD_INTLONG: p is SuiteSparse_long, * i and nz are int. 
* CHOLMOD_LONG: p, i, and nz are SuiteSparse_long */ int xtype ; /* pattern, real, complex, or zomplex */ int dtype ; /* x and z are double or float */ int sorted ; /* TRUE if columns are sorted, FALSE otherwise */ int packed ; /* TRUE if packed (nz ignored), FALSE if unpacked * (nz is required) */ } cholmod_sparse ; typedef void* cholmod_common_ptr; typedef cholmod_dense* cholmod_dense_ptr; typedef cholmod_sparse* cholmod_sparse_ptr; typedef void* cholmod_factor_ptr; typedef enum cholmod_xtype_enum { CHOLMOD_PATTERN =0, CHOLMOD_REAL =1, CHOLMOD_COMPLEX =2, CHOLMOD_ZOMPLEX =3 } cholmod_xtype; typedef enum cholmod_solve_type_enum { CHOLMOD_A =0, CHOLMOD_LDLt =1, CHOLMOD_LD =2, CHOLMOD_DLt =3, CHOLMOD_L =4, CHOLMOD_Lt =5, CHOLMOD_D =6, CHOLMOD_P =7, CHOLMOD_Pt =8 } cholmod_solve_type; typedef int cholmod_stype; typedef void (*FUNPTR_cholmod_start)(cholmod_common_ptr); typedef cholmod_sparse_ptr (*FUNPTR_cholmod_allocate_sparse)( size_t m, size_t n, size_t nnz, int sorted, int packed, int stype, int xtype, cholmod_common_ptr ); typedef cholmod_dense_ptr (*FUNPTR_cholmod_allocate_dense)( size_t m, size_t n, size_t d, int xtype, cholmod_common_ptr ); typedef cholmod_factor_ptr (*FUNPTR_cholmod_analyze)( cholmod_sparse_ptr A, cholmod_common_ptr ); typedef int (*FUNPTR_cholmod_factorize)( cholmod_sparse_ptr A, cholmod_factor_ptr L, cholmod_common_ptr ); typedef cholmod_dense_ptr (*FUNPTR_cholmod_solve)( int solve_type, cholmod_factor_ptr, cholmod_dense_ptr, cholmod_common_ptr ); typedef void (*FUNPTR_cholmod_free_factor)( cholmod_factor_ptr*, cholmod_common_ptr ); typedef void (*FUNPTR_cholmod_free_dense)( cholmod_dense_ptr*, cholmod_common_ptr ); typedef void (*FUNPTR_cholmod_free_sparse)( cholmod_sparse_ptr*, cholmod_common_ptr ); typedef void (*FUNPTR_cholmod_finish)(cholmod_common_ptr); typedef struct { char cholmod_common[16384]; FUNPTR_cholmod_start cholmod_start; FUNPTR_cholmod_allocate_sparse cholmod_allocate_sparse; FUNPTR_cholmod_allocate_dense cholmod_allocate_dense; FUNPTR_cholmod_analyze cholmod_analyze; FUNPTR_cholmod_factorize cholmod_factorize; FUNPTR_cholmod_solve cholmod_solve; FUNPTR_cholmod_free_factor cholmod_free_factor; FUNPTR_cholmod_free_sparse cholmod_free_sparse; FUNPTR_cholmod_free_dense cholmod_free_dense; FUNPTR_cholmod_finish cholmod_finish; NLdll DLL_handle; } CHOLMODContext; static CHOLMODContext* CHOLMOD() { static CHOLMODContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_CHOLMOD() { return CHOLMOD()->DLL_handle != NULL && CHOLMOD()->cholmod_start != NULL && CHOLMOD()->cholmod_allocate_sparse != NULL && CHOLMOD()->cholmod_allocate_dense != NULL && CHOLMOD()->cholmod_analyze != NULL && CHOLMOD()->cholmod_factorize != NULL && CHOLMOD()->cholmod_solve != NULL && CHOLMOD()->cholmod_free_factor != NULL && CHOLMOD()->cholmod_free_sparse != NULL && CHOLMOD()->cholmod_free_dense != NULL && CHOLMOD()->cholmod_finish != NULL ; } #define find_cholmod_func(name) \ if( \ ( \ CHOLMOD()->name = \ (FUNPTR_##name)nlFindFunction(CHOLMOD()->DLL_handle,#name) \ ) == NULL \ ) { \ nlError("nlInitExtension_CHOLMOD","function not found"); \ return NL_FALSE; \ } static void nlTerminateExtension_CHOLMOD(void) { if(CHOLMOD()->DLL_handle != NULL) { CHOLMOD()->cholmod_finish(&CHOLMOD()->cholmod_common); nlCloseDLL(CHOLMOD()->DLL_handle); CHOLMOD()->DLL_handle = NULL; } } NLboolean nlInitExtension_CHOLMOD(void) { NLenum flags = NL_LINK_NOW | NL_LINK_USE_FALLBACK; 
if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(CHOLMOD()->DLL_handle != NULL) { return nlExtensionIsInitialized_CHOLMOD(); } /* * MKL has a built-in CHOLMOD that conflicts with * the CHOLMOD used by OpenNL (to be fixed). For now * we simply output a warning message and deactivate the * CHOLMOD extension if the MKL extension was initialized * before. */ if(NLMultMatrixVector_MKL != NULL) { nl_fprintf( stderr, "CHOLMOD extension incompatible with MKL (deactivating)" ); return NL_FALSE; } CHOLMOD()->DLL_handle = nlOpenDLL(CHOLMOD_LIB_NAME,flags); if(CHOLMOD()->DLL_handle == NULL) { return NL_FALSE; } find_cholmod_func(cholmod_start); find_cholmod_func(cholmod_allocate_sparse); find_cholmod_func(cholmod_allocate_dense); find_cholmod_func(cholmod_analyze); find_cholmod_func(cholmod_factorize); find_cholmod_func(cholmod_solve); find_cholmod_func(cholmod_free_factor); find_cholmod_func(cholmod_free_sparse); find_cholmod_func(cholmod_free_dense); find_cholmod_func(cholmod_finish); CHOLMOD()->cholmod_start(&CHOLMOD()->cholmod_common); atexit(nlTerminateExtension_CHOLMOD); return NL_TRUE; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; cholmod_factor_ptr L; } NLCholmodFactorizedMatrix; static void nlCholmodFactorizedMatrixDestroy(NLCholmodFactorizedMatrix* M) { CHOLMOD()->cholmod_free_factor(&M->L, &CHOLMOD()->cholmod_common); } static void nlCholmodFactorizedMatrixMult( NLCholmodFactorizedMatrix* M, const double* x, double* y ) { /* * TODO: see whether CHOLDMOD can use user-allocated vectors * (and avoid copy) */ cholmod_dense_ptr X=CHOLMOD()->cholmod_allocate_dense( M->n, 1, M->n, CHOLMOD_REAL, &CHOLMOD()->cholmod_common ); cholmod_dense_ptr Y=NULL; memcpy(X->x, x, M->n*sizeof(double)); Y = CHOLMOD()->cholmod_solve( CHOLMOD_A, M->L, X, &CHOLMOD()->cholmod_common ); memcpy(y, Y->x, M->n*sizeof(double)); CHOLMOD()->cholmod_free_dense(&X, &CHOLMOD()->cholmod_common); CHOLMOD()->cholmod_free_dense(&Y, &CHOLMOD()->cholmod_common); } NLMatrix nlMatrixFactorize_CHOLMOD( NLMatrix M, NLenum solver ) { NLCholmodFactorizedMatrix* LLt = NULL; NLCRSMatrix* CRS = NULL; cholmod_sparse_ptr cM= NULL; NLuint nnz, cur, i, j, jj; int* rowptr = NULL; int* colind = NULL; double* val = NULL; NLuint n = M->n; nl_assert(solver == NL_CHOLMOD_EXT); nl_assert(M->m == M->n); if(M->type == NL_MATRIX_CRS) { CRS = (NLCRSMatrix*)M; } else if(M->type == NL_MATRIX_SPARSE_DYNAMIC) { /* * Note: since we convert once again into symmetric storage, * we could also directly read the NLSparseMatrix there instead * of copying once more... */ CRS = (NLCRSMatrix*)nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)M); } LLt = NL_NEW(NLCholmodFactorizedMatrix); LLt->m = M->m; LLt->n = M->n; LLt->type = NL_MATRIX_OTHER; LLt->destroy_func = (NLDestroyMatrixFunc)(nlCholmodFactorizedMatrixDestroy); LLt->mult_func = (NLMultMatrixVectorFunc)(nlCholmodFactorizedMatrixMult); /* * Compute required nnz, if matrix is not already with symmetric storage, * ignore entries in the upper triangular part. 
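 * For illustration: if row i = 1 of the CRS matrix holds the entries
 * (j=0, 2.0), (j=1, 4.0) and (j=3, 1.0), only the first two satisfy
 * j <= i and are counted; the (j=3) entry lies in the upper triangle
 * and is recovered by CHOLMOD from the symmetric storage convention.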
*/ nnz=0; for(i=0; i<n; ++i) { for(jj=CRS->rowptr[i]; jj<CRS->rowptr[i+1]; ++jj) { j=CRS->colind[jj]; if(j <= i) { ++nnz; } } } /* * Copy CRS matrix into CHOLMOD matrix (and ignore upper triangular part) */ cM = CHOLMOD()->cholmod_allocate_sparse( n, n, nnz, /* Dimensions and number of non-zeros */ NL_FALSE, /* Sorted = false */ NL_TRUE, /* Packed = true */ 1, /* stype (-1 = lower triangular, 1 = upper triangular) */ CHOLMOD_REAL, /* Entries are real numbers */ &CHOLMOD()->cholmod_common ); rowptr = (int*)cM->p; colind = (int*)cM->i; val = (double*)cM->x; cur = 0; for(i=0; i<n; ++i) { rowptr[i] = (int)cur; for(jj=CRS->rowptr[i]; jj<CRS->rowptr[i+1]; ++jj) { j = CRS->colind[jj]; if(j <= i) { val[cur] = CRS->val[jj]; colind[cur] = (int)j; ++cur; } } } rowptr[n] = (int)cur; nl_assert(cur==nnz); LLt->L = CHOLMOD()->cholmod_analyze(cM, &CHOLMOD()->cholmod_common); if(!CHOLMOD()->cholmod_factorize(cM, LLt->L, &CHOLMOD()->cholmod_common)) { CHOLMOD()->cholmod_free_factor(&LLt->L, &CHOLMOD()->cholmod_common); NL_DELETE(LLt); } CHOLMOD()->cholmod_free_sparse(&cM, &CHOLMOD()->cholmod_common); if((NLMatrix)CRS != M) { nlDeleteMatrix((NLMatrix)CRS); } return (NLMatrix)(LLt); }
/******* extracted from nl_arpack.c *******/ #ifdef NL_OS_UNIX # ifdef NL_OS_APPLE # define ARPACK_LIB_NAME "libarpack.dylib" # else # define ARPACK_LIB_NAME "libarpack.so" # endif #else # define ARPACK_LIB_NAME "libarpack.dll" #endif typedef int ARint; typedef int ARlogical;
/* double precision symmetric routines */ typedef void (*FUNPTR_dsaupd)( ARint *ido, char *bmat, ARint *n, char *which, ARint *nev, double *tol, double *resid, ARint *ncv, double *V, ARint *ldv, ARint *iparam, ARint *ipntr, double *workd, double *workl, ARint *lworkl, ARint *info ); typedef void (*FUNPTR_dseupd)( ARlogical *rvec, char *HowMny, ARlogical *select, double *d, double *Z, ARint *ldz, double *sigma, char *bmat, ARint *n, char *which, ARint *nev, double *tol, double *resid, ARint *ncv, double *V, ARint *ldv, ARint *iparam, ARint *ipntr, double *workd, double *workl, ARint *lworkl, ARint *info );
/* double precision nonsymmetric routines */ typedef void (*FUNPTR_dnaupd)( ARint *ido, char *bmat, ARint *n, char *which, ARint *nev, double *tol, double *resid, ARint *ncv, double *V, ARint *ldv, ARint *iparam, ARint *ipntr, double *workd, double *workl, ARint *lworkl, ARint *info ); typedef void (*FUNPTR_dneupd)( ARlogical *rvec, char *HowMny, ARlogical *select, double *dr, double *di, double *Z, ARint *ldz, double *sigmar, double *sigmai, double *workev, char *bmat, ARint *n, char *which, ARint *nev, double *tol, double *resid, ARint *ncv, double *V, ARint *ldv, ARint *iparam, ARint *ipntr, double *workd, double *workl, ARint *lworkl, ARint *info );
typedef struct { FUNPTR_dsaupd dsaupd; FUNPTR_dseupd dseupd; FUNPTR_dnaupd dnaupd; FUNPTR_dneupd dneupd; NLdll DLL_handle; } ARPACKContext; static ARPACKContext* ARPACK() { static ARPACKContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_ARPACK() { return ARPACK()->DLL_handle != NULL && ARPACK()->dsaupd != NULL && ARPACK()->dseupd != NULL && ARPACK()->dnaupd != NULL && ARPACK()->dneupd != NULL; } static void nlTerminateExtension_ARPACK(void) { if(ARPACK()->DLL_handle != NULL) { nlCloseDLL(ARPACK()->DLL_handle); ARPACK()->DLL_handle = NULL; } } static char* u(const char* str) { static char buff[1000]; sprintf(buff, "%s_", str); return buff; } #define find_arpack_func(name) \ if( \ ( \ ARPACK()->name = \ (FUNPTR_##name)nlFindFunction(ARPACK()->DLL_handle,u(#name)) \ )
== NULL \ ) { \ nlError("nlInitExtension_ARPACK","function not found"); \ nlError("nlInitExtension_ARPACK",u(#name)); \ return NL_FALSE; \ } NLboolean nlInitExtension_ARPACK(void) { NLenum flags = NL_LINK_NOW | NL_LINK_USE_FALLBACK; if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(ARPACK()->DLL_handle != NULL) { return nlExtensionIsInitialized_ARPACK(); } ARPACK()->DLL_handle = nlOpenDLL(ARPACK_LIB_NAME, flags); if(ARPACK()->DLL_handle == NULL) { return NL_FALSE; } find_arpack_func(dsaupd); find_arpack_func(dseupd); find_arpack_func(dnaupd); find_arpack_func(dneupd); atexit(nlTerminateExtension_ARPACK); return NL_TRUE; } static NLMatrix create_OP(NLboolean symmetric) { NLuint n = nlCurrentContext->M->n; NLuint i; NLMatrix result = NULL; if(nlCurrentContext->eigen_shift != 0.0) { /* * A = M */ NLSparseMatrix* A = NL_NEW(NLSparseMatrix); nlSparseMatrixConstruct(A, n, n, NL_MATRIX_STORE_ROWS); nlSparseMatrixAddMatrix(A, 1.0, nlCurrentContext->M); if(nlCurrentContext->B == NULL) { /* * A = A - shift * Id */ for(i=0; ieigen_shift); } } else { /* * A = A - shift * B */ nlSparseMatrixAddMatrix( A, -nlCurrentContext->eigen_shift, nlCurrentContext->B ); } /* * OP = A^{-1} */ if(nlCurrentContext->verbose) { nl_printf("Factorizing matrix...\n"); } result = nlMatrixFactorize( (NLMatrix)A, symmetric ? NL_SYMMETRIC_SUPERLU_EXT : NL_PERM_SUPERLU_EXT ); if(nlCurrentContext->verbose) { nl_printf("Matrix factorized\n"); } nlDeleteMatrix((NLMatrix)A); } else { /* * OP = M^{-1} */ if(nlCurrentContext->verbose) { nl_printf("Factorizing matrix...\n"); } result = nlMatrixFactorize( nlCurrentContext->M, symmetric ? NL_SYMMETRIC_SUPERLU_EXT : NL_PERM_SUPERLU_EXT ); if(nlCurrentContext->verbose) { nl_printf("Matrix factorized\n"); } } if(nlCurrentContext->B != NULL) { /* * OP = OP * B */ result = nlMatrixNewFromProduct( result, NL_TRUE, /* mem. ownership transferred */ nlCurrentContext->B, NL_FALSE /* mem. ownership kept by context */ ); } return result; } static int eigencompare(const void* pi, const void* pj) { NLuint i = *(const NLuint*)pi; NLuint j = *(const NLuint*)pj; double vali = fabs(nlCurrentContext->temp_eigen_value[i]); double valj = fabs(nlCurrentContext->temp_eigen_value[j]); if(vali == valj) { return 0; } return vali < valj ? -1 : 1; } void nlEigenSolve_ARPACK(void) { NLboolean symmetric = nlCurrentContext->symmetric && (nlCurrentContext->B == NULL); int n = (int)nlCurrentContext->M->n; /* Dimension of the matrix */ int nev = /* Number of eigenvectors requested */ (int)nlCurrentContext->nb_systems; NLMatrix OP = create_OP(symmetric); int ncv = (int)(nev * 2.5); /* Length of Arnoldi factorization */ /* Rule of thumb in ARPACK documentation: ncv > 2 * nev */ int* iparam = NULL; int* ipntr = NULL; NLdouble* resid = NULL; NLdouble* workev = NULL; NLdouble* workd = NULL; NLdouble* workl = NULL; NLdouble* v = NULL; NLdouble* d = NULL; ARlogical* select = NULL; ARlogical rvec = 1; double sigmar = 0.0; double sigmai = 0.0; int ierr; int i,k,kk; int ldv = (int)n; char* bmat = (char*)"I"; /*Standard problem */ char* which = (char*)"LM"; /*Largest eigenvalues, but we invert->smallest */ char* howmny = (char*)"A"; /*which eigens should be computed: all */ double tol = nlCurrentContext->threshold; int ido = 0; /* reverse communication variable (which operation ?) 
*/ int info = 1; /* start with initial value of resid */ int lworkl; /* size of work array */ NLboolean converged = NL_FALSE; NLdouble value; int index; int* sorted; /* indirection array for sorting eigenpairs */ if(ncv > n) { ncv = n; } if(nev > n) { nev = n; } if(nev + 2 > ncv) { nev = ncv - 2; } if(symmetric) { lworkl = ncv * (ncv + 8) ; } else { lworkl = 3*ncv*ncv + 6*ncv ; } iparam = NL_NEW_ARRAY(int, 11); ipntr = NL_NEW_ARRAY(int, 14); iparam[1-1] = 1; /* ARPACK chooses the shifts */ iparam[3-1] = (int)nlCurrentContext->max_iterations; iparam[7-1] = 1; /* Normal mode (we do not use shift-invert (3) since we do our own shift-invert */ workev = NL_NEW_ARRAY(NLdouble, 3*ncv); workd = NL_NEW_ARRAY(NLdouble, 3*n); resid = NL_NEW_ARRAY(NLdouble, n); for(i=0; iverbose) { if(symmetric) { nl_printf("calling dsaupd()\n"); } else { nl_printf("calling dnaupd()\n"); } } while(!converged) { /* if(nlCurrentContext->verbose) { fprintf(stderr, "."); fflush(stderr); } */ if(symmetric) { ARPACK()->dsaupd( &ido, bmat, &n, which, &nev, &tol, resid, &ncv, v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info ); } else { ARPACK()->dnaupd( &ido, bmat, &n, which, &nev, &tol, resid, &ncv, v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info ); } if(ido == 1) { nlMultMatrixVector( OP, workd+ipntr[1-1]-1, /*The "-1"'s are for FORTRAN-to-C conversion */ workd+ipntr[2-1]-1 /*to keep the same indices as in ARPACK doc */ ); } else { converged = NL_TRUE; } } if(info < 0) { if(symmetric) { nl_fprintf(stderr, "\nError with dsaupd(): %d\n", info); } else { nl_fprintf(stderr, "\nError with dnaupd(): %d\n", info); } } else { if(nlCurrentContext->verbose) { fprintf(stderr, "\nconverged\n"); } select = NL_NEW_ARRAY(ARlogical, ncv); for(i=0; iverbose) { if(symmetric) { nl_printf("calling dseupd()\n"); } else { nl_printf("calling dneupd()\n"); } } if(symmetric) { ARPACK()->dseupd( &rvec, howmny, select, d, v, &ldv, &sigmar, bmat, &n, which, &nev, &tol, resid, &ncv, v, &ldv, iparam, ipntr, workd, workl, &lworkl, &ierr ); } else { ARPACK()->dneupd( &rvec, howmny, select, d, d+ncv, v, &ldv, &sigmar, &sigmai, workev, bmat, &n, which, &nev, &tol, resid, &ncv, v, &ldv, iparam, ipntr, workd, workl, &lworkl, &ierr ) ; } if(nlCurrentContext->verbose) { if(ierr != 0) { if(symmetric) { nl_fprintf(stderr, "Error with dseupd(): %d\n", ierr); } else { nl_fprintf(stderr, "Error with dneupd(): %d\n", ierr); } } else { if(symmetric) { nl_printf("dseupd() OK, nconv= %d\n", iparam[3-1]); } else { nl_printf("dneupd() OK, nconv= %d\n", iparam[3-1]); } } } NL_DELETE_ARRAY(select); } for(i=0; ieigen_shift ; } /* Make it visible to the eigen_compare function */ nlCurrentContext->temp_eigen_value = d; sorted = NL_NEW_ARRAY(int, nev); for(i=0; itemp_eigen_value = NULL; for(k=0; keigen_value[k] = d[kk]; for(i=0; i<(int)nlCurrentContext->nb_variables; ++i) { if(!nlCurrentContext->variable_is_locked[i]) { index = (int)nlCurrentContext->variable_index[i]; nl_assert(index < n); value = v[kk*n+index]; NL_BUFFER_ITEM( nlCurrentContext->variable_buffer[k],(NLuint)i ) = value; } } } NL_DELETE_ARRAY(sorted); NL_DELETE_ARRAY(workl); NL_DELETE_ARRAY(d); NL_DELETE_ARRAY(v); NL_DELETE_ARRAY(resid); NL_DELETE_ARRAY(workd); NL_DELETE_ARRAY(workev); nlDeleteMatrix(OP); NL_DELETE_ARRAY(iparam); NL_DELETE_ARRAY(ipntr); } /******* extracted from nl_mkl.c *******/ typedef unsigned int MKL_INT; typedef void (*FUNPTR_mkl_cspblas_dcsrgemv)( const char *transa, const MKL_INT *m, const double *a, const MKL_INT *ia, const MKL_INT *ja, const double *x, double *y ); typedef void 
(*FUNPTR_mkl_cspblas_dcsrsymv)( const char *transa, const MKL_INT *m, const double *a, const MKL_INT *ia, const MKL_INT *ja, const double *x, double *y ); typedef struct { NLdll DLL_mkl_intel_lp64; NLdll DLL_mkl_intel_thread; NLdll DLL_mkl_core; NLdll DLL_iomp5; FUNPTR_mkl_cspblas_dcsrgemv mkl_cspblas_dcsrgemv; FUNPTR_mkl_cspblas_dcsrsymv mkl_cspblas_dcsrsymv; } MKLContext; static MKLContext* MKL() { static MKLContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_MKL() { if( MKL()->DLL_iomp5 == NULL || MKL()->DLL_mkl_core == NULL || MKL()->DLL_mkl_intel_thread == NULL || MKL()->DLL_mkl_intel_lp64 == NULL || MKL()->mkl_cspblas_dcsrgemv == NULL || MKL()->mkl_cspblas_dcsrsymv == NULL ) { return NL_FALSE; } return NL_TRUE; } #define find_mkl_func(name) \ if( \ ( \ MKL()->name = \ (FUNPTR_##name)nlFindFunction( \ MKL()->DLL_mkl_intel_lp64,#name \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_MKL","function not found"); \ return NL_FALSE; \ } static void nlTerminateExtension_MKL(void) { if(!nlExtensionIsInitialized_MKL()) { return; } nlCloseDLL(MKL()->DLL_mkl_intel_lp64); nlCloseDLL(MKL()->DLL_mkl_intel_thread); nlCloseDLL(MKL()->DLL_mkl_core); nlCloseDLL(MKL()->DLL_iomp5); } NLMultMatrixVectorFunc NLMultMatrixVector_MKL = NULL; static void NLMultMatrixVector_MKL_impl(NLMatrix M_in, const double* x, double* y) { NLCRSMatrix* M = (NLCRSMatrix*)(M_in); nl_debug_assert(M_in->type == NL_MATRIX_CRS); if(M->symmetric_storage) { MKL()->mkl_cspblas_dcsrsymv( "N", /* No transpose */ &M->m, M->val, M->rowptr, M->colind, x, y ); } else { MKL()->mkl_cspblas_dcsrgemv( "N", /* No transpose */ &M->m, M->val, M->rowptr, M->colind, x, y ); } } #define INTEL_PREFIX "/opt/intel/" #define LIB_DIR "lib/intel64/" #define MKL_PREFIX INTEL_PREFIX "mkl/" LIB_DIR NLboolean nlInitExtension_MKL(void) { NLenum flags = NL_LINK_LAZY | NL_LINK_GLOBAL; if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(MKL()->DLL_mkl_intel_lp64 != NULL) { return nlExtensionIsInitialized_MKL(); } MKL()->DLL_iomp5 = nlOpenDLL( INTEL_PREFIX LIB_DIR "libiomp5.so", flags ); MKL()->DLL_mkl_core = nlOpenDLL( MKL_PREFIX "libmkl_core.so", flags ); MKL()->DLL_mkl_intel_thread = nlOpenDLL( MKL_PREFIX "libmkl_intel_thread.so", flags ); MKL()->DLL_mkl_intel_lp64 = nlOpenDLL( MKL_PREFIX "libmkl_intel_lp64.so", flags ); if( MKL()->DLL_iomp5 == NULL || MKL()->DLL_mkl_core == NULL || MKL()->DLL_mkl_intel_thread == NULL || MKL()->DLL_mkl_intel_lp64 == NULL ) { return NL_FALSE; } find_mkl_func(mkl_cspblas_dcsrgemv); find_mkl_func(mkl_cspblas_dcsrsymv); if(nlExtensionIsInitialized_MKL()) { NLMultMatrixVector_MKL = NLMultMatrixVector_MKL_impl; } atexit(nlTerminateExtension_MKL); return NL_TRUE; } /******* extracted from nl_cuda.c *******/ /* CUDA structures and functions */ /* Repeated here so that one can compile OpenNL without */ /* requiring CUDA to be installed in the system. 
*/ struct cudaDeviceProp { char name[256]; size_t totalGlobalMem; size_t sharedMemPerBlock; int regsPerBlock; int warpSize; size_t memPitch; int maxThreadsPerBlock; int maxThreadsDim[3]; int maxGridSize[3]; int clockRate; size_t totalConstMem; int major; int minor; size_t textureAlignment; size_t texturePitchAlignment; int deviceOverlap; int multiProcessorCount; int kernelExecTimeoutEnabled; int integrated; int canMapHostMemory; int computeMode; int maxTexture1D; int maxTexture1DMipmap; int maxTexture1DLinear; int maxTexture2D[2]; int maxTexture2DMipmap[2]; int maxTexture2DLinear[3]; int maxTexture2DGather[2]; int maxTexture3D[3]; int maxTexture3DAlt[3]; int maxTextureCubemap; int maxTexture1DLayered[2]; int maxTexture2DLayered[3]; int maxTextureCubemapLayered[2]; int maxSurface1D; int maxSurface2D[2]; int maxSurface3D[3]; int maxSurface1DLayered[2]; int maxSurface2DLayered[3]; int maxSurfaceCubemap; int maxSurfaceCubemapLayered[2]; size_t surfaceAlignment; int concurrentKernels; int ECCEnabled; int pciBusID; int pciDeviceID; int pciDomainID; int tccDriver; int asyncEngineCount; int unifiedAddressing; int memoryClockRate; int memoryBusWidth; int l2CacheSize; int maxThreadsPerMultiProcessor; int streamPrioritiesSupported; int globalL1CacheSupported; int localL1CacheSupported; size_t sharedMemPerMultiprocessor; int regsPerMultiprocessor; int managedMemSupported; int isMultiGpuBoard; int multiGpuBoardGroupID; int singleToDoublePrecisionPerfRatio; int pageableMemoryAccess; int concurrentManagedAccess; char padding[1024]; /* More room for future evolutions */ }; enum cudaComputeMode { cudaComputeModeDefault = 0, cudaComputeModeExclusive = 1, cudaComputeModeProhibited = 2, cudaComputeModeExclusiveProcess = 3 }; enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3, cudaMemcpyDefault = 4 }; typedef int cudaError_t; typedef cudaError_t (*FUNPTR_cudaGetDeviceCount)(int* device_count); typedef cudaError_t (*FUNPTR_cudaGetDeviceProperties)( struct cudaDeviceProp *props, int device ); typedef cudaError_t (*FUNPTR_cudaDeviceReset)(void); typedef cudaError_t (*FUNPTR_cudaMalloc)(void **devPtr, size_t size); typedef cudaError_t (*FUNPTR_cudaFree)(void* devPtr); typedef cudaError_t (*FUNPTR_cudaMemcpy)( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind ); #define find_cuda_func(name) \ if( \ ( \ CUDA()->name = \ (FUNPTR_##name)nlFindFunction( \ CUDA()->DLL_cudart,#name \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_CUDA: function not found", #name); \ return NL_FALSE; \ } /* CUBLAS structures and functions */ struct cublasContext; typedef struct cublasContext *cublasHandle_t; typedef int cublasStatus_t; typedef enum { CUBLAS_SIDE_LEFT =0, CUBLAS_SIDE_RIGHT=1 } cublasSideMode_t; typedef enum { CUBLAS_FILL_MODE_LOWER=0, CUBLAS_FILL_MODE_UPPER=1 } cublasFillMode_t; typedef enum { CUBLAS_OP_N=0, CUBLAS_OP_T=1, CUBLAS_OP_C=2 } cublasOperation_t; typedef enum { CUBLAS_DIAG_NON_UNIT=0, CUBLAS_DIAG_UNIT=1 } cublasDiagType_t; typedef cublasStatus_t (*FUNPTR_cublasCreate)(cublasHandle_t* handle); typedef cublasStatus_t (*FUNPTR_cublasDestroy)(cublasHandle_t handle); typedef cublasStatus_t (*FUNPTR_cublasGetVersion)( cublasHandle_t handle, int* version ); typedef cublasStatus_t (*FUNPTR_cublasDdot)( cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result ); typedef cublasStatus_t (*FUNPTR_cublasDcopy)( cublasHandle_t handle, int n, const double *x, int incx, const double *y, 
int incy ); typedef cublasStatus_t (*FUNPTR_cublasDaxpy)( cublasHandle_t handle, int n, const double* alpha, const double *x, int incx, const double *y, int incy ); typedef cublasStatus_t (*FUNPTR_cublasDscal)( cublasHandle_t handle, int n, const double* alpha, const double *x, int incx ); typedef cublasStatus_t (*FUNPTR_cublasDnrm2)( cublasHandle_t handle, int n, const double *x, int incx, double* result ); typedef cublasStatus_t (*FUNPTR_cublasDdgmm)( cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const double* A, int lda, const double* x, int incx, double* C, int ldc ); typedef cublasStatus_t (*FUNPTR_cublasDgemv)( cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *alpha, const double *A, int lda, const double *x, int incx, const double *beta, double *y, int incy ); typedef cublasStatus_t (*FUNPTR_cublasDtpsv)( cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double *AP, double* x, int incx ); #define find_cublas_func(name) \ if( \ ( \ CUDA()->name = \ (FUNPTR_##name)nlFindFunction( \ CUDA()->DLL_cublas,#name "_v2" \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_CUDA: function not found", #name); \ return NL_FALSE; \ } #define find_cublas_func_v1(name) \ if( \ ( \ CUDA()->name = \ (FUNPTR_##name)nlFindFunction( \ CUDA()->DLL_cublas,#name \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_CUDA: function not found", #name); \ return NL_FALSE; \ } /* CUSPARSE structures and functions */ struct cusparseContext; typedef struct cusparseContext *cusparseHandle_t; typedef int cusparseStatus_t; struct cusparseMatDescr; typedef struct cusparseMatDescr *cusparseMatDescr_t; typedef enum { CUSPARSE_MATRIX_TYPE_GENERAL = 0, CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1, CUSPARSE_MATRIX_TYPE_HERMITIAN = 2, CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 } cusparseMatrixType_t; typedef enum { CUSPARSE_INDEX_BASE_ZERO = 0, CUSPARSE_INDEX_BASE_ONE = 1 } cusparseIndexBase_t; typedef enum { CUSPARSE_OPERATION_NON_TRANSPOSE = 0, CUSPARSE_OPERATION_TRANSPOSE = 1, CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 } cusparseOperation_t; struct cusparseHybMat; typedef struct cusparseHybMat *cusparseHybMat_t; typedef enum { CUSPARSE_HYB_PARTITION_AUTO = 0, CUSPARSE_HYB_PARTITION_USER = 1, CUSPARSE_HYB_PARTITION_MAX = 2 } cusparseHybPartition_t; typedef cusparseStatus_t (*FUNPTR_cusparseCreate)(cusparseHandle_t* handle); typedef cusparseStatus_t (*FUNPTR_cusparseDestroy)(cusparseHandle_t handle); typedef cusparseStatus_t (*FUNPTR_cusparseGetVersion)( cusparseHandle_t handle, int* version ); typedef cusparseStatus_t (*FUNPTR_cusparseCreateMatDescr)( cusparseMatDescr_t* descr ); typedef cusparseStatus_t (*FUNPTR_cusparseDestroyMatDescr)( cusparseMatDescr_t descr ); typedef cusparseStatus_t (*FUNPTR_cusparseSetMatType)( cusparseMatDescr_t descr, cusparseMatrixType_t mtype ); typedef cusparseStatus_t (*FUNPTR_cusparseSetMatIndexBase)( cusparseMatDescr_t descr, cusparseIndexBase_t ibase ); typedef cusparseStatus_t (*FUNPTR_cusparseDcsrmv)( cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz, const double *alpha, const cusparseMatDescr_t descrA, const double *csrSortedValA, const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *x, const double *beta, double *y ); typedef cusparseStatus_t (*FUNPTR_cusparseCreateHybMat)( cusparseHybMat_t *hybA ); typedef cusparseStatus_t (*FUNPTR_cusparseDestroyHybMat)( cusparseHybMat_t hybA ); typedef cusparseStatus_t (*FUNPTR_cusparseDcsr2hyb)( cusparseHandle_t handle, int m, int n, 
const cusparseMatDescr_t descrA, const double *csrSortedValA, const int *csrSortedRowPtrA, const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth, cusparseHybPartition_t partitionType ); typedef cusparseStatus_t (*FUNPTR_cusparseDhybmv)( cusparseHandle_t handle, cusparseOperation_t transA, const double *alpha, const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA, const double *x, const double *beta, double *y ); #define find_cusparse_func(name) \ if( \ ( \ CUDA()->name = \ (FUNPTR_##name)nlFindFunction( \ CUDA()->DLL_cusparse,#name \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_CUDA : function not found", #name); \ return NL_FALSE; \ } typedef struct { NLdll DLL_cudart; FUNPTR_cudaGetDeviceCount cudaGetDeviceCount; FUNPTR_cudaGetDeviceProperties cudaGetDeviceProperties; FUNPTR_cudaDeviceReset cudaDeviceReset; FUNPTR_cudaMalloc cudaMalloc; FUNPTR_cudaFree cudaFree; FUNPTR_cudaMemcpy cudaMemcpy; NLdll DLL_cublas; cublasHandle_t HNDL_cublas; FUNPTR_cublasCreate cublasCreate; FUNPTR_cublasDestroy cublasDestroy; FUNPTR_cublasGetVersion cublasGetVersion; FUNPTR_cublasDdot cublasDdot; FUNPTR_cublasDcopy cublasDcopy; FUNPTR_cublasDaxpy cublasDaxpy; FUNPTR_cublasDscal cublasDscal; FUNPTR_cublasDnrm2 cublasDnrm2; FUNPTR_cublasDdgmm cublasDdgmm; FUNPTR_cublasDgemv cublasDgemv; FUNPTR_cublasDtpsv cublasDtpsv; NLdll DLL_cusparse; cusparseHandle_t HNDL_cusparse; FUNPTR_cusparseCreate cusparseCreate; FUNPTR_cusparseDestroy cusparseDestroy; FUNPTR_cusparseGetVersion cusparseGetVersion; FUNPTR_cusparseCreateMatDescr cusparseCreateMatDescr; FUNPTR_cusparseDestroyMatDescr cusparseDestroyMatDescr; FUNPTR_cusparseSetMatType cusparseSetMatType; FUNPTR_cusparseSetMatIndexBase cusparseSetMatIndexBase; FUNPTR_cusparseDcsrmv cusparseDcsrmv; FUNPTR_cusparseCreateHybMat cusparseCreateHybMat; FUNPTR_cusparseDestroyHybMat cusparseDestroyHybMat; FUNPTR_cusparseDcsr2hyb cusparseDcsr2hyb; FUNPTR_cusparseDhybmv cusparseDhybmv; int devID; } CUDAContext; static CUDAContext* CUDA() { static CUDAContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_CUDA() { if( CUDA()->DLL_cudart == NULL || CUDA()->cudaGetDeviceCount == NULL || CUDA()->cudaGetDeviceProperties == NULL || CUDA()->cudaDeviceReset == NULL || CUDA()->cudaMalloc == NULL || CUDA()->cudaFree == NULL || CUDA()->cudaMemcpy == NULL || CUDA()->DLL_cublas == NULL || CUDA()->HNDL_cublas == NULL || CUDA()->cublasCreate == NULL || CUDA()->cublasDestroy == NULL || CUDA()->cublasGetVersion == NULL || CUDA()->cublasDdot == NULL || CUDA()->cublasDcopy == NULL || CUDA()->cublasDaxpy == NULL || CUDA()->cublasDscal == NULL || CUDA()->cublasDnrm2 == NULL || CUDA()->cublasDdgmm == NULL || CUDA()->DLL_cusparse == NULL || CUDA()->HNDL_cusparse == NULL || CUDA()->cusparseCreate == NULL || CUDA()->cusparseDestroy == NULL || CUDA()->cusparseGetVersion == NULL || CUDA()->cusparseCreateMatDescr == NULL || CUDA()->cusparseDestroyMatDescr == NULL || CUDA()->cusparseSetMatType == NULL || CUDA()->cusparseSetMatIndexBase == NULL || CUDA()->cusparseDcsrmv == NULL || CUDA()->cusparseCreateHybMat == NULL || CUDA()->cusparseDestroyHybMat == NULL || CUDA()->cusparseDcsr2hyb == NULL || CUDA()->cusparseDhybmv == NULL ) { return NL_FALSE; } return NL_TRUE; } static void nlTerminateExtension_CUDA(void) { if(!nlExtensionIsInitialized_CUDA()) { return; } CUDA()->cusparseDestroy(CUDA()->HNDL_cusparse); nlCloseDLL(CUDA()->DLL_cusparse); 
CUDA()->cublasDestroy(CUDA()->HNDL_cublas); nlCloseDLL(CUDA()->DLL_cublas); CUDA()->cudaDeviceReset(); nlCloseDLL(CUDA()->DLL_cudart); } static int ConvertSMVer2Cores(int major, int minor) { /* Defines for GPU Architecture types (using the SM version to determine the # of cores per SM */ typedef struct { int SM; /* 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version */ int Cores; } sSMtoCores; sSMtoCores nGpuArchCoresPerSM[] = { { 0x10, 8 }, /* Tesla Generation (SM 1.0) G80 class */ { 0x11, 8 }, /* Tesla Generation (SM 1.1) G8x class */ { 0x12, 8 }, /* Tesla Generation (SM 1.2) G9x class */ { 0x13, 8 }, /* Tesla Generation (SM 1.3) GT200 class */ { 0x20, 32 }, /* Fermi Generation (SM 2.0) GF100 class */ { 0x21, 48 }, /* Fermi Generation (SM 2.1) GF10x class */ { 0x30, 192}, /* Kepler Generation (SM 3.0) GK10x class */ { 0x35, 192}, /* Kepler Generation (SM 3.5) GK11x class */ { 0x50, 128}, /* Maxwell Generation (SM 5.0) GM10x class (yes, #cores smaller than with 3.x) */ { 0x52, 128}, /* Maxwell Generation (SM 5.2) GM20x class */ { 0x60, 64 }, /* Pascal Generation (SM 6.0) GP100,GP102 (yes, 64, but GP100 has superfast double precision) */ { 0x61, 128}, /* Pascal Generation (SM 6.1) GP104 class (but FP64 runs as 1/32 FP32 speed) */ { -1, -1 } }; int index = 0; while (nGpuArchCoresPerSM[index].SM != -1) { if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { return nGpuArchCoresPerSM[index].Cores; } index++; } /* If we don't find the values, we default use the previous one to run properly */ nl_printf( "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[8].Cores ); return nGpuArchCoresPerSM[8].Cores; } static int getBestDeviceID() { int current_device = 0, sm_per_multiproc = 0; int max_compute_perf = 0, max_perf_device = 0; int device_count = 0, best_SM_arch = 0; int compute_perf = 0; struct cudaDeviceProp deviceProp; CUDA()->cudaGetDeviceCount(&device_count); /* Find the best major SM Architecture GPU device */ while (current_device < device_count) { CUDA()->cudaGetDeviceProperties(&deviceProp, current_device); /* If this GPU is not running on Compute Mode prohibited, then we can add it to the list */ if (deviceProp.computeMode != cudaComputeModeProhibited) { if (deviceProp.major > 0 && deviceProp.major < 9999) { best_SM_arch = MAX(best_SM_arch, deviceProp.major); } } current_device++; } /* Find the best CUDA capable GPU device */ current_device = 0; while (current_device < device_count) { CUDA()->cudaGetDeviceProperties(&deviceProp, current_device); /* If this GPU is not running on Compute Mode prohibited, then we can add it to the list */ if (deviceProp.computeMode != cudaComputeModeProhibited) { if (deviceProp.major == 9999 && deviceProp.minor == 9999) { sm_per_multiproc = 1; } else { sm_per_multiproc = ConvertSMVer2Cores( deviceProp.major, deviceProp.minor ); } compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; if (compute_perf > max_compute_perf) { /* If we find GPU with SM major > 2, search only these */ if (best_SM_arch > 2) { /* If our device==dest_SM_arch, choose this, or else pass */ if (deviceProp.major == best_SM_arch) { max_compute_perf = compute_perf; max_perf_device = current_device; } } else { max_compute_perf = compute_perf; max_perf_device = current_device; } } } ++current_device; } return max_perf_device; } #ifdef NL_OS_UNIX # define LIBPREFIX "lib" # ifdef NL_OS_APPLE # define LIBEXTENSION ".dylib" # else # define LIBEXTENSION ".so" # endif #else # 
define LIBPREFIX # define LIBEXTENSION ".dll" #endif NLboolean nlInitExtension_CUDA(void) { struct cudaDeviceProp deviceProp; int cublas_version; int cusparse_version; NLenum flags = NL_LINK_LAZY | NL_LINK_GLOBAL; if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(nlExtensionIsInitialized_CUDA()) { return NL_TRUE; } CUDA()->DLL_cudart = nlOpenDLL( LIBPREFIX "cudart" LIBEXTENSION, flags ); find_cuda_func(cudaGetDeviceCount); find_cuda_func(cudaGetDeviceProperties); find_cuda_func(cudaDeviceReset); find_cuda_func(cudaMalloc); find_cuda_func(cudaFree); find_cuda_func(cudaMemcpy); CUDA()->devID = getBestDeviceID(); if(CUDA()->cudaGetDeviceProperties(&deviceProp, CUDA()->devID)) { nl_fprintf(stderr,"OpenNL CUDA: could not find a CUDA device\n"); return NL_FALSE; } nl_printf("OpenNL CUDA: Device ID = %d\n", CUDA()->devID); nl_printf("OpenNL CUDA: Device name=%s\n", deviceProp.name); nl_printf( "OpenNL CUDA: Device has %d Multi-Processors, " "%d cores per Multi-Processor, SM %d.%d compute capabilities\n", deviceProp.multiProcessorCount, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), deviceProp.major, deviceProp.minor ); nl_printf( "OpenNL CUDA: %d kB shared mem. per block, %d per MP\n", (int)(deviceProp.sharedMemPerBlock / 1024), (int)(deviceProp.sharedMemPerMultiprocessor / 1024) ); nl_printf( "OpenNL CUDA: %d regs. per block, %d per MP\n", deviceProp.regsPerBlock, deviceProp.regsPerMultiprocessor ); nl_printf( "OpenNL CUDA: warpsize=%d\n", deviceProp.warpSize ); if ((deviceProp.major * 0x10 + deviceProp.minor) < 0x11) { nl_fprintf(stderr, "OpenNL CUDA requires a minimum CUDA compute 1.1 capability\n"); CUDA()->cudaDeviceReset(); return NL_FALSE; } CUDA()->DLL_cublas = nlOpenDLL( LIBPREFIX "cublas" LIBEXTENSION, flags ); find_cublas_func(cublasCreate); find_cublas_func(cublasDestroy); find_cublas_func(cublasGetVersion); find_cublas_func(cublasDdot); find_cublas_func(cublasDaxpy); find_cublas_func(cublasDcopy); find_cublas_func(cublasDscal); find_cublas_func(cublasDnrm2); find_cublas_func(cublasDgemv); find_cublas_func(cublasDtpsv); find_cublas_func_v1(cublasDdgmm); if(CUDA()->cublasCreate(&CUDA()->HNDL_cublas)) { return NL_FALSE; } if(CUDA()->cublasGetVersion(CUDA()->HNDL_cublas, &cublas_version)) { return NL_FALSE; } nl_printf("OpenNL CUDA: cublas version = %d\n", cublas_version); CUDA()->DLL_cusparse = nlOpenDLL( LIBPREFIX "cusparse" LIBEXTENSION, flags ); find_cusparse_func(cusparseCreate); find_cusparse_func(cusparseDestroy); find_cusparse_func(cusparseGetVersion); find_cusparse_func(cusparseCreateMatDescr); find_cusparse_func(cusparseDestroyMatDescr); find_cusparse_func(cusparseSetMatType); find_cusparse_func(cusparseSetMatIndexBase); find_cusparse_func(cusparseDcsrmv); find_cusparse_func(cusparseCreateHybMat); find_cusparse_func(cusparseDestroyHybMat); find_cusparse_func(cusparseDcsr2hyb); find_cusparse_func(cusparseDhybmv); if(CUDA()->cusparseCreate(&CUDA()->HNDL_cusparse)) { return NL_FALSE; } if(CUDA()->cusparseGetVersion(CUDA()->HNDL_cusparse, &cusparse_version)) { return NL_FALSE; } nl_printf("OpenNL CUDA: cusparse version = %d\n", cusparse_version); if(!nlExtensionIsInitialized_CUDA()) { return NL_FALSE; } atexit(nlTerminateExtension_CUDA); return NL_TRUE; } static void nlCUDACheckImpl(int status, int line) { if(status != 0) { nl_fprintf(stderr,"nl_cuda.c:%d fatal error %d\n",line, status); CUDA()->cudaDeviceReset(); exit(-1); } } #define nlCUDACheck(status) nlCUDACheckImpl(status, __LINE__) typedef struct { NLuint m; NLuint n; NLenum 
type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; cusparseMatDescr_t descr; NLuint nnz; int* colind; int* rowptr; double* val; cusparseHybMat_t hyb; } NLCUDASparseMatrix; static void nlCRSMatrixCUDADestroyCRS(NLCUDASparseMatrix* Mcuda) { if(Mcuda->colind != NULL) { nlCUDACheck(CUDA()->cudaFree(Mcuda->colind)); Mcuda->colind = NULL; } if(Mcuda->rowptr != NULL) { nlCUDACheck(CUDA()->cudaFree(Mcuda->rowptr)); Mcuda->rowptr = NULL; } if(Mcuda->val != NULL) { nlCUDACheck(CUDA()->cudaFree(Mcuda->val)); Mcuda->val = NULL; } } static void nlCRSMatrixCUDADestroy(NLCUDASparseMatrix* Mcuda) { if(Mcuda->hyb != NULL) { nlCUDACheck(CUDA()->cusparseDestroyHybMat(Mcuda->hyb)); } nlCRSMatrixCUDADestroyCRS(Mcuda); nlCUDACheck(CUDA()->cusparseDestroyMatDescr(Mcuda->descr)); memset(Mcuda, 0, sizeof(*Mcuda)); } static void nlCRSMatrixCUDAMult( NLCUDASparseMatrix* Mcuda, const double* x, double* y ) { const double one = 1; const double zero = 0; if(Mcuda->hyb != NULL) { nlCUDACheck( CUDA()->cusparseDhybmv( CUDA()->HNDL_cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, Mcuda->descr, Mcuda->hyb, x, &zero, y ) ); } else { nlCUDACheck( CUDA()->cusparseDcsrmv( CUDA()->HNDL_cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, (int)Mcuda->m, (int)Mcuda->n, (int)Mcuda->nnz, &one, Mcuda->descr, Mcuda->val, Mcuda->rowptr, Mcuda->colind, x, &zero, y ) ); } nlCUDABlas()->flops += (NLulong)(2*Mcuda->nnz); } NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M_in) { NLCUDASparseMatrix* Mcuda = NL_NEW(NLCUDASparseMatrix); NLCRSMatrix* M = (NLCRSMatrix*)(M_in); size_t colind_sz, rowptr_sz, val_sz; nl_assert(M_in->type == NL_MATRIX_CRS); nlCUDACheck(CUDA()->cusparseCreateMatDescr(&Mcuda->descr)); if(M->symmetric_storage) { nlCUDACheck(CUDA()->cusparseSetMatType( Mcuda->descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC) ); } else { nlCUDACheck(CUDA()->cusparseSetMatType( Mcuda->descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); } nlCUDACheck(CUDA()->cusparseSetMatIndexBase( Mcuda->descr, CUSPARSE_INDEX_BASE_ZERO) ); Mcuda->m = M->m; Mcuda->n = M->n; Mcuda->nnz = nlCRSMatrixNNZ(M); colind_sz = (size_t)Mcuda->nnz*sizeof(int); rowptr_sz = (size_t)(Mcuda->m+1)*sizeof(int); val_sz = (size_t)Mcuda->nnz*sizeof(double); nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->colind,colind_sz)); nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->rowptr,rowptr_sz)); nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->val,val_sz)); nlCUDACheck(CUDA()->cudaMemcpy( Mcuda->colind, M->colind, colind_sz, cudaMemcpyHostToDevice) ); nlCUDACheck(CUDA()->cudaMemcpy( Mcuda->rowptr, M->rowptr, rowptr_sz, cudaMemcpyHostToDevice) ); nlCUDACheck(CUDA()->cudaMemcpy( Mcuda->val, M->val, val_sz, cudaMemcpyHostToDevice) ); Mcuda->hyb=NULL; if(!M->symmetric_storage) { nlCUDACheck(CUDA()->cusparseCreateHybMat(&Mcuda->hyb)); nlCUDACheck(CUDA()->cusparseDcsr2hyb( CUDA()->HNDL_cusparse, (int)M->m, (int)M->n, Mcuda->descr, Mcuda->val, Mcuda->rowptr, Mcuda->colind, Mcuda->hyb, 0, CUSPARSE_HYB_PARTITION_AUTO )); /* We no longer need the CRS part */ nlCRSMatrixCUDADestroyCRS(Mcuda); } Mcuda->type=NL_MATRIX_OTHER; Mcuda->destroy_func=(NLDestroyMatrixFunc)nlCRSMatrixCUDADestroy; Mcuda->mult_func=(NLMultMatrixVectorFunc)nlCRSMatrixCUDAMult; return (NLMatrix)Mcuda; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; double* val; } NLDiagonalMatrixCUDA; static void nlDiagonalMatrixCUDADestroy(NLDiagonalMatrixCUDA* Mcuda) { nlCUDACheck(CUDA()->cudaFree(Mcuda->val)); memset(Mcuda, 0, sizeof(*Mcuda)); } static void 
nlDiagonalMatrixCUDAMult( NLDiagonalMatrixCUDA* Mcuda, const double* x, double* y ) { int N = (int)Mcuda->n; /* * vector x vector component-wise product implemented * using diagonal matrix x matrix function. */ nlCUDACheck(CUDA()->cublasDdgmm( CUDA()->HNDL_cublas, CUBLAS_SIDE_LEFT, N, 1, x, N, Mcuda->val, 1, y, N )); nlCUDABlas()->flops += (NLulong)N; } static NLMatrix nlDiagonalMatrixCUDANew(const double* diag, NLuint n) { NLDiagonalMatrixCUDA* Mcuda = NL_NEW(NLDiagonalMatrixCUDA); Mcuda->m = n; Mcuda->n = n; Mcuda->type = NL_MATRIX_OTHER; nlCUDACheck(CUDA()->cudaMalloc( (void**)&Mcuda->val, n*sizeof(double)) ); nlCUDACheck(CUDA()->cudaMemcpy( Mcuda->val, diag, n*sizeof(double), cudaMemcpyHostToDevice) ); Mcuda->destroy_func=(NLDestroyMatrixFunc)nlDiagonalMatrixCUDADestroy; Mcuda->mult_func=(NLMultMatrixVectorFunc)nlDiagonalMatrixCUDAMult; return (NLMatrix)Mcuda; } NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M_in) { NLuint N = M_in->n; NLuint i,jj; double* diag = NULL; NLMatrix result = NULL; NLCRSMatrix* M = (NLCRSMatrix*)(M_in); nl_assert(M_in->type == NL_MATRIX_CRS); diag = NL_NEW_ARRAY(double,N); for(i=0; i<N; ++i) { for(jj=M->rowptr[i]; jj<M->rowptr[i+1]; ++jj) { if(M->colind[jj] == i) { diag[i] = M->val[jj]; } } } for(i=0; i<N; ++i) { diag[i] = ((diag[i] == 0.0) ? 1.0 : 1.0 / diag[i]); } result = nlDiagonalMatrixCUDANew(diag, N); NL_DELETE_ARRAY(diag); return result; } static void* cuda_blas_malloc( NLBlas_t blas, NLmemoryType type, size_t size ) { void* result = NULL; blas->used_ram[type] += (NLulong)size; blas->max_used_ram[type] = MAX( blas->max_used_ram[type],blas->used_ram[type] ); if(type == NL_HOST_MEMORY) { result = malloc(size); } else { nlCUDACheck(CUDA()->cudaMalloc(&result,size)); } return result; } static void cuda_blas_free( NLBlas_t blas, NLmemoryType type, size_t size, void* ptr ) { blas->used_ram[type] -= (NLulong)size; if(type == NL_HOST_MEMORY) { free(ptr); } else { nlCUDACheck(CUDA()->cudaFree(ptr)); } } static void cuda_blas_memcpy( NLBlas_t blas, void* to, NLmemoryType to_type, void* from, NLmemoryType from_type, size_t size ) { enum cudaMemcpyKind kind = cudaMemcpyDefault; nl_arg_used(blas); if(from_type == NL_HOST_MEMORY) { if(to_type == NL_HOST_MEMORY) { kind = cudaMemcpyHostToHost; } else { kind = cudaMemcpyHostToDevice; } } else { if(to_type == NL_HOST_MEMORY) { kind = cudaMemcpyDeviceToHost; } else { kind = cudaMemcpyDeviceToDevice; } } nlCUDACheck(CUDA()->cudaMemcpy(to, from, size, kind)); } static void cuda_blas_dcopy( NLBlas_t blas, int n, const double *x, int incx, double *y, int incy ) { nl_arg_used(blas); CUDA()->cublasDcopy(CUDA()->HNDL_cublas,n,x,incx,y,incy); } static double cuda_blas_ddot( NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy ) { double result = 0.0; blas->flops += (NLulong)(2*n); CUDA()->cublasDdot(CUDA()->HNDL_cublas,n,x,incx,y,incy,&result); return result; } static double cuda_blas_dnrm2( NLBlas_t blas, int n, const double *x, int incx ) { double result = 0.0; blas->flops += (NLulong)(2*n); CUDA()->cublasDnrm2(CUDA()->HNDL_cublas,n,x,incx,&result); return result; } static void cuda_blas_daxpy( NLBlas_t blas, int n, double a, const double *x, int incx, double *y, int incy ) { blas->flops += (NLulong)(2*n); CUDA()->cublasDaxpy(CUDA()->HNDL_cublas,n,&a,x,incx,y,incy); } static void cuda_blas_dscal( NLBlas_t blas, int n, double a, double *x, int incx ) { blas->flops += (NLulong)n; CUDA()->cublasDscal(CUDA()->HNDL_cublas,n,&a,x,incx); } static void cuda_blas_dgemv( NLBlas_t blas, MatrixTranspose trans, int m, int n, double alpha, const double *A, int ldA, const double *x, int incx, double beta, double *y, int incy ) { nl_arg_used(blas); /* TODO: update FLOPS */ CUDA()->cublasDgemv( CUDA()->HNDL_cublas, (cublasOperation_t)trans, m, n, &alpha, A, ldA, x, incx, &beta, y,
incy ); } static void cuda_blas_dtpsv( NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans, MatrixUnitTriangular diag, int n, const double *AP, double *x, int incx ) { nl_arg_used(blas); /* TODO: update FLOPS */ CUDA()->cublasDtpsv( CUDA()->HNDL_cublas, (cublasFillMode_t)uplo, (cublasOperation_t)trans, (cublasDiagType_t)diag, n, AP, x, incx ); } NLBlas_t nlCUDABlas() { static NLboolean initialized = NL_FALSE; static struct NLBlas blas; if(!initialized) { memset(&blas, 0, sizeof(blas)); blas.has_unified_memory = NL_FALSE; blas.Malloc = cuda_blas_malloc; blas.Free = cuda_blas_free; blas.Memcpy = cuda_blas_memcpy; blas.Dcopy = cuda_blas_dcopy; blas.Ddot = cuda_blas_ddot; blas.Dnrm2 = cuda_blas_dnrm2; blas.Daxpy = cuda_blas_daxpy; blas.Dscal = cuda_blas_dscal; blas.Dgemv = cuda_blas_dgemv; blas.Dtpsv = cuda_blas_dtpsv; nlBlasResetStats(&blas); initialized = NL_TRUE; } return &blas; } /******* extracted from nl_api.c *******/ static NLSparseMatrix* nlGetCurrentSparseMatrix() { NLSparseMatrix* result = NULL; switch(nlCurrentContext->matrix_mode) { case NL_STIFFNESS_MATRIX: { nl_assert(nlCurrentContext->M != NULL); nl_assert(nlCurrentContext->M->type == NL_MATRIX_SPARSE_DYNAMIC); result = (NLSparseMatrix*)(nlCurrentContext->M); } break; case NL_MASS_MATRIX: { nl_assert(nlCurrentContext->B != NULL); nl_assert(nlCurrentContext->B->type == NL_MATRIX_SPARSE_DYNAMIC); result = (NLSparseMatrix*)(nlCurrentContext->B); } break; default: nl_assert_not_reached; } return result; } NLboolean nlInitExtension(const char* extension) { if(!strcmp(extension, "SUPERLU")) { return nlInitExtension_SUPERLU(); } else if(!strcmp(extension, "CHOLMOD")) { return nlInitExtension_CHOLMOD(); } else if(!strcmp(extension, "ARPACK")) { /* * SUPERLU is needed by OpenNL's ARPACK driver * (factorizes the matrix for the shift-invert spectral * transform). */ return nlInitExtension_SUPERLU() && nlInitExtension_ARPACK(); } else if(!strcmp(extension, "MKL")) { return nlInitExtension_MKL(); } else if(!strcmp(extension, "CUDA")) { return nlInitExtension_CUDA(); } return NL_FALSE; } NLboolean nlExtensionIsInitialized(const char* extension) { if(!strcmp(extension, "SUPERLU")) { return nlExtensionIsInitialized_SUPERLU(); } else if(!strcmp(extension, "CHOLMOD")) { return nlExtensionIsInitialized_CHOLMOD(); } else if(!strcmp(extension, "ARPACK")) { /* * SUPERLU is needed by OpenNL's ARPACK driver * (factorizes the matrix for the shift-invert spectral * transform). */ return nlExtensionIsInitialized_SUPERLU() && nlExtensionIsInitialized_ARPACK(); } else if(!strcmp(extension, "MKL")) { return nlExtensionIsInitialized_MKL(); } else if(!strcmp(extension, "CUDA")) { return nlExtensionIsInitialized_CUDA(); } return NL_FALSE; } void nlInitialize(int argc, char** argv) { int i=0; char* ptr=NULL; char extension[255]; /* Find all the arguments with the form: * nl:=true|false * and try to activate the corresponding extensions. 
*/ for(i=1; i 3) && (ptr != NULL)) { strncpy(extension, argv[i]+3, (size_t)(ptr-argv[i]-3)); extension[(size_t)(ptr-argv[i]-3)] = '\0'; if(nlInitExtension(extension)) { nl_fprintf(stdout,"OpenNL %s: initialized\n", extension); } else { nl_fprintf(stderr,"OpenNL %s: could not initialize\n", extension); } } } } /* Get/Set parameters */ void nlSolverParameterd(NLenum pname, NLdouble param) { nlCheckState(NL_STATE_INITIAL); switch(pname) { case NL_THRESHOLD: { nl_assert(param >= 0); nlCurrentContext->threshold = (NLdouble)param; nlCurrentContext->threshold_defined = NL_TRUE; } break; case NL_OMEGA: { nl_range_assert(param,1.0,2.0); nlCurrentContext->omega = (NLdouble)param; } break; default: { nlError("nlSolverParameterd","Invalid parameter"); nl_assert_not_reached; } } } void nlSolverParameteri(NLenum pname, NLint param) { nlCheckState(NL_STATE_INITIAL); switch(pname) { case NL_SOLVER: { nlCurrentContext->solver = (NLenum)param; } break; case NL_NB_VARIABLES: { nl_assert(param > 0); nlCurrentContext->nb_variables = (NLuint)param; } break; case NL_NB_SYSTEMS: { nl_assert(param > 0); nlCurrentContext->nb_systems = (NLuint)param; } break; case NL_LEAST_SQUARES: { nlCurrentContext->least_squares = (NLboolean)param; } break; case NL_MAX_ITERATIONS: { nl_assert(param > 0); nlCurrentContext->max_iterations = (NLuint)param; nlCurrentContext->max_iterations_defined = NL_TRUE; } break; case NL_SYMMETRIC: { nlCurrentContext->symmetric = (NLboolean)param; } break; case NL_INNER_ITERATIONS: { nl_assert(param > 0); nlCurrentContext->inner_iterations = (NLuint)param; } break; case NL_PRECONDITIONER: { nlCurrentContext->preconditioner = (NLuint)param; nlCurrentContext->preconditioner_defined = NL_TRUE; } break; default: { nlError("nlSolverParameteri","Invalid parameter"); nl_assert_not_reached; } } } void nlGetBooleanv(NLenum pname, NLboolean* params) { switch(pname) { case NL_LEAST_SQUARES: { *params = nlCurrentContext->least_squares; } break; case NL_SYMMETRIC: { *params = nlCurrentContext->symmetric; } break; default: { nlError("nlGetBooleanv","Invalid parameter"); nl_assert_not_reached; } } } void nlGetDoublev(NLenum pname, NLdouble* params) { switch(pname) { case NL_THRESHOLD: { *params = nlCurrentContext->threshold; } break; case NL_OMEGA: { *params = nlCurrentContext->omega; } break; case NL_ERROR: { *params = nlCurrentContext->error; } break; case NL_ELAPSED_TIME: { *params = nlCurrentContext->elapsed_time; } break; case NL_GFLOPS: { if(nlCurrentContext->elapsed_time == 0) { *params = 0.0; } else { *params = (NLdouble)(nlCurrentContext->flops) / (nlCurrentContext->elapsed_time * 1e9); } } break; default: { nlError("nlGetDoublev","Invalid parameter"); nl_assert_not_reached; } } } void nlGetIntegerv(NLenum pname, NLint* params) { switch(pname) { case NL_SOLVER: { *params = (NLint)(nlCurrentContext->solver); } break; case NL_NB_VARIABLES: { *params = (NLint)(nlCurrentContext->nb_variables); } break; case NL_NB_SYSTEMS: { *params = (NLint)(nlCurrentContext->nb_systems); } break; case NL_LEAST_SQUARES: { *params = (NLint)(nlCurrentContext->least_squares); } break; case NL_MAX_ITERATIONS: { *params = (NLint)(nlCurrentContext->max_iterations); } break; case NL_SYMMETRIC: { *params = (NLint)(nlCurrentContext->symmetric); } break; case NL_USED_ITERATIONS: { *params = (NLint)(nlCurrentContext->used_iterations); } break; case NL_PRECONDITIONER: { *params = (NLint)(nlCurrentContext->preconditioner); } break; case NL_NNZ: { *params = (NLint)(nlMatrixNNZ(nlCurrentContext->M)); } break; default: { 
nlError("nlGetIntegerv","Invalid parameter"); nl_assert_not_reached; } } } /* Enable / Disable */ void nlEnable(NLenum pname) { switch(pname) { case NL_NORMALIZE_ROWS: { nl_assert(nlCurrentContext->state != NL_STATE_ROW); nlCurrentContext->normalize_rows = NL_TRUE; } break; case NL_VERBOSE: { nlCurrentContext->verbose = NL_TRUE; } break; case NL_VARIABLES_BUFFER: { nlCurrentContext->user_variable_buffers = NL_TRUE; } break; default: { nlError("nlEnable","Invalid parameter"); nl_assert_not_reached; } } } void nlDisable(NLenum pname) { switch(pname) { case NL_NORMALIZE_ROWS: { nl_assert(nlCurrentContext->state != NL_STATE_ROW); nlCurrentContext->normalize_rows = NL_FALSE; } break; case NL_VERBOSE: { nlCurrentContext->verbose = NL_FALSE; } break; case NL_VARIABLES_BUFFER: { nlCurrentContext->user_variable_buffers = NL_FALSE; } break; default: { nlError("nlDisable","Invalid parameter"); nl_assert_not_reached; } } } NLboolean nlIsEnabled(NLenum pname) { NLboolean result = NL_FALSE; switch(pname) { case NL_NORMALIZE_ROWS: { result = nlCurrentContext->normalize_rows; } break; case NL_VERBOSE: { result = nlCurrentContext->verbose; } break; case NL_VARIABLES_BUFFER: { result = nlCurrentContext->user_variable_buffers; } break; default: { nlError("nlIsEnables","Invalid parameter"); nl_assert_not_reached; } } return result; } /* NL functions */ void nlSetFunction(NLenum pname, NLfunc param) { switch(pname) { case NL_FUNC_SOLVER: nlCurrentContext->solver_func = (NLSolverFunc)(param); nlCurrentContext->solver = NL_SOLVER_USER; break; case NL_FUNC_MATRIX: nlDeleteMatrix(nlCurrentContext->M); nlCurrentContext->M = nlMatrixNewFromFunction( nlCurrentContext->n, nlCurrentContext->n, (NLMatrixFunc)param ); break; case NL_FUNC_PRECONDITIONER: nlDeleteMatrix(nlCurrentContext->P); nlCurrentContext->P = nlMatrixNewFromFunction( nlCurrentContext->n, nlCurrentContext->n, (NLMatrixFunc)param ); nlCurrentContext->preconditioner = NL_PRECOND_USER; break; case NL_FUNC_PROGRESS: nlCurrentContext->progress_func = (NLProgressFunc)(param); break; default: nlError("nlSetFunction","Invalid parameter"); nl_assert_not_reached; } } void nlGetFunction(NLenum pname, NLfunc* param) { switch(pname) { case NL_FUNC_SOLVER: *param = (NLfunc)(nlCurrentContext->solver_func); break; case NL_FUNC_MATRIX: *param = (NLfunc)(nlMatrixGetFunction(nlCurrentContext->M)); break; case NL_FUNC_PRECONDITIONER: *param = (NLfunc)(nlMatrixGetFunction(nlCurrentContext->P)); break; default: nlError("nlGetFunction","Invalid parameter"); nl_assert_not_reached; } } /* Get/Set Lock/Unlock variables */ void nlSetVariable(NLuint index, NLdouble value) { nlCheckState(NL_STATE_SYSTEM); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[0],index) = value; } void nlMultiSetVariable(NLuint index, NLuint system, NLdouble value) { nlCheckState(NL_STATE_SYSTEM); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables-1); nl_debug_range_assert(system, 0, nlCurrentContext->nb_systems-1); NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[system],index) = value; } NLdouble nlGetVariable(NLuint index) { nl_assert(nlCurrentContext->state != NL_STATE_INITIAL); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); return NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[0],index); } NLdouble nlMultiGetVariable(NLuint index, NLuint system) { nl_assert(nlCurrentContext->state != NL_STATE_INITIAL); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables-1); nl_debug_range_assert(system, 
0, nlCurrentContext->nb_systems-1); return NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[system],index); } void nlLockVariable(NLuint index) { nlCheckState(NL_STATE_SYSTEM); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); nlCurrentContext->variable_is_locked[index] = NL_TRUE; } void nlUnlockVariable(NLuint index) { nlCheckState(NL_STATE_SYSTEM); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); nlCurrentContext->variable_is_locked[index] = NL_FALSE; } NLboolean nlVariableIsLocked(NLuint index) { nl_assert(nlCurrentContext->state != NL_STATE_INITIAL); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); return nlCurrentContext->variable_is_locked[index]; } /* System construction */ static void nlVariablesToVector() { NLuint n=nlCurrentContext->n; NLuint k,i,index; NLdouble value; nl_assert(nlCurrentContext->x != NULL); for(k=0; k<nlCurrentContext->nb_systems; ++k) { for(i=0; i<nlCurrentContext->nb_variables; ++i) { if(!nlCurrentContext->variable_is_locked[i]) { index = nlCurrentContext->variable_index[i]; nl_assert(index < nlCurrentContext->n); value = NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],i); nlCurrentContext->x[index+k*n] = value; } } } } static void nlVectorToVariables() { NLuint n=nlCurrentContext->n; NLuint k,i,index; NLdouble value; nl_assert(nlCurrentContext->x != NULL); for(k=0; k<nlCurrentContext->nb_systems; ++k) { for(i=0; i<nlCurrentContext->nb_variables; ++i) { if(!nlCurrentContext->variable_is_locked[i]) { index = nlCurrentContext->variable_index[i]; nl_assert(index < nlCurrentContext->n); value = nlCurrentContext->x[index+k*n]; NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],i) = value; } } } } static void nlBeginSystem() { NLuint k; nlTransition(NL_STATE_INITIAL, NL_STATE_SYSTEM); nl_assert(nlCurrentContext->nb_variables > 0); nlCurrentContext->variable_buffer = NL_NEW_ARRAY( NLBufferBinding, nlCurrentContext->nb_systems ); if(nlCurrentContext->user_variable_buffers) { nlCurrentContext->variable_value = NULL; } else { nlCurrentContext->variable_value = NL_NEW_ARRAY( NLdouble, nlCurrentContext->nb_variables * nlCurrentContext->nb_systems ); for(k=0; k<nlCurrentContext->nb_systems; ++k) { nlCurrentContext->variable_buffer[k].base_address = nlCurrentContext->variable_value + k * nlCurrentContext->nb_variables; nlCurrentContext->variable_buffer[k].stride = sizeof(NLdouble); } } nlCurrentContext->variable_is_locked = NL_NEW_ARRAY( NLboolean, nlCurrentContext->nb_variables ); nlCurrentContext->variable_index = NL_NEW_ARRAY( NLuint, nlCurrentContext->nb_variables ); } static void nlEndSystem() { nlTransition(NL_STATE_MATRIX_CONSTRUCTED, NL_STATE_SYSTEM_CONSTRUCTED); } static void nlInitializeM() { NLuint i; NLuint n = 0; NLenum storage = NL_MATRIX_STORE_ROWS; for(i=0; i<nlCurrentContext->nb_variables; i++) { if(!nlCurrentContext->variable_is_locked[i]) { nlCurrentContext->variable_index[i] = n; n++; } else { nlCurrentContext->variable_index[i] = (NLuint)~0; } } nlCurrentContext->n = n; /* * If the user trusts OpenNL and has left the solver as NL_SOLVER_DEFAULT, * then we set up reasonable parameters for them.
*/ if(nlCurrentContext->solver == NL_SOLVER_DEFAULT) { if(nlCurrentContext->least_squares || nlCurrentContext->symmetric) { nlCurrentContext->solver = NL_CG; if(!nlCurrentContext->preconditioner_defined) { nlCurrentContext->preconditioner = NL_PRECOND_JACOBI; } } else { nlCurrentContext->solver = NL_BICGSTAB; } if(!nlCurrentContext->max_iterations_defined) { nlCurrentContext->max_iterations = n*5; } if(!nlCurrentContext->threshold_defined) { nlCurrentContext->threshold = 1e-6; } } /* SSOR preconditioner requires rows and columns */ if(nlCurrentContext->preconditioner == NL_PRECOND_SSOR) { storage = (storage | NL_MATRIX_STORE_COLUMNS); } /* a least squares problem results in a symmetric matrix */ if(nlCurrentContext->least_squares) { nlCurrentContext->symmetric = NL_TRUE; } if( nlCurrentContext->symmetric && nlCurrentContext->preconditioner == NL_PRECOND_SSOR ) { /* * For now, only used with the SSOR preconditioner, because * for other modes it is either unsupported (SUPERLU) or * causes a performance loss (non-parallel sparse SpMV) */ storage = (storage | NL_MATRIX_STORE_SYMMETRIC); } nlCurrentContext->M = (NLMatrix)(NL_NEW(NLSparseMatrix)); nlSparseMatrixConstruct( (NLSparseMatrix*)(nlCurrentContext->M), n, n, storage ); nlCurrentContext->x = NL_NEW_ARRAY( NLdouble, n*nlCurrentContext->nb_systems ); nlCurrentContext->b = NL_NEW_ARRAY( NLdouble, n*nlCurrentContext->nb_systems ); nlVariablesToVector(); nlRowColumnConstruct(&nlCurrentContext->af); nlRowColumnConstruct(&nlCurrentContext->al); nlCurrentContext->right_hand_side = NL_NEW_ARRAY( double, nlCurrentContext->nb_systems ); nlCurrentContext->current_row = 0; } static void nlEndMatrix() { nlTransition(NL_STATE_MATRIX, NL_STATE_MATRIX_CONSTRUCTED); nlRowColumnClear(&nlCurrentContext->af); nlRowColumnClear(&nlCurrentContext->al); if(!nlCurrentContext->least_squares) { nl_assert( nlCurrentContext->ij_coefficient_called || ( nlCurrentContext->current_row == nlCurrentContext->n ) ); } } static void nlBeginRow() { nlTransition(NL_STATE_MATRIX, NL_STATE_ROW); nlRowColumnZero(&nlCurrentContext->af); nlRowColumnZero(&nlCurrentContext->al); } static void nlScaleRow(NLdouble s) { NLRowColumn* af = &nlCurrentContext->af; NLRowColumn* al = &nlCurrentContext->al; NLuint nf = af->size; NLuint nl = al->size; NLuint i,k; for(i=0; i<nf; ++i) { af->coeff[i].value *= s; } for(i=0; i<nl; ++i) { al->coeff[i].value *= s; } for(k=0; k<nlCurrentContext->nb_systems; ++k) { nlCurrentContext->right_hand_side[k] *= s; } } static void nlNormalizeRow(NLdouble weight) { NLRowColumn* af = &nlCurrentContext->af; NLRowColumn* al = &nlCurrentContext->al; NLuint nf = af->size; NLuint nl = al->size; NLuint i; NLdouble norm = 0.0; for(i=0; i<nf; ++i) { norm += af->coeff[i].value * af->coeff[i].value; } for(i=0; i<nl; ++i) { norm += al->coeff[i].value * al->coeff[i].value; } norm = sqrt(norm); nlScaleRow(weight / norm); } static void nlEndRow() { NLRowColumn* af = &nlCurrentContext->af; NLRowColumn* al = &nlCurrentContext->al; NLSparseMatrix* M = nlGetCurrentSparseMatrix(); NLdouble* b = nlCurrentContext->b; NLuint nf = af->size; NLuint nl = al->size; NLuint n = nlCurrentContext->n; NLuint current_row = nlCurrentContext->current_row; NLuint i,j,jj; NLdouble S; NLuint k; nlTransition(NL_STATE_ROW, NL_STATE_MATRIX); if(nlCurrentContext->normalize_rows) { nlNormalizeRow(nlCurrentContext->row_scaling); } else if(nlCurrentContext->row_scaling != 1.0) { nlScaleRow(nlCurrentContext->row_scaling); } /* * if least_squares : we want to solve * A'A x = A'b */ if(nlCurrentContext->least_squares) { for(i=0; i<nf; ++i) { for(j=0; j<nf; ++j) { nlSparseMatrixAdd( M, af->coeff[i].index, af->coeff[j].index, af->coeff[i].value * af->coeff[j].value ); } }
for(k=0; k<nlCurrentContext->nb_systems; ++k) { S = -nlCurrentContext->right_hand_side[k]; for(jj=0; jj<nl; ++jj) { j = al->coeff[jj].index; S += al->coeff[jj].value * NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],j); } for(jj=0; jj<nf; ++jj) { b[ k*n+af->coeff[jj].index ] -= af->coeff[jj].value * S; } } } else { for(jj=0; jj<nf; ++jj) { nlSparseMatrixAdd( M, current_row, af->coeff[jj].index, af->coeff[jj].value ); } for(k=0; k<nlCurrentContext->nb_systems; ++k) { b[k*n+current_row] = nlCurrentContext->right_hand_side[k]; for(jj=0; jj<nl; ++jj) { j = al->coeff[jj].index; b[k*n+current_row] -= al->coeff[jj].value * NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],j); } } } nlCurrentContext->current_row++; for(k=0; k<nlCurrentContext->nb_systems; ++k) { nlCurrentContext->right_hand_side[k] = 0.0; } nlCurrentContext->row_scaling = 1.0; } void nlCoefficient(NLuint index, NLdouble value) { nlCheckState(NL_STATE_ROW); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); if(nlCurrentContext->variable_is_locked[index]) { /* * Note: in al, indices are NLvariable indices, * within [0..nb_variables-1] */ nlRowColumnAppend(&(nlCurrentContext->al), index, value); } else { /* * Note: in af, indices are system indices, * within [0..n-1] */ nlRowColumnAppend( &(nlCurrentContext->af), nlCurrentContext->variable_index[index], value ); } } void nlAddIJCoefficient(NLuint i, NLuint j, NLdouble value) { NLSparseMatrix* M = nlGetCurrentSparseMatrix(); nlCheckState(NL_STATE_MATRIX); nl_debug_range_assert(i, 0, nlCurrentContext->nb_variables - 1); nl_debug_range_assert(j, 0, nlCurrentContext->nb_variables - 1); #ifdef NL_DEBUG for(NLuint i=0; i<nlCurrentContext->nb_variables; ++i) { nl_debug_assert(!nlCurrentContext->variable_is_locked[i]); } #endif nlSparseMatrixAdd(M, i, j, value); nlCurrentContext->ij_coefficient_called = NL_TRUE; } void nlAddIRightHandSide(NLuint i, NLdouble value) { nlCheckState(NL_STATE_MATRIX); nl_debug_range_assert(i, 0, nlCurrentContext->nb_variables - 1); #ifdef NL_DEBUG for(NLuint i=0; i<nlCurrentContext->nb_variables; ++i) { nl_debug_assert(!nlCurrentContext->variable_is_locked[i]); } #endif nlCurrentContext->b[i] += value; nlCurrentContext->ij_coefficient_called = NL_TRUE; } void nlMultiAddIRightHandSide(NLuint i, NLuint k, NLdouble value) { NLuint n = nlCurrentContext->n; nlCheckState(NL_STATE_MATRIX); nl_debug_range_assert(i, 0, nlCurrentContext->nb_variables - 1); nl_debug_range_assert(k, 0, nlCurrentContext->nb_systems - 1); #ifdef NL_DEBUG for(NLuint i=0; i<nlCurrentContext->nb_variables; ++i) { nl_debug_assert(!nlCurrentContext->variable_is_locked[i]); } #endif nlCurrentContext->b[i + k*n] += value; nlCurrentContext->ij_coefficient_called = NL_TRUE; } void nlRightHandSide(NLdouble value) { nlCurrentContext->right_hand_side[0] = value; } void nlMultiRightHandSide(NLuint k, NLdouble value) { nl_debug_range_assert(k, 0, nlCurrentContext->nb_systems - 1); nlCurrentContext->right_hand_side[k] = value; } void nlRowScaling(NLdouble value) { nlCheckState(NL_STATE_MATRIX); nlCurrentContext->row_scaling = value; } void nlBegin(NLenum prim) { switch(prim) { case NL_SYSTEM: { nlBeginSystem(); } break; case NL_MATRIX: { nlTransition(NL_STATE_SYSTEM, NL_STATE_MATRIX); if( nlCurrentContext->matrix_mode == NL_STIFFNESS_MATRIX && nlCurrentContext->M == NULL ) { nlInitializeM(); } } break; case NL_ROW: { nlBeginRow(); } break; default: { nl_assert_not_reached; } } } void nlEnd(NLenum prim) { switch(prim) { case NL_SYSTEM: { nlEndSystem(); } break; case NL_MATRIX: { nlEndMatrix(); } break; case NL_ROW: { nlEndRow(); } break; default: { nl_assert_not_reached; } } } /* nlSolve() driver routine */ NLboolean nlSolve() { NLboolean result; nlCheckState(NL_STATE_SYSTEM_CONSTRUCTED);
nlCurrentContext->start_time = nlCurrentTime(); nlCurrentContext->elapsed_time = 0.0; nlCurrentContext->flops = 0; result = nlCurrentContext->solver_func(); nlVectorToVariables(); nlCurrentContext->elapsed_time = nlCurrentTime() - nlCurrentContext->start_time; nlTransition(NL_STATE_SYSTEM_CONSTRUCTED, NL_STATE_SOLVED); return result; } void nlUpdateRightHandSide(NLdouble* values) { /* * If we are in the solved state, get back to the * constructed state. */ nl_assert(nlCurrentContext->nb_systems == 1); if(nlCurrentContext->state == NL_STATE_SOLVED) { nlTransition(NL_STATE_SOLVED, NL_STATE_SYSTEM_CONSTRUCTED); } nlCheckState(NL_STATE_SYSTEM_CONSTRUCTED); memcpy(nlCurrentContext->x, values, nlCurrentContext->n * sizeof(double)); } /* Buffers management */ void nlBindBuffer( NLenum buffer, NLuint k, void* addr, NLuint stride ) { nlCheckState(NL_STATE_SYSTEM); nl_assert(nlIsEnabled(buffer)); nl_assert(buffer == NL_VARIABLES_BUFFER); nl_assert(knb_systems); if(stride == 0) { stride = sizeof(NLdouble); } nlCurrentContext->variable_buffer[k].base_address = addr; nlCurrentContext->variable_buffer[k].stride = stride; } /* Eigen solver */ void nlMatrixMode(NLenum matrix) { NLuint n = 0; NLuint i; nl_assert( nlCurrentContext->state == NL_STATE_SYSTEM || nlCurrentContext->state == NL_STATE_MATRIX_CONSTRUCTED ); nlCurrentContext->state = NL_STATE_SYSTEM; nlCurrentContext->matrix_mode = matrix; nlCurrentContext->current_row = 0; nlCurrentContext->ij_coefficient_called = NL_FALSE; switch(matrix) { case NL_STIFFNESS_MATRIX: { /* Stiffness matrix is already constructed. */ } break ; case NL_MASS_MATRIX: { if(nlCurrentContext->B == NULL) { for(i=0; inb_variables; ++i) { if(!nlCurrentContext->variable_is_locked[i]) { ++n; } } nlCurrentContext->B = (NLMatrix)(NL_NEW(NLSparseMatrix)); nlSparseMatrixConstruct( (NLSparseMatrix*)(nlCurrentContext->B), n, n, NL_MATRIX_STORE_ROWS ); } } break ; default: nl_assert_not_reached; } } void nlEigenSolverParameterd( NLenum pname, NLdouble val ) { switch(pname) { case NL_EIGEN_SHIFT: { nlCurrentContext->eigen_shift = val; } break; case NL_EIGEN_THRESHOLD: { nlSolverParameterd(pname, val); } break; default: nl_assert_not_reached; } } void nlEigenSolverParameteri( NLenum pname, NLint val ) { switch(pname) { case NL_EIGEN_SOLVER: { nlCurrentContext->eigen_solver = (NLenum)val; } break; case NL_SYMMETRIC: case NL_NB_VARIABLES: case NL_NB_EIGENS: case NL_EIGEN_MAX_ITERATIONS: { nlSolverParameteri(pname, val); } break; default: nl_assert_not_reached; } } void nlEigenSolve() { if(nlCurrentContext->eigen_value == NULL) { nlCurrentContext->eigen_value = NL_NEW_ARRAY( NLdouble,nlCurrentContext->nb_systems ); } nlMatrixCompress(&nlCurrentContext->M); if(nlCurrentContext->B != NULL) { nlMatrixCompress(&nlCurrentContext->B); } switch(nlCurrentContext->eigen_solver) { case NL_ARPACK_EXT: nlEigenSolve_ARPACK(); break; default: nl_assert_not_reached; } } double nlGetEigenValue(NLuint i) { nl_debug_assert(i < nlCurrentContext->nb_variables); return nlCurrentContext->eigen_value[i]; }
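/*
 * Illustrative usage sketch (added by the editor, not part of the OpenNL
 * distribution). It shows how the construction API defined above
 * (nlBegin/nlEnd, nlSetVariable, nlLockVariable, nlCoefficient,
 * nlRightHandSide, nlSolve, nlGetVariable) fits together for a tiny
 * least-squares system. nlNewContext(), nlDeleteContext() and the NLContext
 * type are assumed to be declared in OpenNL_psm.h; the NL_PSM_USAGE_EXAMPLE
 * guard is hypothetical and never defined by OpenNL, it only keeps this
 * example out of normal builds.
 */
#ifdef NL_PSM_USAGE_EXAMPLE
#include <stdio.h>
static void nl_psm_example_least_squares(void) {
    NLContext context = nlNewContext();       /* assumed public API */
    nlSolverParameteri(NL_NB_VARIABLES, 2);
    nlSolverParameteri(NL_LEAST_SQUARES, NL_TRUE);
    nlBegin(NL_SYSTEM);
    nlSetVariable(0, 1.0);                    /* give variable 0 a value... */
    nlLockVariable(0);                        /* ...and keep it fixed       */
    nlBegin(NL_MATRIX);
    nlBegin(NL_ROW);                          /* row 1: x0 + x1 = 1         */
    nlCoefficient(0, 1.0);
    nlCoefficient(1, 1.0);
    nlRightHandSide(1.0);
    nlEnd(NL_ROW);
    nlBegin(NL_ROW);                          /* row 2: x1 = 2              */
    nlCoefficient(1, 1.0);
    nlRightHandSide(2.0);
    nlEnd(NL_ROW);
    nlEnd(NL_MATRIX);
    nlEnd(NL_SYSTEM);
    nlSolve();
    printf("x0=%f (locked) x1=%f\n", nlGetVariable(0), nlGetVariable(1));
    nlDeleteContext(context);                 /* assumed public API */
}
#endif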
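/*
 * Eigen-solver usage sketch (added by the editor, not part of the OpenNL
 * distribution): a minimal, hedged example of driving nlEigenSolve(), and
 * thus the ARPACK driver nlEigenSolve_ARPACK() defined above, on a symmetric
 * 1D Laplacian assembled with nlAddIJCoefficient(). It assumes the ARPACK
 * and SUPERLU dynamic libraries are available at run time and that
 * NL_NB_EIGENS resolves as declared in OpenNL_psm.h; the guard macro is
 * hypothetical, as above.
 */
#ifdef NL_PSM_USAGE_EXAMPLE
#include <stdio.h>
static void nl_psm_example_eigen(int n, int nb_eigens) {
    int i, k;
    NLContext context;
    if(!nlInitExtension("ARPACK")) {          /* loads SUPERLU + ARPACK */
        return;
    }
    context = nlNewContext();                 /* assumed public API */
    nlSolverParameteri(NL_NB_VARIABLES, n);
    nlSolverParameteri(NL_SYMMETRIC, NL_TRUE);
    nlEigenSolverParameteri(NL_EIGEN_SOLVER, NL_ARPACK_EXT);
    nlEigenSolverParameteri(NL_NB_EIGENS, nb_eigens);
    nlEigenSolverParameterd(NL_EIGEN_SHIFT, 0.0);
    nlBegin(NL_SYSTEM);
    nlBegin(NL_MATRIX);
    for(i=0; i<n; ++i) {                      /* 1D Laplacian stencil */
        nlAddIJCoefficient((NLuint)i, (NLuint)i, 2.0);
        if(i > 0)     { nlAddIJCoefficient((NLuint)i, (NLuint)(i-1), -1.0); }
        if(i < n - 1) { nlAddIJCoefficient((NLuint)i, (NLuint)(i+1), -1.0); }
    }
    nlEnd(NL_MATRIX);
    nlEnd(NL_SYSTEM);
    nlEigenSolve();                           /* dispatches to ARPACK */
    for(k=0; k<nb_eigens; ++k) {
        /* eigenvector k components: nlMultiGetVariable((NLuint)i, (NLuint)k) */
        printf("lambda[%d] = %f\n", k, nlGetEigenValue((NLuint)k));
    }
    nlDeleteContext(context);
}
#endif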