#include "OpenNL_psm.h" /* * Copyright (c) 2004-2010, Bruno Levy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the ALICE Project-Team nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * If you modify this software, you should include a notice giving the * name of the person performing the modification, the date of modification, * and the reason for such modification. * * Contact: Bruno Levy * * levy@loria.fr * * ALICE Project * LORIA, INRIA Lorraine, * Campus Scientifique, BP 239 * 54506 VANDOEUVRE LES NANCY CEDEX * FRANCE * */ /* * This file is a PSM (pluggable software module) * generated from the distribution of Geogram. 
 *
 * See Geogram documentation on:
 * http://alice.loria.fr/software/geogram/doc/html/index.html
 *
 * See documentation of the functions bundled in this PSM on:
 * http://alice.loria.fr/software/geogram/doc/html/nl_8h.html
 */

/******* extracted from nl_private.h *******/

#ifndef OPENNL_PRIVATE_H
#define OPENNL_PRIVATE_H

/* standard headers used by this PSM (memory, I/O, string and math routines) */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#if defined(__APPLE__) && defined(__MACH__)
#define NL_OS_APPLE
#endif

#if defined(__linux__) || defined(__ANDROID__) || defined(NL_OS_APPLE)
#define NL_OS_UNIX
#endif

#if defined(WIN32) || defined(_WIN64)
#define NL_OS_WINDOWS
#endif

#define nl_arg_used(x) (void)x

#if defined(__clang__) || defined(__GNUC__)
#define NL_NORETURN __attribute__((noreturn))
#else
#define NL_NORETURN
#endif

#if defined(_MSC_VER)
#define NL_NORETURN_DECL __declspec(noreturn)
#else
#define NL_NORETURN_DECL
#endif

NL_NORETURN_DECL void nl_assertion_failed(
    const char* cond, const char* file, int line
) NL_NORETURN;

NL_NORETURN_DECL void nl_range_assertion_failed(
    double x, double min_val, double max_val, const char* file, int line
) NL_NORETURN;

NL_NORETURN_DECL void nl_should_not_have_reached(
    const char* file, int line
) NL_NORETURN;

#define nl_assert(x) {                                   \
    if(!(x)) {                                           \
        nl_assertion_failed(#x,__FILE__, __LINE__) ;     \
    }                                                    \
}

#define nl_range_assert(x,min_val,max_val) {             \
    if(((x) < (min_val)) || ((x) > (max_val))) {         \
        nl_range_assertion_failed(x, min_val, max_val,   \
            __FILE__, __LINE__                           \
        ) ;                                              \
    }                                                    \
}

#define nl_assert_not_reached {                          \
    nl_should_not_have_reached(__FILE__, __LINE__) ;     \
}

#ifdef NL_DEBUG
#define nl_debug_assert(x) nl_assert(x)
#define nl_debug_range_assert(x,min_val,max_val)         \
    nl_range_assert(x,min_val,max_val)
#else
#define nl_debug_assert(x)
#define nl_debug_range_assert(x,min_val,max_val)
#endif

#ifdef NL_PARANOID
#define nl_parano_assert(x) nl_assert(x)
#define nl_parano_range_assert(x,min_val,max_val)        \
    nl_range_assert(x,min_val,max_val)
#else
#define nl_parano_assert(x)
#define nl_parano_range_assert(x,min_val,max_val)
#endif

void nlError(const char* function, const char* message) ;
void nlWarning(const char* function, const char* message) ;

NLdouble nlCurrentTime(void);

typedef void* NLdll;

#define NL_LINK_NOW          1
#define NL_LINK_LAZY         2
#define NL_LINK_GLOBAL       4
#define NL_LINK_QUIET        8
#define NL_LINK_USE_FALLBACK 16

NLdll nlOpenDLL(const char* filename, NLenum flags);
void nlCloseDLL(NLdll handle);
NLfunc nlFindFunction(NLdll handle, const char* funcname);

/* classic macros */

#ifndef MIN
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
#endif

#ifndef MAX
#define MAX(x,y) (((x) > (y)) ?
(x) : (y)) #endif #define NL_NEW(T) (T*)(calloc(1, sizeof(T))) #define NL_NEW_ARRAY(T,NB) (T*)(calloc((size_t)(NB),sizeof(T))) #define NL_RENEW_ARRAY(T,x,NB) (T*)(realloc(x,(size_t)(NB)*sizeof(T))) #define NL_DELETE(x) free(x); x = NULL #define NL_DELETE_ARRAY(x) free(x); x = NULL #define NL_CLEAR(T, x) memset(x, 0, sizeof(T)) #define NL_CLEAR_ARRAY(T,x,NB) memset(x, 0, (size_t)(NB)*sizeof(T)) #define NL_UINT_MAX 0xffffffff #define NL_USHORT_MAX 0xffff extern NLprintfFunc nl_printf; extern NLfprintfFunc nl_fprintf; #endif /******* extracted from nl_blas.h *******/ #ifndef OPENNL_BLAS_H #define OPENNL_BLAS_H struct NLBlas; typedef struct NLBlas* NLBlas_t; typedef enum { NoTranspose=0, Transpose=1, ConjugateTranspose=2 } MatrixTranspose ; typedef enum { UpperTriangle=0, LowerTriangle=1 } MatrixTriangle ; typedef enum { UnitTriangular=0, NotUnitTriangular=1 } MatrixUnitTriangular ; typedef enum { NL_HOST_MEMORY, NL_DEVICE_MEMORY } NLmemoryType; typedef void* (*FUNPTR_malloc)( NLBlas_t blas, NLmemoryType type, size_t size ); typedef void (*FUNPTR_free)( NLBlas_t blas, NLmemoryType type, size_t size, void* ptr ); typedef void (*FUNPTR_memcpy)( NLBlas_t blas, void* to, NLmemoryType to_type, void* from, NLmemoryType from_type, size_t size ); typedef void (*FUNPTR_dcopy)( NLBlas_t blas, int n, const double *x, int incx, double *y, int incy ); typedef void (*FUNPTR_dscal)( NLBlas_t blas, int n, double a, double *x, int incx ); typedef double (*FUNPTR_ddot)( NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy ); typedef double (*FUNPTR_dnrm2)(NLBlas_t blas, int n, const double *x, int incx); typedef void (*FUNPTR_daxpy)( NLBlas_t blas, int n, double a, const double *x, int incx, double *y, int incy ); typedef void (*FUNPTR_dgemv)( NLBlas_t blas, MatrixTranspose trans, int m, int n, double alpha, const double *A, int ldA, const double *x, int incx, double beta, double *y, int incy ); typedef void (*FUNPTR_dtpsv)( NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans, MatrixUnitTriangular diag, int n, const double *AP, double *x, int incx ); struct NLBlas { FUNPTR_malloc Malloc; FUNPTR_free Free; FUNPTR_memcpy Memcpy; FUNPTR_dcopy Dcopy; FUNPTR_dscal Dscal; FUNPTR_ddot Ddot; FUNPTR_dnrm2 Dnrm2; FUNPTR_daxpy Daxpy; FUNPTR_dgemv Dgemv; FUNPTR_dtpsv Dtpsv; NLboolean has_unified_memory; double start_time; NLulong flops; NLulong used_ram[2]; NLulong max_used_ram[2]; /* * Used for stats of the linear solver * (a bit ugly, should not be here, but * more convenient for now...) 
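 *
 * sq_bnorm holds the squared norm of the right-hand side and sq_rnorm
 * the squared norm of the final residual, filled in by the iterative
 * solver so that the relative error sqrt(sq_rnorm/sq_bnorm) can be
 * reported without an extra matrix-vector product.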
*/ double sq_rnorm; double sq_bnorm; }; NLboolean nlBlasHasUnifiedMemory(NLBlas_t blas); void nlBlasResetStats(NLBlas_t blas); double nlBlasGFlops(NLBlas_t blas); NLulong nlBlasUsedRam(NLBlas_t blas, NLmemoryType type); NLulong nlBlasMaxUsedRam(NLBlas_t blas, NLmemoryType type); NLBlas_t nlHostBlas(void); #define NL_NEW_VECTOR(blas, memtype, dim) \ (double*)blas->Malloc(blas,memtype,(size_t)(dim)*sizeof(double)) #define NL_DELETE_VECTOR(blas, memtype, dim, ptr) \ blas->Free(blas,memtype,(size_t)(dim)*sizeof(double),ptr) #endif /******* extracted from nl_matrix.h *******/ #ifndef OPENNL_MATRIX_H #define OPENNL_MATRIX_H #ifdef __cplusplus extern "C" { #endif /* Abstract matrix interface */ struct NLMatrixStruct; typedef struct NLMatrixStruct* NLMatrix; typedef void(*NLDestroyMatrixFunc)(NLMatrix M); typedef void(*NLMultMatrixVectorFunc)(NLMatrix M, const double* x, double* y); #define NL_MATRIX_SPARSE_DYNAMIC 0x1001 #define NL_MATRIX_CRS 0x1002 #define NL_MATRIX_SUPERLU_EXT 0x1003 #define NL_MATRIX_CHOLMOD_EXT 0x1004 #define NL_MATRIX_FUNCTION 0x1005 #define NL_MATRIX_OTHER 0x1006 struct NLMatrixStruct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; }; NLAPI void NLAPIENTRY nlDeleteMatrix(NLMatrix M); NLAPI void NLAPIENTRY nlMultMatrixVector( NLMatrix M, const double* x, double* y ); /* Dynamic arrays for sparse row/columns */ typedef struct { NLuint index; NLdouble value; } NLCoeff; typedef struct { NLuint size; NLuint capacity; NLCoeff* coeff; } NLRowColumn; NLAPI void NLAPIENTRY nlRowColumnConstruct(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnDestroy(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnGrow(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnAdd( NLRowColumn* c, NLuint index, NLdouble value ); NLAPI void NLAPIENTRY nlRowColumnAppend( NLRowColumn* c, NLuint index, NLdouble value ); NLAPI void NLAPIENTRY nlRowColumnZero(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnClear(NLRowColumn* c); NLAPI void NLAPIENTRY nlRowColumnSort(NLRowColumn* c); /* Compressed Row Storage */ typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLdouble* val; NLuint* rowptr; NLuint* colind; NLuint nslices; NLuint* sliceptr; NLboolean symmetric_storage; } NLCRSMatrix; NLAPI void NLAPIENTRY nlCRSMatrixConstruct( NLCRSMatrix* M, NLuint m, NLuint n, NLuint nnz, NLuint nslices ); NLAPI void NLAPIENTRY nlCRSMatrixConstructSymmetric( NLCRSMatrix* M, NLuint n, NLuint nnz ); NLAPI NLboolean NLAPIENTRY nlCRSMatrixLoad( NLCRSMatrix* M, const char* filename ); NLAPI NLboolean NLAPIENTRY nlCRSMatrixSave( NLCRSMatrix* M, const char* filename ); NLAPI NLuint NLAPIENTRY nlCRSMatrixNNZ(NLCRSMatrix* M); /* SparseMatrix data structure */ #define NL_MATRIX_STORE_ROWS 1 #define NL_MATRIX_STORE_COLUMNS 2 #define NL_MATRIX_STORE_SYMMETRIC 4 typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLuint diag_size; NLuint diag_capacity; NLenum storage; NLRowColumn* row; NLRowColumn* column; NLdouble* diag; NLuint row_capacity; NLuint column_capacity; } NLSparseMatrix; NLAPI NLMatrix NLAPIENTRY nlSparseMatrixNew( NLuint m, NLuint n, NLenum storage ); NLAPI void NLAPIENTRY nlSparseMatrixConstruct( NLSparseMatrix* M, NLuint m, NLuint n, NLenum storage ); NLAPI void NLAPIENTRY nlSparseMatrixDestroy(NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixMult( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ); NLAPI void NLAPIENTRY 
nlSparseMatrixAdd( NLSparseMatrix* M, NLuint i, NLuint j, NLdouble value ); NLAPI void NLAPIENTRY nlSparseMatrixAddMatrix( NLSparseMatrix* M, double mul, const NLMatrix N ); NLAPI void NLAPIENTRY nlSparseMatrixZero( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixClear( NLSparseMatrix* M); NLAPI NLuint NLAPIENTRY nlSparseMatrixNNZ( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixSort( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixAddRow( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixAddColumn( NLSparseMatrix* M); NLAPI void NLAPIENTRY nlSparseMatrixMAddRow( NLSparseMatrix* M, NLuint i1, double s, NLuint i2 ); NLAPI void NLAPIENTRY nlSparseMatrixScaleRow( NLSparseMatrix* M, NLuint i, double s ); NLAPI void NLAPIENTRY nlSparseMatrixZeroRow( NLSparseMatrix* M, NLuint i ); NLAPI NLMatrix NLAPIENTRY nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix* M); NLAPI NLMatrix NLAPIENTRY nlCRSMatrixNewFromSparseMatrixSymmetric( NLSparseMatrix* M ); NLAPI void NLAPIENTRY nlMatrixCompress(NLMatrix* M); NLAPI NLuint NLAPIENTRY nlMatrixNNZ(NLMatrix M); NLAPI NLMatrix NLAPIENTRY nlMatrixFactorize(NLMatrix M, NLenum solver); typedef void(*NLMatrixFunc)(const double* x, double* y); NLAPI NLMatrix NLAPIENTRY nlMatrixNewFromFunction( NLuint m, NLuint n, NLMatrixFunc func ); NLAPI NLMatrixFunc NLAPIENTRY nlMatrixGetFunction(NLMatrix M); NLAPI NLMatrix NLAPIENTRY nlMatrixNewFromProduct( NLMatrix M, NLboolean product_owns_M, NLMatrix N, NLboolean product_owns_N ); #ifdef __cplusplus } #endif #endif /******* extracted from nl_context.h *******/ #ifndef OPENNL_CONTEXT_H #define OPENNL_CONTEXT_H /* NLContext data structure */ typedef NLboolean(*NLSolverFunc)(void); typedef void(*NLProgressFunc)( NLuint cur_iter, NLuint max_iter, double cur_err, double max_err ); #define NL_STATE_INITIAL 0 #define NL_STATE_SYSTEM 1 #define NL_STATE_MATRIX 2 #define NL_STATE_ROW 3 #define NL_STATE_MATRIX_CONSTRUCTED 4 #define NL_STATE_SYSTEM_CONSTRUCTED 5 #define NL_STATE_SOLVED 6 typedef struct { void* base_address; NLuint stride; } NLBufferBinding; #define NL_BUFFER_ITEM(B,i) \ *(double*)((void*)((char*)((B).base_address)+((i)*(B).stride))) typedef struct { NLenum state; NLboolean user_variable_buffers; NLBufferBinding* variable_buffer; NLdouble* variable_value; NLboolean* variable_is_locked; NLuint* variable_index; NLuint n; NLenum matrix_mode; NLMatrix M; NLMatrix P; NLMatrix B; NLRowColumn af; NLRowColumn al; NLdouble* x; NLdouble* b; NLdouble* right_hand_side; NLdouble row_scaling; NLenum solver; NLenum preconditioner; NLboolean preconditioner_defined; NLuint nb_variables; NLuint nb_systems; NLboolean ij_coefficient_called; NLuint current_row; NLboolean least_squares; NLboolean symmetric; NLuint max_iterations; NLboolean max_iterations_defined; NLuint inner_iterations; NLdouble threshold; NLboolean threshold_defined; NLdouble omega; NLboolean normalize_rows; NLuint used_iterations; NLdouble error; NLdouble start_time; NLdouble elapsed_time; NLSolverFunc solver_func; NLProgressFunc progress_func; NLboolean verbose; NLulong flops; NLenum eigen_solver; NLdouble eigen_shift; NLboolean eigen_shift_invert; NLdouble* eigen_value; NLdouble* temp_eigen_value; } NLContextStruct; extern NLContextStruct* nlCurrentContext; void nlCheckState(NLenum state); void nlTransition(NLenum from_state, NLenum to_state); NLboolean nlDefaultSolver(void); #endif /******* extracted from nl_iterative_solvers.h *******/ #ifndef OPENNL_ITERATIVE_SOLVERS_H #define OPENNL_ITERATIVE_SOLVERS_H NLAPI NLuint NLAPIENTRY 
nlSolveSystemIterative(
    NLBlas_t blas,
    NLMatrix M, NLMatrix P, NLdouble* b, NLdouble* x,
    NLenum solver, double eps, NLuint max_iter, NLuint inner_iter
);

#endif

/******* extracted from nl_preconditioners.h *******/

#ifndef OPENNL_PRECONDITIONERS_H
#define OPENNL_PRECONDITIONERS_H

/* preconditioners */

NLMatrix nlNewJacobiPreconditioner(NLMatrix M);
NLMatrix nlNewSSORPreconditioner(NLMatrix M, double omega);

#endif

/******* extracted from nl_superlu.h *******/

#ifndef OPENNL_SUPERLU_H
#define OPENNL_SUPERLU_H

NLAPI NLMatrix NLAPIENTRY nlMatrixFactorize_SUPERLU(
    NLMatrix M, NLenum solver
);

NLboolean nlInitExtension_SUPERLU(void);
NLboolean nlExtensionIsInitialized_SUPERLU(void);

#endif

/******* extracted from nl_cholmod.h *******/

#ifndef OPENNL_CHOLMOD_H
#define OPENNL_CHOLMOD_H

NLAPI NLMatrix NLAPIENTRY nlMatrixFactorize_CHOLMOD(
    NLMatrix M, NLenum solver
);

NLboolean nlInitExtension_CHOLMOD(void);
NLboolean nlExtensionIsInitialized_CHOLMOD(void);

#endif

/******* extracted from nl_arpack.h *******/

#ifndef OPENNL_ARPACK_H
#define OPENNL_ARPACK_H

NLboolean nlInitExtension_ARPACK(void);
NLboolean nlExtensionIsInitialized_ARPACK(void);
void nlEigenSolve_ARPACK(void);

#endif

/******* extracted from nl_mkl.h *******/

#ifndef OPENNL_MKL_H
#define OPENNL_MKL_H

NLboolean nlInitExtension_MKL(void);
NLboolean nlExtensionIsInitialized_MKL(void);
extern NLMultMatrixVectorFunc NLMultMatrixVector_MKL;

#endif

/******* extracted from nl_cuda.h *******/

#ifndef OPENNL_CUDA_EXT_H
#define OPENNL_CUDA_EXT_H

NLboolean nlInitExtension_CUDA(void);
NLboolean nlExtensionIsInitialized_CUDA(void);
NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M);
NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M);
NLBlas_t nlCUDABlas(void);

#endif

/******* extracted from nl_os.c *******/

#if (defined (WIN32) || defined(_WIN64))
#include <windows.h>   /* GetTickCount(), LoadLibrary(), GetProcAddress() */
#else
#include <sys/types.h> /* clock_t */
#include <sys/times.h> /* times(), struct tms */
#endif

#if defined(GEO_DYNAMIC_LIBS) && defined(NL_OS_UNIX)
#include <dlfcn.h>     /* dlopen(), dlsym(), dlclose(), dlerror() */
#endif

/* Assertions */

void nl_assertion_failed(const char* cond, const char* file, int line) {
    nl_fprintf(
        stderr,
        "OpenNL assertion failed: %s, file:%s, line:%d\n",
        cond,file,line
    ) ;
    abort() ;
}

void nl_range_assertion_failed(
    double x, double min_val, double max_val, const char* file, int line
) {
    nl_fprintf(
        stderr,
        "OpenNL range assertion failed: "
        "%f in [ %f ...
%f ], file:%s, line:%d\n", x, min_val, max_val, file,line ) ; abort() ; } void nl_should_not_have_reached(const char* file, int line) { nl_fprintf( stderr, "OpenNL should not have reached this point: file:%s, line:%d\n", file,line ) ; abort() ; } /* Timing */ #ifdef WIN32 NLdouble nlCurrentTime() { return (NLdouble)GetTickCount() / 1000.0 ; } #else double nlCurrentTime() { clock_t user_clock ; struct tms user_tms ; user_clock = times(&user_tms) ; return (NLdouble)user_clock / 100.0 ; } #endif /* DLLs/shared objects/dylibs */ #if defined(GEO_DYNAMIC_LIBS) # if defined(NL_OS_UNIX) NLdll nlOpenDLL(const char* name, NLenum flags_in) { void* result = NULL; int flags = 0; if((flags_in & NL_LINK_NOW) != 0) { flags |= RTLD_NOW; } if((flags_in & NL_LINK_LAZY) != 0) { flags |= RTLD_LAZY; } if((flags_in & NL_LINK_GLOBAL) != 0) { flags |= RTLD_GLOBAL; } if((flags_in & NL_LINK_QUIET) == 0) { nl_fprintf(stdout,"Trying to load %s\n", name); } result = dlopen(name, flags); if(result == NULL) { if((flags_in & NL_LINK_QUIET) == 0) { nl_fprintf(stderr,"Did not find %s,\n", name); nl_fprintf(stderr,"Retrying with libgeogram_num_3rdparty.so\n"); } if((flags_in & NL_LINK_USE_FALLBACK) != 0) { result=dlopen("libgeogram_num_3rdparty.so", flags); if(result == NULL) { if((flags_in & NL_LINK_QUIET) == 0) { nlError("nlOpenDLL/dlopen",dlerror()); } } } } if((flags_in & NL_LINK_QUIET) == 0 && result != NULL) { nl_fprintf(stdout,"Loaded %s\n", name); } return result; } void nlCloseDLL(void* handle) { dlclose(handle); } NLfunc nlFindFunction(void* handle, const char* name) { /* * It is not legal in modern C to cast a void* * pointer into a function pointer, thus requiring this * (quite dirty) function that uses a union. */ union { void* ptr; NLfunc fptr; } u; u.ptr = dlsym(handle, name); return u.fptr; } # elif defined(NL_OS_WINDOWS) NLdll nlOpenDLL(const char* name, NLenum flags) { /* Note: NL_LINK_LAZY and NL_LINK_GLOBAL are ignored. 
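       On Windows, LoadLibrary() always resolves the DLL's imports at load
       time and GetProcAddress() only searches the given module, so there is
       no direct equivalent of RTLD_LAZY / RTLD_GLOBAL to map these flags to.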
*/ void* result = LoadLibrary(name); if(result == NULL && ((flags & NL_LINK_USE_FALLBACK) != 0)) { if((flags & NL_LINK_QUIET) == 0) { nl_fprintf(stderr,"Did not find %s,\n", name); nl_fprintf(stderr,"Retrying with geogram_num_3rdparty\n"); } result=LoadLibrary("geogram_num_3rdparty.dll"); } return result; } void nlCloseDLL(void* handle) { FreeLibrary((HMODULE)handle); } NLfunc nlFindFunction(void* handle, const char* name) { return (NLfunc)GetProcAddress((HMODULE)handle, name); } # endif #else NLdll nlOpenDLL(const char* name, NLenum flags) { nl_arg_used(name); nl_arg_used(flags); #ifdef NL_OS_UNIX nlError("nlOpenDLL","Was not compiled with dynamic linking enabled"); nlError("nlOpenDLL","(see VORPALINE_BUILD_DYNAMIC in CMakeLists.txt)"); #else nlError("nlOpenDLL","Not implemented"); #endif return NULL; } void nlCloseDLL(void* handle) { nl_arg_used(handle); nlError("nlCloseDLL","Not implemented"); } NLfunc nlFindFunction(void* handle, const char* name) { nl_arg_used(handle); nl_arg_used(name); nlError("nlFindFunction","Not implemented"); return NULL; } #endif /* Error-reporting functions */ NLprintfFunc nl_printf = printf; NLfprintfFunc nl_fprintf = fprintf; void nlError(const char* function, const char* message) { nl_fprintf(stderr, "OpenNL error in %s(): %s\n", function, message) ; } void nlWarning(const char* function, const char* message) { nl_fprintf(stderr, "OpenNL warning in %s(): %s\n", function, message) ; } void nlPrintfFuncs(NLprintfFunc f1, NLfprintfFunc f2) { nl_printf = f1; nl_fprintf = f2; } /******* extracted from nl_matrix.c *******/ /* Some warnings about const cast in callback for qsort() function. */ #ifdef __clang__ #pragma GCC diagnostic ignored "-Wcast-qual" #endif void nlDeleteMatrix(NLMatrix M) { if(M == NULL) { return; } M->destroy_func(M); NL_DELETE(M); } void nlMultMatrixVector( NLMatrix M, const double* x, double* y ) { M->mult_func(M,x,y); } void nlRowColumnConstruct(NLRowColumn* c) { c->size = 0; c->capacity = 0; c->coeff = NULL; } void nlRowColumnDestroy(NLRowColumn* c) { NL_DELETE_ARRAY(c->coeff); c->size = 0; c->capacity = 0; } void nlRowColumnGrow(NLRowColumn* c) { if(c->capacity != 0) { c->capacity = 2 * c->capacity; c->coeff = NL_RENEW_ARRAY(NLCoeff, c->coeff, c->capacity); } else { c->capacity = 4; c->coeff = NL_NEW_ARRAY(NLCoeff, c->capacity); } } void nlRowColumnAdd(NLRowColumn* c, NLuint index, NLdouble value) { NLuint i; for(i=0; isize; i++) { if(c->coeff[i].index == index) { c->coeff[i].value += value; return; } } if(c->size == c->capacity) { nlRowColumnGrow(c); } c->coeff[c->size].index = index; c->coeff[c->size].value = value; c->size++; } /* Does not check whether the index already exists */ void nlRowColumnAppend(NLRowColumn* c, NLuint index, NLdouble value) { if(c->size == c->capacity) { nlRowColumnGrow(c); } c->coeff[c->size].index = index; c->coeff[c->size].value = value; c->size++; } void nlRowColumnZero(NLRowColumn* c) { c->size = 0; } void nlRowColumnClear(NLRowColumn* c) { c->size = 0; c->capacity = 0; NL_DELETE_ARRAY(c->coeff); } static int nlCoeffCompare(const void* p1, const void* p2) { return (((NLCoeff*)(p2))->index < ((NLCoeff*)(p1))->index); } void nlRowColumnSort(NLRowColumn* c) { qsort(c->coeff, c->size, sizeof(NLCoeff), nlCoeffCompare); } /* CRSMatrix data structure */ static void nlCRSMatrixDestroy(NLCRSMatrix* M) { NL_DELETE_ARRAY(M->val); NL_DELETE_ARRAY(M->rowptr); NL_DELETE_ARRAY(M->colind); NL_DELETE_ARRAY(M->sliceptr); M->m = 0; M->n = 0; M->nslices = 0; } NLboolean nlCRSMatrixSave(NLCRSMatrix* M, const char* filename) { 
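    /*
     * Binary dump of the CRS arrays, in the order expected by
     * nlCRSMatrixLoad() below: m, n and nnz as NLuint, then rowptr
     * (m+1 NLuint), colind (nnz NLuint) and val (nnz double).
     * Raw fwrite() is used, so the file is only portable between
     * machines with the same endianness and type sizes.
     */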
    NLuint nnz = M->rowptr[M->m];
    FILE* f = fopen(filename, "wb");
    if(f == NULL) {
        nlError("nlCRSMatrixSave", "Could not open file");
        return NL_FALSE;
    }
    fwrite(&M->m, sizeof(NLuint), 1, f);
    fwrite(&M->n, sizeof(NLuint), 1, f);
    fwrite(&nnz, sizeof(NLuint), 1, f);
    fwrite(M->rowptr, sizeof(NLuint), M->m+1, f);
    fwrite(M->colind, sizeof(NLuint), nnz, f);
    fwrite(M->val, sizeof(double), nnz, f);
    fclose(f);
    return NL_TRUE;
}

NLboolean nlCRSMatrixLoad(NLCRSMatrix* M, const char* filename) {
    NLuint nnz = 0;
    FILE* f = fopen(filename, "rb");
    NLboolean truncated = NL_FALSE;
    if(f == NULL) {
        nlError("nlCRSMatrixLoad", "Could not open file");
        return NL_FALSE;
    }
    truncated = truncated || (
        fread(&M->m, sizeof(NLuint), 1, f) != 1 ||
        fread(&M->n, sizeof(NLuint), 1, f) != 1 ||
        fread(&nnz, sizeof(NLuint), 1, f) != 1
    );
    if(truncated) {
        M->rowptr = NULL;
        M->colind = NULL;
        M->val = NULL;
    } else {
        M->rowptr = NL_NEW_ARRAY(NLuint, M->m+1);
        M->colind = NL_NEW_ARRAY(NLuint, nnz);
        M->val = NL_NEW_ARRAY(double, nnz);
        truncated = truncated || (
            fread(M->rowptr, sizeof(NLuint), M->m+1, f) != M->m+1 ||
            fread(M->colind, sizeof(NLuint), nnz, f) != nnz ||
            fread(M->val, sizeof(double), nnz, f) != nnz
        );
    }
    if(truncated) {
        nlError("nlCRSMatrixLoad", "File appears to be truncated");
        NL_DELETE_ARRAY(M->rowptr);
        NL_DELETE_ARRAY(M->colind);
        NL_DELETE_ARRAY(M->val);
        fclose(f);
        return NL_FALSE;
    } else {
        M->nslices = 1;
        M->sliceptr = NL_NEW_ARRAY(NLuint, M->nslices+1);
        M->sliceptr[0] = 0;
        M->sliceptr[1] = M->m;
    }
    fclose(f);
    return NL_TRUE;
}

NLuint nlCRSMatrixNNZ(NLCRSMatrix* M) {
    return M->rowptr[M->m];
}

static void nlCRSMatrixMultSlice(
    NLCRSMatrix* M, const double* x, double* y,
    NLuint Ibegin, NLuint Iend
) {
    NLuint i,j;
    for(i=Ibegin; i<Iend; ++i) {
        NLdouble sum = 0.0;
        for(j=M->rowptr[i]; j<M->rowptr[i+1]; ++j) {
            sum += M->val[j] * x[M->colind[j]];
        }
        y[i] = sum;
    }
}

static void nlCRSMatrixMult(
    NLCRSMatrix* M, const double* x, double* y
) {
    int slice;
    int nslices = (int)(M->nslices);
    NLuint i,j,jj;
    NLdouble a;
    if(M->symmetric_storage) {
        for(i=0; i<M->m; ++i) {
            y[i] = 0.0;
        }
        for(i=0; i<M->m; ++i) {
            for(jj=M->rowptr[i]; jj<M->rowptr[i+1]; ++jj) {
                a = M->val[jj];
                j = M->colind[jj];
                y[i] += a * x[j];
                if(j != i) {
                    y[j] += a * x[i];
                }
            }
        }
    } else {
#if defined(_OPENMP)
#pragma omp parallel for private(slice)
#endif
        for(slice=0; slice<nslices; ++slice) {
            nlCRSMatrixMultSlice(
                M,x,y,M->sliceptr[slice],M->sliceptr[slice+1]
            );
        }
    }
    nlHostBlas()->flops += (NLulong)(2*nlCRSMatrixNNZ(M));
}

void nlCRSMatrixConstruct(
    NLCRSMatrix* M, NLuint m, NLuint n, NLuint nnz, NLuint nslices
) {
    M->m = m;
    M->n = n;
    M->type = NL_MATRIX_CRS;
    M->destroy_func = (NLDestroyMatrixFunc)nlCRSMatrixDestroy;
    if(NLMultMatrixVector_MKL != NULL) {
        M->mult_func = (NLMultMatrixVectorFunc)NLMultMatrixVector_MKL;
    } else {
        M->mult_func = (NLMultMatrixVectorFunc)nlCRSMatrixMult;
    }
    M->nslices = nslices;
    M->val = NL_NEW_ARRAY(double, nnz);
    M->rowptr = NL_NEW_ARRAY(NLuint, m+1);
    M->colind = NL_NEW_ARRAY(NLuint, nnz);
    M->sliceptr = NL_NEW_ARRAY(NLuint, nslices+1);
    M->symmetric_storage = NL_FALSE;
}

void nlCRSMatrixConstructSymmetric(
    NLCRSMatrix* M, NLuint n, NLuint nnz
) {
    M->m = n;
    M->n = n;
    M->type = NL_MATRIX_CRS;
    M->destroy_func = (NLDestroyMatrixFunc)nlCRSMatrixDestroy;
    M->mult_func = (NLMultMatrixVectorFunc)nlCRSMatrixMult;
    M->nslices = 0;
    M->val = NL_NEW_ARRAY(double, nnz);
    M->rowptr = NL_NEW_ARRAY(NLuint, n+1);
    M->colind = NL_NEW_ARRAY(NLuint, nnz);
    M->sliceptr = NULL;
    M->symmetric_storage = NL_TRUE;
}

/* SparseMatrix data structure */

static void nlSparseMatrixDestroyRowColumns(NLSparseMatrix* M) {
    NLuint i;
    if(M->storage & NL_MATRIX_STORE_ROWS) {
        for(i=0; i<M->m; i++) {
            nlRowColumnDestroy(&(M->row[i]));
        }
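        /*
         * The individual row containers are destroyed by the loop above;
         * the array of rows itself is released next and the ROWS bit is
         * then cleared from the storage flags (columns are handled the
         * same way just below).
         */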
NL_DELETE_ARRAY(M->row); } M->storage = (NLenum)((int)(M->storage) & ~NL_MATRIX_STORE_ROWS); if(M->storage & NL_MATRIX_STORE_COLUMNS) { for(i=0; in; i++) { nlRowColumnDestroy(&(M->column[i])); } NL_DELETE_ARRAY(M->column); } M->storage = (NLenum)((int)(M->storage) & ~NL_MATRIX_STORE_COLUMNS); } void nlSparseMatrixDestroy(NLSparseMatrix* M) { nl_assert(M->type == NL_MATRIX_SPARSE_DYNAMIC); nlSparseMatrixDestroyRowColumns(M); NL_DELETE_ARRAY(M->diag); #ifdef NL_PARANOID NL_CLEAR(NLSparseMatrix,M); #endif } void nlSparseMatrixAdd(NLSparseMatrix* M, NLuint i, NLuint j, NLdouble value) { nl_parano_range_assert(i, 0, M->m - 1); nl_parano_range_assert(j, 0, M->n - 1); if((M->storage & NL_MATRIX_STORE_SYMMETRIC) && (j > i)) { return; } if(i == j) { M->diag[i] += value; } if(M->storage & NL_MATRIX_STORE_ROWS) { nlRowColumnAdd(&(M->row[i]), j, value); } if(M->storage & NL_MATRIX_STORE_COLUMNS) { nlRowColumnAdd(&(M->column[j]), i, value); } } static void nlSparseMatrixAddSparseMatrix( NLSparseMatrix* M, double mul, const NLSparseMatrix* N ) { NLuint i,j,ii,jj; nl_assert(M->m == N->m); nl_assert(M->n == N->n); if(N->storage & NL_MATRIX_STORE_SYMMETRIC) { nl_assert(M->storage & NL_MATRIX_STORE_SYMMETRIC); } if(N->storage & NL_MATRIX_STORE_ROWS) { for(i=0; im; ++i) { for(jj=0; jjrow[i].size; ++jj) { nlSparseMatrixAdd( M, i, N->row[i].coeff[jj].index, mul*N->row[i].coeff[jj].value ); } } } else { nl_assert(N->storage & NL_MATRIX_STORE_COLUMNS); for(j=0; jn; ++j) { for(ii=0; iicolumn[j].size; ++ii) { nlSparseMatrixAdd( M, N->column[j].coeff[ii].index, j, mul*N->column[j].coeff[ii].value ); } } } } static void nlSparseMatrixAddCRSMatrix( NLSparseMatrix* M, double mul, const NLCRSMatrix* N ) { NLuint i,jj; nl_assert(M->m == N->m); nl_assert(M->n == N->n); for(i=0; im; ++i) { for(jj=N->rowptr[i]; jjrowptr[i+1]; ++jj) { nlSparseMatrixAdd( M, i, N->colind[jj], mul*N->val[jj] ); } } } void nlSparseMatrixAddMatrix( NLSparseMatrix* M, double mul, const NLMatrix N ) { nl_assert(M->m == N->m); nl_assert(M->n == N->n); if(N->type == NL_MATRIX_SPARSE_DYNAMIC) { nlSparseMatrixAddSparseMatrix(M, mul, (const NLSparseMatrix*)N); } else if(N->type == NL_MATRIX_CRS) { nlSparseMatrixAddCRSMatrix(M, mul, (const NLCRSMatrix*)N); } else { nl_assert_not_reached; } } void nlSparseMatrixZero( NLSparseMatrix* M) { NLuint i; if(M->storage & NL_MATRIX_STORE_ROWS) { for(i=0; im; i++) { nlRowColumnZero(&(M->row[i])); } } if(M->storage & NL_MATRIX_STORE_COLUMNS) { for(i=0; in; i++) { nlRowColumnZero(&(M->column[i])); } } NL_CLEAR_ARRAY(NLdouble, M->diag, M->diag_size); } void nlSparseMatrixClear( NLSparseMatrix* M) { NLuint i; if(M->storage & NL_MATRIX_STORE_ROWS) { for(i=0; im; i++) { nlRowColumnClear(&(M->row[i])); } } if(M->storage & NL_MATRIX_STORE_COLUMNS) { for(i=0; in; i++) { nlRowColumnClear(&(M->column[i])); } } NL_CLEAR_ARRAY(NLdouble, M->diag, M->diag_size); } /* Returns the number of non-zero coefficients */ NLuint nlSparseMatrixNNZ( NLSparseMatrix* M) { NLuint nnz = 0; NLuint i; if(M->storage & NL_MATRIX_STORE_ROWS) { for(i = 0; im; i++) { nnz += M->row[i].size; } } else if (M->storage & NL_MATRIX_STORE_COLUMNS) { for(i = 0; in; i++) { nnz += M->column[i].size; } } else { nl_assert_not_reached; } return nnz; } void nlSparseMatrixSort( NLSparseMatrix* M) { NLuint i; if(M->storage & NL_MATRIX_STORE_ROWS) { for(i = 0; im; i++) { nlRowColumnSort(&(M->row[i])); } } if (M->storage & NL_MATRIX_STORE_COLUMNS) { for(i = 0; in; i++) { nlRowColumnSort(&(M->column[i])); } } } void nlSparseMatrixMAddRow( NLSparseMatrix* M, NLuint i1, 
double s, NLuint i2 ) { NLuint jj; NLRowColumn* Ri2 = &(M->row[i2]); NLCoeff* c = NULL; nl_debug_assert(i1 < M->m); nl_debug_assert(i2 < M->m); for(jj=0; jjsize; ++jj) { c = &(Ri2->coeff[jj]); nlSparseMatrixAdd(M, i1, c->index, s*c->value); } } void nlSparseMatrixScaleRow( NLSparseMatrix* M, NLuint i, double s ) { NLuint jj; NLRowColumn* Ri = &(M->row[i]); NLCoeff* c = NULL; nl_assert(M->storage & NL_MATRIX_STORE_ROWS); nl_assert(!(M->storage & NL_MATRIX_STORE_COLUMNS)); nl_debug_assert(i < M->m); for(jj=0; jjsize; ++jj) { c = &(Ri->coeff[jj]); c->value *= s; } if(i < M->diag_size) { M->diag[i] *= s; } } void nlSparseMatrixZeroRow( NLSparseMatrix* M, NLuint i ) { NLRowColumn* Ri = &(M->row[i]); nl_debug_assert(i < M->m); Ri->size = 0; if(i < M->diag_size) { M->diag[i] = 0.0; } } /* SparseMatrix x Vector routines, internal helper routines */ static void nlSparseMatrix_mult_rows_symmetric( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { NLuint m = A->m; NLuint i,ij; NLCoeff* c = NULL; for(i=0; irow[i]); y[i] = 0; for(ij=0; ijsize; ++ij) { c = &(Ri->coeff[ij]); y[i] += c->value * x[c->index]; if(i != c->index) { y[c->index] += c->value * x[i]; } } } } static void nlSparseMatrix_mult_rows( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { /* * Note: OpenMP does not like unsigned ints * (causes some floating point exceptions), * therefore I use here signed ints for all * indices. */ int m = (int)(A->m); int i,ij; NLCoeff* c = NULL; NLRowColumn* Ri = NULL; #if defined(_OPENMP) #pragma omp parallel for private(i,ij,c,Ri) #endif for(i=0; irow[i]); y[i] = 0; for(ij=0; ij<(int)(Ri->size); ij++) { c = &(Ri->coeff[ij]); y[i] += c->value * x[c->index]; } } } static void nlSparseMatrix_mult_cols_symmetric( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { NLuint n = A->n; NLuint j,ii; NLCoeff* c = NULL; for(j=0; jcolumn[j]); y[j] = 0; for(ii=0; iisize; ii++) { c = &(Cj->coeff[ii]); y[c->index] += c->value * x[j]; if(j != c->index) { y[j] += c->value * x[c->index]; } } } } static void nlSparseMatrix_mult_cols( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { NLuint n = A->n; NLuint j,ii; NLCoeff* c = NULL; NL_CLEAR_ARRAY(NLdouble, y, A->m); for(j=0; jcolumn[j]); for(ii=0; iisize; ii++) { c = &(Cj->coeff[ii]); y[c->index] += c->value * x[j]; } } } void nlSparseMatrixMult( NLSparseMatrix* A, const NLdouble* x, NLdouble* y ) { nl_assert(A->type == NL_MATRIX_SPARSE_DYNAMIC); if(A->storage & NL_MATRIX_STORE_ROWS) { if(A->storage & NL_MATRIX_STORE_SYMMETRIC) { nlSparseMatrix_mult_rows_symmetric(A, x, y); } else { nlSparseMatrix_mult_rows(A, x, y); } } else { if(A->storage & NL_MATRIX_STORE_SYMMETRIC) { nlSparseMatrix_mult_cols_symmetric(A, x, y); } else { nlSparseMatrix_mult_cols(A, x, y); } } nlHostBlas()->flops += (NLulong)(2*nlSparseMatrixNNZ(A)); } NLMatrix nlSparseMatrixNew( NLuint m, NLuint n, NLenum storage ) { NLSparseMatrix* result = NL_NEW(NLSparseMatrix); nlSparseMatrixConstruct(result, m, n, storage); return (NLMatrix)result; } void nlSparseMatrixConstruct( NLSparseMatrix* M, NLuint m, NLuint n, NLenum storage ) { NLuint i; M->m = m; M->n = n; M->type = NL_MATRIX_SPARSE_DYNAMIC; M->destroy_func = (NLDestroyMatrixFunc)nlSparseMatrixDestroy; M->mult_func = (NLMultMatrixVectorFunc)nlSparseMatrixMult; M->storage = storage; if(storage & NL_MATRIX_STORE_ROWS) { M->row = NL_NEW_ARRAY(NLRowColumn, m); M->row_capacity = m; for(i=0; irow[i])); } } else { M->row = NULL; M->row_capacity = 0; } if(storage & NL_MATRIX_STORE_COLUMNS) { M->column = NL_NEW_ARRAY(NLRowColumn, n); 
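        /* column storage mirrors the row storage set up above:
           one dynamic NLRowColumn per column of the matrix */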
M->column_capacity = n; for(i=0; icolumn[i])); } } else { M->column = NULL; M->column_capacity = 0; } M->diag_size = MIN(m,n); M->diag_capacity = M->diag_size; M->diag = NL_NEW_ARRAY(NLdouble, M->diag_size); } static void adjust_diag(NLSparseMatrix* M) { NLuint new_diag_size = MIN(M->m, M->n); NLuint i; if(new_diag_size > M->diag_size) { if(new_diag_size > M->diag_capacity) { M->diag_capacity *= 2; if(M->diag_capacity == 0) { M->diag_capacity = 16; } M->diag = NL_RENEW_ARRAY(double, M->diag, M->diag_capacity); for(i=M->diag_size; idiag[i] = 0.0; } } M->diag_size= new_diag_size; } } void nlSparseMatrixAddRow( NLSparseMatrix* M) { ++M->m; if(M->storage & NL_MATRIX_STORE_ROWS) { if(M->m > M->row_capacity) { M->row_capacity *= 2; if(M->row_capacity == 0) { M->row_capacity = 16; } M->row = NL_RENEW_ARRAY( NLRowColumn, M->row, M->row_capacity ); } nlRowColumnConstruct(&(M->row[M->m-1])); } adjust_diag(M); } void nlSparseMatrixAddColumn( NLSparseMatrix* M) { ++M->n; if(M->storage & NL_MATRIX_STORE_COLUMNS) { if(M->n > M->column_capacity) { M->column_capacity *= 2; if(M->column_capacity == 0) { M->column_capacity = 16; } M->column = NL_RENEW_ARRAY( NLRowColumn, M->column, M->column_capacity ); } nlRowColumnConstruct(&(M->column[M->n-1])); } adjust_diag(M); } NLMatrix nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix* M) { NLuint nnz = nlSparseMatrixNNZ(M); NLuint nslices = 8; /* TODO: get number of cores */ NLuint slice, cur_bound, cur_NNZ, cur_row; NLuint i,ij,k; NLuint slice_size = nnz / nslices; NLCRSMatrix* CRS = NL_NEW(NLCRSMatrix); nl_assert(M->storage & NL_MATRIX_STORE_ROWS); if(M->storage & NL_MATRIX_STORE_SYMMETRIC) { nl_assert(M->m == M->n); nlCRSMatrixConstructSymmetric(CRS, M->n, nnz); } else { nlCRSMatrixConstruct(CRS, M->m, M->n, nnz, nslices); } nlSparseMatrixSort(M); /* Convert matrix to CRS format */ k=0; for(i=0; im; ++i) { NLRowColumn* Ri = &(M->row[i]); CRS->rowptr[i] = k; for(ij=0; ijsize; ij++) { NLCoeff* c = &(Ri->coeff[ij]); CRS->val[k] = c->value; CRS->colind[k] = c->index; ++k; } } CRS->rowptr[M->m] = k; /* Create "slices" to be used by parallel sparse matrix vector product */ if(CRS->sliceptr != NULL) { cur_bound = slice_size; cur_NNZ = 0; cur_row = 0; CRS->sliceptr[0]=0; for(slice=1; slicem) { ++cur_row; cur_NNZ += CRS->rowptr[cur_row+1] - CRS->rowptr[cur_row]; } CRS->sliceptr[slice] = cur_row; cur_bound += slice_size; } CRS->sliceptr[nslices]=M->m; } return (NLMatrix)CRS; } NLMatrix nlCRSMatrixNewFromSparseMatrixSymmetric(NLSparseMatrix* M) { NLuint nnz; NLuint i,j,jj,k; NLCRSMatrix* CRS = NL_NEW(NLCRSMatrix); nl_assert(M->storage & NL_MATRIX_STORE_ROWS); nl_assert(M->m == M->n); nlSparseMatrixSort(M); if(M->storage & NL_MATRIX_STORE_SYMMETRIC) { nnz = nlSparseMatrixNNZ(M); } else { nnz = 0; for(i=0; in; ++i) { NLRowColumn* Ri = &M->row[i]; for(jj=0; jjsize; ++jj) { j = Ri->coeff[jj].index; if(j <= i) { ++nnz; } } } } nlCRSMatrixConstructSymmetric(CRS, M->n, nnz); k=0; for(i=0; im; ++i) { NLRowColumn* Ri = &(M->row[i]); CRS->rowptr[i] = k; for(jj=0; jjsize; ++jj) { j = Ri->coeff[jj].index; if((M->storage & NL_MATRIX_STORE_SYMMETRIC)) { nl_debug_assert(j <= i); } if(j <= i) { CRS->val[k] = Ri->coeff[jj].value; CRS->colind[k] = j; ++k; } } } CRS->rowptr[M->m] = k; return (NLMatrix)CRS; } void nlMatrixCompress(NLMatrix* M) { NLMatrix CRS = NULL; if((*M)->type != NL_MATRIX_SPARSE_DYNAMIC) { return; } CRS = nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)*M); nlDeleteMatrix(*M); *M = CRS; } NLuint nlMatrixNNZ(NLMatrix M) { if(M->type == NL_MATRIX_SPARSE_DYNAMIC) { return 
nlSparseMatrixNNZ((NLSparseMatrix*)M); } else if(M->type == NL_MATRIX_CRS) { return nlCRSMatrixNNZ((NLCRSMatrix*)M); } return M->m * M->n; } NLMatrix nlMatrixFactorize(NLMatrix M, NLenum solver) { NLMatrix result = NULL; switch(solver) { case NL_SUPERLU_EXT: case NL_PERM_SUPERLU_EXT: case NL_SYMMETRIC_SUPERLU_EXT: result = nlMatrixFactorize_SUPERLU(M,solver); break; case NL_CHOLMOD_EXT: result = nlMatrixFactorize_CHOLMOD(M,solver); break; default: nlError("nlMatrixFactorize","unknown solver"); } return result; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLMatrixFunc matrix_func; } NLFunctionMatrix; static void nlFunctionMatrixDestroy(NLFunctionMatrix* M) { (void)M; /* to avoid 'unused parameter' warning */ /* * Nothing special to do, * there is no dynamic allocated mem. */ } static void nlFunctionMatrixMult( NLFunctionMatrix* M, const NLdouble* x, NLdouble* y ) { M->matrix_func(x,y); } NLMatrix nlMatrixNewFromFunction(NLuint m, NLuint n, NLMatrixFunc func) { NLFunctionMatrix* result = NL_NEW(NLFunctionMatrix); result->m = m; result->n = n; result->type = NL_MATRIX_FUNCTION; result->destroy_func = (NLDestroyMatrixFunc)nlFunctionMatrixDestroy; result->mult_func = (NLMultMatrixVectorFunc)nlFunctionMatrixMult; result->matrix_func = func; return (NLMatrix)result; } NLMatrixFunc nlMatrixGetFunction(NLMatrix M) { if(M == NULL) { return NULL; } if(M->type != NL_MATRIX_FUNCTION) { return NULL; } return ((NLFunctionMatrix*)M)->matrix_func; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLMatrixFunc matrix_func; NLMatrix M; NLboolean owns_M; NLMatrix N; NLboolean owns_N; NLdouble* work; } NLMatrixProduct; static void nlMatrixProductDestroy(NLMatrixProduct* P) { NL_DELETE_ARRAY(P->work); if(P->owns_M) { nlDeleteMatrix(P->M); P->M = NULL; } if(P->owns_N) { nlDeleteMatrix(P->N); P->N = NULL; } } static void nlMatrixProductMult( NLMatrixProduct* P, const NLdouble* x, NLdouble* y ) { nlMultMatrixVector(P->N, x, P->work); nlMultMatrixVector(P->M, P->work, y); } NLMatrix nlMatrixNewFromProduct( NLMatrix M, NLboolean owns_M, NLMatrix N, NLboolean owns_N ) { NLMatrixProduct* result = NL_NEW(NLMatrixProduct); nl_assert(M->n == N->m); result->m = M->m; result->n = N->n; result->type = NL_MATRIX_OTHER; result->work = NL_NEW_ARRAY(NLdouble,N->m); result->destroy_func = (NLDestroyMatrixFunc)nlMatrixProductDestroy; result->mult_func = (NLMultMatrixVectorFunc)nlMatrixProductMult; result->M = M; result->owns_M = owns_M; result->N = N; result->owns_N = owns_N; return (NLMatrix)result; } /******* extracted from nl_context.c *******/ NLContextStruct* nlCurrentContext = NULL; NLContext nlNewContext() { NLContextStruct* result = NL_NEW(NLContextStruct); result->state = NL_STATE_INITIAL; result->solver = NL_SOLVER_DEFAULT; result->max_iterations = 100; result->threshold = 1e-6; result->omega = 1.5; result->row_scaling = 1.0; result->inner_iterations = 5; result->solver_func = nlDefaultSolver; result->progress_func = NULL; result->verbose = NL_FALSE; result->nb_systems = 1; result->matrix_mode = NL_STIFFNESS_MATRIX; nlMakeCurrent(result); return result; } void nlDeleteContext(NLContext context_in) { NLContextStruct* context = (NLContextStruct*)(context_in); if(nlCurrentContext == context) { nlCurrentContext = NULL; } nlDeleteMatrix(context->M); context->M = NULL; nlDeleteMatrix(context->P); context->P = NULL; nlDeleteMatrix(context->B); context->B = NULL; 
nlRowColumnDestroy(&context->af); nlRowColumnDestroy(&context->al); NL_DELETE_ARRAY(context->variable_value); NL_DELETE_ARRAY(context->variable_buffer); NL_DELETE_ARRAY(context->variable_is_locked); NL_DELETE_ARRAY(context->variable_index); NL_DELETE_ARRAY(context->x); NL_DELETE_ARRAY(context->b); NL_DELETE_ARRAY(context->right_hand_side); NL_DELETE_ARRAY(context->eigen_value); #ifdef NL_PARANOID NL_CLEAR(NLContextStruct, context); #endif NL_DELETE(context); } void nlMakeCurrent(NLContext context) { nlCurrentContext = (NLContextStruct*)(context); } NLContext nlGetCurrent() { return nlCurrentContext; } /* Finite state automaton */ void nlCheckState(NLenum state) { nl_assert(nlCurrentContext->state == state); } void nlTransition(NLenum from_state, NLenum to_state) { nlCheckState(from_state); nlCurrentContext->state = to_state; } /* Preconditioner setup and default solver */ static void nlSetupPreconditioner() { /* Check compatibility between solver and preconditioner */ if( nlCurrentContext->solver == NL_BICGSTAB && nlCurrentContext->preconditioner == NL_PRECOND_SSOR ) { nlWarning( "nlSolve", "cannot use SSOR preconditioner with non-symmetric matrix, " "switching to Jacobi" ); nlCurrentContext->preconditioner = NL_PRECOND_JACOBI; } if( nlCurrentContext->solver == NL_GMRES && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning("nlSolve", "Preconditioner not implemented yet for GMRES"); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } if( nlCurrentContext->solver == NL_SUPERLU_EXT && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning("nlSolve", "Preconditioner not implemented yet for SUPERLU"); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } if( nlCurrentContext->solver == NL_CHOLMOD_EXT && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning("nlSolve", "Preconditioner not implemented yet for CHOLMOD"); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } if( nlCurrentContext->solver == NL_PERM_SUPERLU_EXT && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning( "nlSolve", "Preconditioner not implemented yet for PERMSUPERLU" ); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } if( nlCurrentContext->solver == NL_SYMMETRIC_SUPERLU_EXT && nlCurrentContext->preconditioner != NL_PRECOND_NONE ) { nlWarning( "nlSolve", "Preconditioner not implemented yet for PERMSUPERLU" ); nlCurrentContext->preconditioner = NL_PRECOND_NONE; } nlDeleteMatrix(nlCurrentContext->P); nlCurrentContext->P = NULL; switch(nlCurrentContext->preconditioner) { case NL_PRECOND_NONE: break; case NL_PRECOND_JACOBI: nlCurrentContext->P = nlNewJacobiPreconditioner(nlCurrentContext->M); break; case NL_PRECOND_SSOR: nlCurrentContext->P = nlNewSSORPreconditioner( nlCurrentContext->M,nlCurrentContext->omega ); break; case NL_PRECOND_USER: break; default: nl_assert_not_reached; } if(nlCurrentContext->preconditioner != NL_PRECOND_SSOR) { if(getenv("NL_LOW_MEM") == NULL) { nlMatrixCompress(&nlCurrentContext->M); } } } static NLboolean nlSolveDirect() { NLdouble* b = nlCurrentContext->b; NLdouble* x = nlCurrentContext->x; NLuint n = nlCurrentContext->n; NLuint k; NLMatrix F = nlMatrixFactorize( nlCurrentContext->M, nlCurrentContext->solver ); if(F == NULL) { return NL_FALSE; } for(k=0; knb_systems; ++k) { nlMultMatrixVector(F, b, x); b += n; x += n; } nlDeleteMatrix(F); return NL_TRUE; } static NLboolean nlSolveIterative() { NLboolean use_CUDA = NL_FALSE; NLdouble* b = nlCurrentContext->b; NLdouble* x = nlCurrentContext->x; NLuint n = nlCurrentContext->n; NLuint k; 
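    /*
     * Solves the nb_systems right-hand sides stored contiguously in b
     * into the matching slices of x, using the iterative solver and
     * preconditioner selected in the current context. The host BLAS,
     * matrix and preconditioner chosen below may be replaced by their
     * CUDA counterparts when the CUDA extension is initialized.
     */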
NLBlas_t blas = nlHostBlas(); NLMatrix M = nlCurrentContext->M; NLMatrix P = nlCurrentContext->P; /* * For CUDA: it is implemented for * all iterative solvers except GMRES * Jacobi preconditioner */ if(nlExtensionIsInitialized_CUDA() && (nlCurrentContext->solver != NL_GMRES) && (nlCurrentContext->preconditioner == NL_PRECOND_NONE || nlCurrentContext->preconditioner == NL_PRECOND_JACOBI) ) { if(nlCurrentContext->verbose) { nl_printf("Using CUDA\n"); } use_CUDA = NL_TRUE; blas = nlCUDABlas(); if(nlCurrentContext->preconditioner == NL_PRECOND_JACOBI) { P = nlCUDAJacobiPreconditionerNewFromCRSMatrix(M); } M = nlCUDAMatrixNewFromCRSMatrix(M); } /* * We do not count CUDA transfers and CUDA matrix construction * when estimating GFlops */ nlCurrentContext->start_time = nlCurrentTime(); nlBlasResetStats(blas); for(k=0; knb_systems; ++k) { nlSolveSystemIterative( blas, M, P, b, x, nlCurrentContext->solver, nlCurrentContext->threshold, nlCurrentContext->max_iterations, nlCurrentContext->inner_iterations ); b += n; x += n; } nlCurrentContext->flops += blas->flops; if(use_CUDA) { nlDeleteMatrix(M); nlDeleteMatrix(P); } return NL_TRUE; } NLboolean nlDefaultSolver() { NLboolean result = NL_TRUE; nlSetupPreconditioner(); switch(nlCurrentContext->solver) { case NL_CG: case NL_BICGSTAB: case NL_GMRES: { result = nlSolveIterative(); } break; case NL_SUPERLU_EXT: case NL_PERM_SUPERLU_EXT: case NL_SYMMETRIC_SUPERLU_EXT: case NL_CHOLMOD_EXT: { result = nlSolveDirect(); } break; default: nl_assert_not_reached; } return result; } /******* extracted from nl_blas.c *******/ /* Many warnings about const double* converted to double* when calling BLAS functions that do not have the const qualifier in their prototypes. */ #ifdef __clang__ #pragma GCC diagnostic ignored "-Wcast-qual" #endif #ifndef NL_FORTRAN_WRAP #define NL_FORTRAN_WRAP(x) x##_ #endif #ifdef NL_USE_ATLAS int NL_FORTRAN_WRAP(xerbla)(char *srname, int *info) { nl_printf(stderr, "** On entry to %6s, parameter number %2d had an illegal value\n", srname, *info ); return 0; } #ifndef NL_USE_BLAS #define NL_USE_BLAS #endif #endif #ifdef NL_USE_SUPERLU #ifndef NL_USE_BLAS #define NL_USE_BLAS /* * The BLAS included in SuperLU does not have DTPSV, * we use the DTPSV embedded in OpenNL. */ #define NEEDS_DTPSV #endif #endif #ifndef NL_USE_BLAS #define NEEDS_DTPSV #endif /* BLAS routines */ /* copy-pasted from CBLAS (i.e. generated from f2c) */ /* * lsame * xerbla * daxpy * ddot * dscal * dnrm2 * dcopy * dgemv * dtpsv */ typedef NLint integer ; typedef NLdouble doublereal ; typedef NLboolean logical ; typedef NLint ftnlen ; #ifndef max #define max(x,y) ((x) > (y) ? (x) : (y)) #endif #ifndef NL_USE_BLAS static int NL_FORTRAN_WRAP(lsame)(const char *ca, const char *cb) { /* -- LAPACK auxiliary routine (version 2.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University September 30, 1994 Purpose ======= LSAME returns .TRUE. if CA is the same letter as CB regardless of case. Arguments ========= CA (input) CHARACTER*1 CB (input) CHARACTER*1 CA and CB specify the single characters to be compared. ===================================================================== */ /* System generated locals */ int ret_val; /* Local variables */ int inta, intb, zcode; ret_val = *(unsigned char *)ca == *(unsigned char *)cb; if (ret_val) { return ret_val; } /* Now test for equivalence if both characters are alphabetic. 
*/ zcode = 'Z'; /* Use 'Z' rather than 'A' so that ASCII can be detected on Prime machines, on which ICHAR returns a value with bit 8 set. ICHAR('A') on Prime machines returns 193 which is the same as ICHAR('A') on an EBCDIC machine. */ inta = *(unsigned char *)ca; intb = *(unsigned char *)cb; if (zcode == 90 || zcode == 122) { /* ASCII is assumed - ZCODE is the ASCII code of either lower or upper case 'Z'. */ if (inta >= 97 && inta <= 122) inta += -32; if (intb >= 97 && intb <= 122) intb += -32; } else if (zcode == 233 || zcode == 169) { /* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or upper case 'Z'. */ if ((inta >= 129 && inta <= 137) || (inta >= 145 && inta <= 153) || (inta >= 162 && inta <= 169) ) inta += 64; if ( (intb >= 129 && intb <= 137) || (intb >= 145 && intb <= 153) || (intb >= 162 && intb <= 169) ) intb += 64; } else if (zcode == 218 || zcode == 250) { /* ASCII is assumed, on Prime machines - ZCODE is the ASCII code plus 128 of either lower or upper case 'Z'. */ if (inta >= 225 && inta <= 250) inta += -32; if (intb >= 225 && intb <= 250) intb += -32; } ret_val = inta == intb; return ret_val; } /* lsame_ */ /* Subroutine */ static int NL_FORTRAN_WRAP(xerbla)(const char *srname, int *info) { /* -- LAPACK auxiliary routine (version 2.0) -- Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., Courant Institute, Argonne National Lab, and Rice University September 30, 1994 Purpose ======= XERBLA is an error handler for the LAPACK routines. It is called by an LAPACK routine if an input parameter has an invalid value. A message is printed and execution stops. Installers may consider modifying the STOP statement in order to call system-specific exception-handling facilities. Arguments ========= SRNAME (input) CHARACTER*6 The name of the routine which called XERBLA. INFO (input) INT The position of the invalid parameter in the parameter list of the calling routine. ===================================================================== */ nl_fprintf(stderr, "** On entry to %6s, parameter number %2d had an illegal value\n", srname, *info); /* End of XERBLA */ return 0; } /* xerbla_ */ /* Subroutine */ static int NL_FORTRAN_WRAP(daxpy)(integer *n, doublereal *da, doublereal *dx, integer *incx, doublereal *dy, integer *incy) { /* System generated locals */ integer i__1; /* Local variables */ static integer i, m, ix, iy, mp1; /* constant times a vector plus a vector. uses unrolled loops for increments equal to one. jack dongarra, linpack, 3/11/78. modified 12/3/93, array(1) declarations changed to array(*) Parameter adjustments Function Body */ #define DY(I) dy[(I)-1] #define DX(I) dx[(I)-1] if (*n <= 0) { return 0; } if (*da == 0.) 
{ return 0; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i = 1; i <= *n; ++i) { DY(iy) += *da * DX(ix); ix += *incx; iy += *incy; /* L10: */ } return 0; /* code for both increments equal to 1 clean-up loop */ L20: m = *n % 4; if (m == 0) { goto L40; } i__1 = m; for (i = 1; i <= m; ++i) { DY(i) += *da * DX(i); /* L30: */ } if (*n < 4) { return 0; } L40: mp1 = m + 1; i__1 = *n; for (i = mp1; i <= *n; i += 4) { DY(i) += *da * DX(i); DY(i + 1) += *da * DX(i + 1); DY(i + 2) += *da * DX(i + 2); DY(i + 3) += *da * DX(i + 3); /* L50: */ } nl_arg_used(i__1); return 0; } /* daxpy_ */ #undef DY #undef DX static doublereal NL_FORTRAN_WRAP(ddot)(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy) { /* System generated locals */ integer i__1; doublereal ret_val; /* Local variables */ static integer i, m; static doublereal dtemp; static integer ix, iy, mp1; /* forms the dot product of two vectors. uses unrolled loops for increments equal to one. jack dongarra, linpack, 3/11/78. modified 12/3/93, array(1) declarations changed to array(*) Parameter adjustments Function Body */ #define DY(I) dy[(I)-1] #define DX(I) dx[(I)-1] ret_val = 0.; dtemp = 0.; if (*n <= 0) { return ret_val; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i = 1; i <= *n; ++i) { dtemp += DX(ix) * DY(iy); ix += *incx; iy += *incy; /* L10: */ } ret_val = dtemp; return ret_val; /* code for both increments equal to 1 clean-up loop */ L20: m = *n % 5; if (m == 0) { goto L40; } i__1 = m; for (i = 1; i <= m; ++i) { dtemp += DX(i) * DY(i); /* L30: */ } if (*n < 5) { goto L60; } L40: mp1 = m + 1; i__1 = *n; for (i = mp1; i <= *n; i += 5) { dtemp = dtemp + DX(i) * DY(i) + DX(i + 1) * DY(i + 1) + DX(i + 2) * DY(i + 2) + DX(i + 3) * DY(i + 3) + DX(i + 4) * DY(i + 4); /* L50: */ } L60: ret_val = dtemp; nl_arg_used(i__1); return ret_val; } /* ddot_ */ #undef DY #undef DX /* Subroutine */ static int NL_FORTRAN_WRAP(dscal)(integer *n, doublereal *da, doublereal *dx, integer *incx) { /* System generated locals */ integer i__1, i__2; /* Local variables */ static integer i, m, nincx, mp1; /* scales a vector by a constant. uses unrolled loops for increment equal to one. jack dongarra, linpack, 3/11/78. modified 3/93 to return if incx .le. 0. modified 12/3/93, array(1) declarations changed to array(*) Parameter adjustments Function Body */ #ifdef DX #undef DX #endif #define DX(I) dx[(I)-1] if (*n <= 0 || *incx <= 0) { return 0; } if (*incx == 1) { goto L20; } /* code for increment not equal to 1 */ nincx = *n * *incx; i__1 = nincx; i__2 = *incx; for (i = 1; *incx < 0 ? 
i >= nincx : i <= nincx; i += *incx) { DX(i) = *da * DX(i); /* L10: */ } return 0; /* code for increment equal to 1 clean-up loop */ L20: m = *n % 5; if (m == 0) { goto L40; } i__2 = m; for (i = 1; i <= m; ++i) { DX(i) = *da * DX(i); /* L30: */ } if (*n < 5) { return 0; } L40: mp1 = m + 1; i__2 = *n; for (i = mp1; i <= *n; i += 5) { DX(i) = *da * DX(i); DX(i + 1) = *da * DX(i + 1); DX(i + 2) = *da * DX(i + 2); DX(i + 3) = *da * DX(i + 3); DX(i + 4) = *da * DX(i + 4); /* L50: */ } nl_arg_used(i__1); nl_arg_used(i__2); return 0; } /* dscal_ */ #undef DX static doublereal NL_FORTRAN_WRAP(dnrm2)(integer *n, doublereal *x, integer *incx) { /* System generated locals */ integer i__1, i__2; doublereal ret_val, d__1; /* Builtin functions */ /* BL: already declared in the included , we do not need it here. */ /*double sqrt(doublereal); */ /* Local variables */ static doublereal norm, scale, absxi; static integer ix; static doublereal ssq; /* DNRM2 returns the euclidean norm of a vector via the function name, so that DNRM2 := sqrt( x'*x ) -- This version written on 25-October-1982. Modified on 14-October-1993 to inline the call to DLASSQ. Sven Hammarling, Nag Ltd. Parameter adjustments Function Body */ #ifdef X #undef X #endif #define X(I) x[(I)-1] if (*n < 1 || *incx < 1) { norm = 0.; } else if (*n == 1) { norm = fabs(X(1)); } else { scale = 0.; ssq = 1.; /* The following loop is equivalent to this call to the LAPACK auxiliary routine: CALL DLASSQ( N, X, INCX, SCALE, SSQ ) */ i__1 = (*n - 1) * *incx + 1; i__2 = *incx; for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) { if (X(ix) != 0.) { absxi = (d__1 = X(ix), fabs(d__1)); if (scale < absxi) { /* Computing 2nd power */ d__1 = scale / absxi; ssq = ssq * (d__1 * d__1) + 1.; scale = absxi; } else { /* Computing 2nd power */ d__1 = absxi / scale; ssq += d__1 * d__1; } } /* L10: */ } norm = scale * sqrt(ssq); } ret_val = norm; nl_arg_used(i__1); nl_arg_used(i__2); return ret_val; /* End of DNRM2. */ } /* dnrm2_ */ #undef X /* Subroutine */ static int NL_FORTRAN_WRAP(dcopy)(integer *n, doublereal *dx, integer *incx, doublereal *dy, integer *incy) { /* System generated locals */ integer i__1; /* Local variables */ static integer i, m, ix, iy, mp1; /* copies a vector, x, to a vector, y. uses unrolled loops for increments equal to one. jack dongarra, linpack, 3/11/78. 
modified 12/3/93, array(1) declarations changed to array(*) Parameter adjustments Function Body */ #define DY(I) dy[(I)-1] #define DX(I) dx[(I)-1] if (*n <= 0) { return 0; } if (*incx == 1 && *incy == 1) { goto L20; } /* code for unequal increments or equal increments not equal to 1 */ ix = 1; iy = 1; if (*incx < 0) { ix = (-(*n) + 1) * *incx + 1; } if (*incy < 0) { iy = (-(*n) + 1) * *incy + 1; } i__1 = *n; for (i = 1; i <= *n; ++i) { DY(iy) = DX(ix); ix += *incx; iy += *incy; /* L10: */ } return 0; /* code for both increments equal to 1 clean-up loop */ L20: m = *n % 7; if (m == 0) { goto L40; } i__1 = m; for (i = 1; i <= m; ++i) { DY(i) = DX(i); /* L30: */ } if (*n < 7) { return 0; } L40: mp1 = m + 1; i__1 = *n; for (i = mp1; i <= *n; i += 7) { DY(i) = DX(i); DY(i + 1) = DX(i + 1); DY(i + 2) = DX(i + 2); DY(i + 3) = DX(i + 3); DY(i + 4) = DX(i + 4); DY(i + 5) = DX(i + 5); DY(i + 6) = DX(i + 6); /* L50: */ } nl_arg_used(i__1); return 0; } /* dcopy_ */ #undef DX #undef DY /* Subroutine */ static int NL_FORTRAN_WRAP(dgemv)(const char *trans, integer *m, integer *n, doublereal * alpha, doublereal *a, integer *lda, doublereal *x, integer *incx, doublereal *beta, doublereal *y, integer *incy) { /* System generated locals */ /* integer a_dim1, a_offset ; */ integer i__1, i__2; /* Local variables */ static integer info; static doublereal temp; static integer lenx, leny, i, j; /* extern logical lsame_(char *, char *); */ static integer ix, iy, jx, jy, kx, ky; /* extern int xerbla_(char *, integer *); */ /* Purpose ======= DGEMV performs one of the matrix-vector operations y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, where alpha and beta are scalars, x and y are vectors and A is an m by n matrix. Parameters ========== TRANS - CHARACTER*1. On entry, TRANS specifies the operation to be performed as follows: TRANS = 'N' or 'n' y := alpha*A*x + beta*y. TRANS = 'T' or 't' y := alpha*A'*x + beta*y. TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. Unchanged on exit. M - INTEGER. On entry, M specifies the number of rows of the matrix A. M must be at least zero. Unchanged on exit. N - INTEGER. On entry, N specifies the number of columns of the matrix A. N must be at least zero. Unchanged on exit. ALPHA - DOUBLE PRECISION. On entry, ALPHA specifies the scalar alpha. Unchanged on exit. A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). Before entry, the leading m by n part of the array A must contain the matrix of coefficients. Unchanged on exit. LDA - INTEGER. On entry, LDA specifies the first dimension of A as declared in the calling (sub) program. LDA must be at least max( 1, m ). Unchanged on exit. X - DOUBLE PRECISION array of DIMENSION at least ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. Before entry, the incremented array X must contain the vector x. Unchanged on exit. INCX - INTEGER. On entry, INCX specifies the increment for the elements of X. INCX must not be zero. Unchanged on exit. BETA - DOUBLE PRECISION. On entry, BETA specifies the scalar beta. When BETA is supplied as zero then Y need not be set on input. Unchanged on exit. Y - DOUBLE PRECISION array of DIMENSION at least ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' and at least ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. Before entry with BETA non-zero, the incremented array Y must contain the vector y. On exit, Y is overwritten by the updated vector y. INCY - INTEGER. On entry, INCY specifies the increment for the elements of Y. INCY must not be zero. 
Unchanged on exit. Level 2 Blas routine. -- Written on 22-October-1986. Jack Dongarra, Argonne National Lab. Jeremy Du Croz, Nag Central Office. Sven Hammarling, Nag Central Office. Richard Hanson, Sandia National Labs. Test the input parameters. Parameter adjustments Function Body */ #define X(I) x[(I)-1] #define Y(I) y[(I)-1] #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)] info = 0; if (! NL_FORTRAN_WRAP(lsame)(trans, "N") && ! NL_FORTRAN_WRAP(lsame)(trans, "T") && ! NL_FORTRAN_WRAP(lsame)(trans, "C")) { info = 1; } else if (*m < 0) { info = 2; } else if (*n < 0) { info = 3; } else if (*lda < max(1,*m)) { info = 6; } else if (*incx == 0) { info = 8; } else if (*incy == 0) { info = 11; } if (info != 0) { NL_FORTRAN_WRAP(xerbla)("DGEMV ", &info); return 0; } /* Quick return if possible. */ if (*m == 0 || *n == 0 || (*alpha == 0. && *beta == 1.)) { return 0; } /* Set LENX and LENY, the lengths of the vectors x and y, and set up the start points in X and Y. */ if (NL_FORTRAN_WRAP(lsame)(trans, "N")) { lenx = *n; leny = *m; } else { lenx = *m; leny = *n; } if (*incx > 0) { kx = 1; } else { kx = 1 - (lenx - 1) * *incx; } if (*incy > 0) { ky = 1; } else { ky = 1 - (leny - 1) * *incy; } /* Start the operations. In this version the elements of A are accessed sequentially with one pass through A. First form y := beta*y. */ if (*beta != 1.) { if (*incy == 1) { if (*beta == 0.) { i__1 = leny; for (i = 1; i <= leny; ++i) { Y(i) = 0.; /* L10: */ } } else { i__1 = leny; for (i = 1; i <= leny; ++i) { Y(i) = *beta * Y(i); /* L20: */ } } } else { iy = ky; if (*beta == 0.) { i__1 = leny; for (i = 1; i <= leny; ++i) { Y(iy) = 0.; iy += *incy; /* L30: */ } } else { i__1 = leny; for (i = 1; i <= leny; ++i) { Y(iy) = *beta * Y(iy); iy += *incy; /* L40: */ } } } } if (*alpha == 0.) { return 0; } if (NL_FORTRAN_WRAP(lsame)(trans, "N")) { /* Form y := alpha*A*x + y. */ jx = kx; if (*incy == 1) { i__1 = *n; for (j = 1; j <= *n; ++j) { if (X(jx) != 0.) { temp = *alpha * X(jx); i__2 = *m; for (i = 1; i <= *m; ++i) { Y(i) += temp * A(i,j); /* L50: */ } } jx += *incx; /* L60: */ } } else { i__1 = *n; for (j = 1; j <= *n; ++j) { if (X(jx) != 0.) { temp = *alpha * X(jx); iy = ky; i__2 = *m; for (i = 1; i <= *m; ++i) { Y(iy) += temp * A(i,j); iy += *incy; /* L70: */ } } jx += *incx; /* L80: */ } } } else { /* Form y := alpha*A'*x + y. */ jy = ky; if (*incx == 1) { i__1 = *n; for (j = 1; j <= *n; ++j) { temp = 0.; i__2 = *m; for (i = 1; i <= *m; ++i) { temp += A(i,j) * X(i); /* L90: */ } Y(jy) += *alpha * temp; jy += *incy; /* L100: */ } } else { i__1 = *n; for (j = 1; j <= *n; ++j) { temp = 0.; ix = kx; i__2 = *m; for (i = 1; i <= *m; ++i) { temp += A(i,j) * X(ix); ix += *incx; /* L110: */ } Y(jy) += *alpha * temp; jy += *incy; /* L120: */ } } } nl_arg_used(i__1); nl_arg_used(i__2); return 0; /* End of DGEMV . 
*/ } /* dgemv_ */ #undef X #undef Y #undef A #else extern void NL_FORTRAN_WRAP(daxpy)( int *n, double *alpha, double *x, int *incx, double *y, int *incy ) ; extern double NL_FORTRAN_WRAP(dnrm2)( int *n, double *x, int *incx ) ; extern int NL_FORTRAN_WRAP(dcopy)(int* n, double* dx, int* incx, double* dy, int* incy) ; extern void NL_FORTRAN_WRAP(dscal)(int* n, double* alpha, double *x, int* incx) ; #ifndef NEEDS_DTPSV extern void NL_FORTRAN_WRAP(dtpsv)( char *uplo, char *trans, char *diag, int *n, double *AP, double *x, int *incx ) ; #endif extern void NL_FORTRAN_WRAP(dgemv)( char *trans, int *m, int *n, double *alpha, double *A, int *ldA, double *x, int *incx, double *beta, double *y, int *incy ) ; #endif #ifdef NEEDS_DTPSV /* DECK DTPSV */ /* Subroutine */ static int NL_FORTRAN_WRAP(dtpsv)( const char* uplo, const char* trans, const char* diag, integer* n, doublereal* ap, doublereal* x, integer* incx ) { /* System generated locals */ integer i__1, i__2; /* Local variables */ static integer info; static doublereal temp; static integer i__, j, k; /* extern logical lsame_(); */ static integer kk, ix, jx, kx; /* extern int xerbla_(); */ static logical nounit; /* ***BEGIN PROLOGUE DTPSV */ /* ***PURPOSE Solve one of the systems of equations. */ /* ***LIBRARY SLATEC (BLAS) */ /* ***CATEGORY D1B4 */ /* ***TYPE DOUBLE PRECISION (STPSV-S, DTPSV-D, CTPSV-C) */ /* ***KEYWORDS LEVEL 2 BLAS, LINEAR ALGEBRA */ /* ***AUTHOR Dongarra, J. J., (ANL) */ /* Du Croz, J., (NAG) */ /* Hammarling, S., (NAG) */ /* Hanson, R. J., (SNLA) */ /* ***DESCRIPTION */ /* DTPSV solves one of the systems of equations */ /* A*x = b, or A'*x = b, */ /* where b and x are n element vectors and A is an n by n unit, or */ /* non-unit, upper or lower triangular matrix, supplied in packed form. */ /* No test for singularity or near-singularity is included in this */ /* routine. Such tests must be performed before calling this routine. */ /* Parameters */ /* ========== */ /* UPLO - CHARACTER*1. */ /* On entry, UPLO specifies whether the matrix is an upper or */ /* lower triangular matrix as follows: */ /* UPLO = 'U' or 'u' A is an upper triangular matrix. */ /* UPLO = 'L' or 'l' A is a lower triangular matrix. */ /* Unchanged on exit. */ /* TRANS - CHARACTER*1. */ /* On entry, TRANS specifies the equations to be solved as */ /* follows: */ /* TRANS = 'N' or 'n' A*x = b. */ /* TRANS = 'T' or 't' A'*x = b. */ /* TRANS = 'C' or 'c' A'*x = b. */ /* Unchanged on exit. */ /* DIAG - CHARACTER*1. */ /* On entry, DIAG specifies whether or not A is unit */ /* triangular as follows: */ /* DIAG = 'U' or 'u' A is assumed to be unit triangular. */ /* DIAG = 'N' or 'n' A is not assumed to be unit */ /* triangular. */ /* Unchanged on exit. */ /* N - INTEGER. */ /* On entry, N specifies the order of the matrix A. */ /* N must be at least zero. */ /* Unchanged on exit. */ /* AP - DOUBLE PRECISION array of DIMENSION at least */ /* ( ( n*( n + 1))/2). */ /* Before entry with UPLO = 'U' or 'u', the array AP must */ /* contain the upper triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */ /* respectively, and so on. */ /* Before entry with UPLO = 'L' or 'l', the array AP must */ /* contain the lower triangular matrix packed sequentially, */ /* column by column, so that AP( 1 ) contains a( 1, 1 ), */ /* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */ /* respectively, and so on. 
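   (For illustration, not part of the original SLATEC prologue: with N = 3 and
   UPLO = 'L' the packed order is AP = { a(1,1), a(2,1), a(3,1), a(2,2),
   a(3,2), a(3,3) }, while with UPLO = 'U' it is AP = { a(1,1), a(1,2),
   a(2,2), a(1,3), a(2,3), a(3,3) }.)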
*/ /* Note that when DIAG = 'U' or 'u', the diagonal elements of */ /* A are not referenced, but are assumed to be unity. */ /* Unchanged on exit. */ /* X - DOUBLE PRECISION array of dimension at least */ /* ( 1 + ( n - 1 )*abs( INCX ) ). */ /* Before entry, the incremented array X must contain the n */ /* element right-hand side vector b. On exit, X is overwritten */ /* with the solution vector x. */ /* INCX - INTEGER. */ /* On entry, INCX specifies the increment for the elements of */ /* X. INCX must not be zero. */ /* Unchanged on exit. */ /* ***REFERENCES Dongarra, J. J., Du Croz, J., Hammarling, S., and */ /* Hanson, R. J. An extended set of Fortran basic linear */ /* algebra subprograms. ACM TOMS, Vol. 14, No. 1, */ /* pp. 1-17, March 1988. */ /* ***ROUTINES CALLED LSAME, XERBLA */ /* ***REVISION HISTORY (YYMMDD) */ /* 861022 DATE WRITTEN */ /* 910605 Modified to meet SLATEC prologue standards. Only comment */ /* lines were modified. (BKS) */ /* ***END PROLOGUE DTPSV */ /* .. Scalar Arguments .. */ /* .. Array Arguments .. */ /* .. Parameters .. */ /* .. Local Scalars .. */ /* .. External Functions .. */ /* .. External Subroutines .. */ /* ***FIRST EXECUTABLE STATEMENT DTPSV */ /* Test the input parameters. */ /* Parameter adjustments */ --x; --ap; /* Function Body */ info = 0; if (!NL_FORTRAN_WRAP(lsame)(uplo, "U") && !NL_FORTRAN_WRAP(lsame)(uplo, "L") ) { info = 1; } else if ( !NL_FORTRAN_WRAP(lsame)(trans, "N") && !NL_FORTRAN_WRAP(lsame)(trans, "T") && !NL_FORTRAN_WRAP(lsame)(trans, "C") ) { info = 2; } else if ( !NL_FORTRAN_WRAP(lsame)(diag, "U") && !NL_FORTRAN_WRAP(lsame)(diag, "N") ) { info = 3; } else if (*n < 0) { info = 4; } else if (*incx == 0) { info = 7; } if (info != 0) { NL_FORTRAN_WRAP(xerbla)("DTPSV ", &info); return 0; } /* Quick return if possible. */ if (*n == 0) { return 0; } nounit = (logical)(NL_FORTRAN_WRAP(lsame)(diag, "N")); /* Set up the start point in X if the increment is not unity. This */ /* will be ( N - 1 )*INCX too small for descending loops. */ if (*incx <= 0) { kx = 1 - (*n - 1) * *incx; } else if (*incx != 1) { kx = 1; } /* Start the operations. In this version the elements of AP are */ /* accessed sequentially with one pass through AP. */ if (NL_FORTRAN_WRAP(lsame)(trans, "N")) { /* Form x := inv( A )*x. */ if (NL_FORTRAN_WRAP(lsame)(uplo, "U")) { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { if (x[j] != 0.) { if (nounit) { x[j] /= ap[kk]; } temp = x[j]; k = kk - 1; for (i__ = j - 1; i__ >= 1; --i__) { x[i__] -= temp * ap[k]; --k; /* L10: */ } } kk -= j; /* L20: */ } } else { jx = kx + (*n - 1) * *incx; for (j = *n; j >= 1; --j) { if (x[jx] != 0.) { if (nounit) { x[jx] /= ap[kk]; } temp = x[jx]; ix = jx; i__1 = kk - j + 1; for (k = kk - 1; k >= i__1; --k) { ix -= *incx; x[ix] -= temp * ap[k]; /* L30: */ } } jx -= *incx; kk -= j; /* L40: */ } } } else { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[j] != 0.) { if (nounit) { x[j] /= ap[kk]; } temp = x[j]; k = kk + 1; i__2 = *n; for (i__ = j + 1; i__ <= i__2; ++i__) { x[i__] -= temp * ap[k]; ++k; /* L50: */ } } kk += *n - j + 1; /* L60: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { if (x[jx] != 0.) { if (nounit) { x[jx] /= ap[kk]; } temp = x[jx]; ix = jx; i__2 = kk + *n - j; for (k = kk + 1; k <= i__2; ++k) { ix += *incx; x[ix] -= temp * ap[k]; /* L70: */ } } jx += *incx; kk += *n - j + 1; /* L80: */ } } } } else { /* Form x := inv( A' )*x. 
*/ if (NL_FORTRAN_WRAP(lsame)(uplo, "U")) { kk = 1; if (*incx == 1) { i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[j]; k = kk; i__2 = j - 1; for (i__ = 1; i__ <= i__2; ++i__) { temp -= ap[k] * x[i__]; ++k; /* L90: */ } if (nounit) { temp /= ap[kk + j - 1]; } x[j] = temp; kk += j; /* L100: */ } } else { jx = kx; i__1 = *n; for (j = 1; j <= i__1; ++j) { temp = x[jx]; ix = kx; i__2 = kk + j - 2; for (k = kk; k <= i__2; ++k) { temp -= ap[k] * x[ix]; ix += *incx; /* L110: */ } if (nounit) { temp /= ap[kk + j - 1]; } x[jx] = temp; jx += *incx; kk += j; /* L120: */ } } } else { kk = *n * (*n + 1) / 2; if (*incx == 1) { for (j = *n; j >= 1; --j) { temp = x[j]; k = kk; i__1 = j + 1; for (i__ = *n; i__ >= i__1; --i__) { temp -= ap[k] * x[i__]; --k; /* L130: */ } if (nounit) { temp /= ap[kk - *n + j]; } x[j] = temp; kk -= *n - j + 1; /* L140: */ } } else { kx += (*n - 1) * *incx; jx = kx; for (j = *n; j >= 1; --j) { temp = x[jx]; ix = kx; i__1 = kk - (*n - (j + 1)); for (k = kk; k >= i__1; --k) { temp -= ap[k] * x[ix]; ix -= *incx; /* L150: */ } if (nounit) { temp /= ap[kk - *n + j]; } x[jx] = temp; jx -= *incx; kk -= *n - j + 1; /* L160: */ } } } } return 0; /* End of DTPSV . */ } /* dtpsv_ */ #endif /* End of BLAS routines */ /* Abstract BLAS interface */ void nlBlasResetStats(NLBlas_t blas) { blas->start_time = nlCurrentTime(); blas->flops = 0; blas->used_ram[0] = 0; blas->used_ram[1] = 0; blas->max_used_ram[0] = 0; blas->max_used_ram[1] = 0; blas->sq_rnorm = 0.0; blas->sq_bnorm = 0.0; } double nlBlasGFlops(NLBlas_t blas) { double now = nlCurrentTime(); double elapsed_time = now - blas->start_time; return (NLdouble)(blas->flops) / (elapsed_time * 1e9); } NLulong nlBlasUsedRam(NLBlas_t blas, NLmemoryType type) { return blas->used_ram[type]; } NLulong nlBlasMaxUsedRam(NLBlas_t blas, NLmemoryType type) { return blas->max_used_ram[type]; } NLboolean nlBlasHasUnifiedMemory(NLBlas_t blas) { return blas->has_unified_memory; } static void* host_blas_malloc( NLBlas_t blas, NLmemoryType type, size_t size ) { nl_arg_used(type); blas->used_ram[type] += (NLulong)size; blas->max_used_ram[type] = MAX( blas->max_used_ram[type],blas->used_ram[type] ); return malloc(size); } static void host_blas_free( NLBlas_t blas, NLmemoryType type, size_t size, void* ptr ) { nl_arg_used(type); blas->used_ram[type] -= (NLulong)size; free(ptr); } static void host_blas_memcpy( NLBlas_t blas, void* to, NLmemoryType to_type, void* from, NLmemoryType from_type, size_t size ) { nl_arg_used(blas); nl_arg_used(to_type); nl_arg_used(from_type); memcpy(to,from,size); } static void host_blas_dcopy( NLBlas_t blas, int n, const double *x, int incx, double *y, int incy ) { nl_arg_used(blas); NL_FORTRAN_WRAP(dcopy)(&n,(double*)x,&incx,y,&incy); } static double host_blas_ddot( NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy ) { blas->flops += (NLulong)(2*n); return NL_FORTRAN_WRAP(ddot)(&n,(double*)x,&incx,(double*)y,&incy); } static double host_blas_dnrm2( NLBlas_t blas, int n, const double *x, int incx ) { blas->flops += (NLulong)(2*n); return NL_FORTRAN_WRAP(dnrm2)(&n,(double*)x,&incx); } static void host_blas_daxpy( NLBlas_t blas, int n, double a, const double *x, int incx, double *y, int incy ) { blas->flops += (NLulong)(2*n); NL_FORTRAN_WRAP(daxpy)(&n,&a,(double*)x,&incx,y,&incy); } static void host_blas_dscal( NLBlas_t blas, int n, double a, double *x, int incx ) { blas->flops += (NLulong)n; NL_FORTRAN_WRAP(dscal)(&n,&a,x,&incx); } static void host_blas_dgemv( NLBlas_t blas, MatrixTranspose trans, int m, 
int n, double alpha, const double *A, int ldA, const double *x, int incx, double beta, double *y, int incy ) { static const char *T[3] = { "N", "T", 0 }; nl_arg_used(blas); NL_FORTRAN_WRAP(dgemv)( T[(int)trans],&m,&n,&alpha,(double*)A,&ldA, (double*)x,&incx,&beta,y,&incy ); /* TODO: update flops */ } static void host_blas_dtpsv( NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans, MatrixUnitTriangular diag, int n, const double *AP, double *x, int incx ) { static const char *UL[2] = { "U", "L" }; static const char *T[3] = { "N", "T", 0 }; static const char *D[2] = { "U", "N" }; nl_arg_used(blas); NL_FORTRAN_WRAP(dtpsv)( UL[(int)uplo],T[(int)trans],D[(int)diag],&n,(double*)AP,x,&incx ); /* TODO: update flops */ } NLBlas_t nlHostBlas() { static NLboolean initialized = NL_FALSE; static struct NLBlas blas; if(!initialized) { memset(&blas, 0, sizeof(blas)); blas.has_unified_memory = NL_TRUE; blas.Malloc = host_blas_malloc; blas.Free = host_blas_free; blas.Memcpy = host_blas_memcpy; blas.Dcopy = host_blas_dcopy; blas.Ddot = host_blas_ddot; blas.Dnrm2 = host_blas_dnrm2; blas.Daxpy = host_blas_daxpy; blas.Dscal = host_blas_dscal; blas.Dgemv = host_blas_dgemv; blas.Dtpsv = host_blas_dtpsv; nlBlasResetStats(&blas); initialized = NL_TRUE; } return &blas; } /******* extracted from nl_iterative_solvers.c *******/ /* Solvers */ /* * The implementation of the solvers is inspired by * the lsolver library, by Christian Badura, available from: * http://www.mathematik.uni-freiburg.de * /IAM/Research/projectskr/lin_solver/ * * About the Conjugate Gradient, details can be found in: * Ashby, Manteuffel, Saylor * A taxononmy for conjugate gradient methods * SIAM J Numer Anal 27, 1542-1568 (1990) * * This version is completely abstract, the same code can be used for * CPU/GPU, dense matrix / sparse matrix etc... * Abstraction is realized through: * - Abstract blas interface (NLBlas_t), that can implement BLAS * operations on the CPU or on the GPU. * - Abstract matrix interface (NLMatrix), that can implement different * versions of matrix x vector product (CPU/GPU, sparse/dense ...) 
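 *
 * Usage sketch (illustration, not from the original sources; it only uses
 * functions and constants declared elsewhere in this file, and assumes a
 * hypothetical caller that owns a square dynamic sparse matrix M, as
 * required by nlNewJacobiPreconditioner(), and host vectors b and x of
 * size M->n; eps and the iteration counts are arbitrary example values).
 * The preconditioner is optional: passing NULL as P selects the
 * unpreconditioned solver.
 *
 *   NLBlas_t blas  = nlHostBlas();
 *   NLMatrix P     = nlNewJacobiPreconditioner(M);
 *   NLuint   iters = nlSolveSystemIterative(
 *       blas, M, P, b, x, NL_CG, 1e-6, 1000, 0
 *   );
 *   nlDeleteMatrix(P);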
*/ static NLuint nlSolveSystem_CG( NLBlas_t blas, NLMatrix M, NLdouble* b, NLdouble* x, double eps, NLuint max_iter ) { NLint N = (NLint)M->m; NLdouble *g = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *p = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLuint its=0; NLdouble t, tau, sig, rho, gam; NLdouble b_square=blas->Ddot(blas,N,b,1,b,1); NLdouble err=eps*eps*b_square; NLdouble curr_err; nlMultMatrixVector(M,x,g); blas->Daxpy(blas,N,-1.,b,1,g,1); blas->Dscal(blas,N,-1.,g,1); blas->Dcopy(blas,N,g,1,r,1); curr_err = blas->Ddot(blas,N,g,1,g,1); while ( curr_err >err && its < max_iter) { if(nlCurrentContext != NULL) { if(nlCurrentContext->progress_func != NULL) { nlCurrentContext->progress_func(its, max_iter, curr_err, err); } if(nlCurrentContext->verbose && !(its % 100)) { nl_printf ( "%d : %.10e -- %.10e\n", its, curr_err, err ); } } nlMultMatrixVector(M,r,p); rho=blas->Ddot(blas,N,p,1,p,1); sig=blas->Ddot(blas,N,r,1,p,1); tau=blas->Ddot(blas,N,g,1,r,1); t=tau/sig; blas->Daxpy(blas,N,t,r,1,x,1); blas->Daxpy(blas,N,-t,p,1,g,1); gam=(t*t*rho-tau)/tau; blas->Dscal(blas,N,gam,r,1); blas->Daxpy(blas,N,1.,g,1,r,1); ++its; curr_err = blas->Ddot(blas,N,g,1,g,1); } NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, g); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, p); blas->sq_bnorm = b_square; blas->sq_rnorm = curr_err; return its; } static NLuint nlSolveSystem_PRE_CG( NLBlas_t blas, NLMatrix M, NLMatrix P, NLdouble* b, NLdouble* x, double eps, NLuint max_iter ) { NLint N = (NLint)M->n; NLdouble* r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble* d = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble* h = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *Ad = h; NLuint its=0; NLdouble rh, alpha, beta; NLdouble b_square = blas->Ddot(blas,N,b,1,b,1); NLdouble err=eps*eps*b_square; NLdouble curr_err; nlMultMatrixVector(M,x,r); blas->Daxpy(blas,N,-1.,b,1,r,1); nlMultMatrixVector(P,r,d); blas->Dcopy(blas,N,d,1,h,1); rh=blas->Ddot(blas,N,r,1,h,1); curr_err = blas->Ddot(blas,N,r,1,r,1); while ( curr_err >err && its < max_iter) { if(nlCurrentContext != NULL) { if(nlCurrentContext->progress_func != NULL) { nlCurrentContext->progress_func(its, max_iter, curr_err, err); } if( nlCurrentContext->verbose && !(its % 100)) { nl_printf ( "%d : %.10e -- %.10e\n", its, curr_err, err ); } } nlMultMatrixVector(M,d,Ad); alpha=rh/blas->Ddot(blas,N,d,1,Ad,1); blas->Daxpy(blas,N,-alpha,d,1,x,1); blas->Daxpy(blas,N,-alpha,Ad,1,r,1); nlMultMatrixVector(P,r,h); beta=1./rh; rh=blas->Ddot(blas,N,r,1,h,1); beta*=rh; blas->Dscal(blas,N,beta,d,1); blas->Daxpy(blas,N,1.,h,1,d,1); ++its; curr_err = blas->Ddot(blas,N,r,1,r,1); } NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, d); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, h); blas->sq_bnorm = b_square; blas->sq_rnorm = curr_err; return its; } static NLuint nlSolveSystem_BICGSTAB( NLBlas_t blas, NLMatrix M, NLdouble* b, NLdouble* x, double eps, NLuint max_iter ) { NLint N = (NLint)M->n; NLdouble *rT = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *d = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *h = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *u = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *Ad = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *t = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *s = h; NLdouble rTh, rTAd, rTr, alpha, beta, omega, st, tt; NLuint its=0; NLdouble b_square = 
blas->Ddot(blas,N,b,1,b,1); NLdouble err=eps*eps*b_square; NLdouble *r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); nlMultMatrixVector(M,x,r); blas->Daxpy(blas,N,-1.,b,1,r,1); blas->Dcopy(blas,N,r,1,d,1); blas->Dcopy(blas,N,d,1,h,1); blas->Dcopy(blas,N,h,1,rT,1); nl_assert( blas->Ddot(blas,N,rT,1,rT,1)>1e-40 ); rTh=blas->Ddot(blas,N,rT,1,h,1); rTr=blas->Ddot(blas,N,r,1,r,1); while ( rTr>err && its < max_iter) { if(nlCurrentContext != NULL) { if(nlCurrentContext->progress_func != NULL) { nlCurrentContext->progress_func(its, max_iter, rTr, err); } if( (nlCurrentContext->verbose) && !(its % 100)) { nl_printf ( "%d : %.10e -- %.10e\n", its, rTr, err ); } } nlMultMatrixVector(M,d,Ad); rTAd=blas->Ddot(blas,N,rT,1,Ad,1); nl_assert( fabs(rTAd)>1e-40 ); alpha=rTh/rTAd; blas->Daxpy(blas,N,-alpha,Ad,1,r,1); blas->Dcopy(blas,N,h,1,s,1); blas->Daxpy(blas,N,-alpha,Ad,1,s,1); nlMultMatrixVector(M,s,t); blas->Daxpy(blas,N,1.,t,1,u,1); blas->Dscal(blas,N,alpha,u,1); st=blas->Ddot(blas,N,s,1,t,1); tt=blas->Ddot(blas,N,t,1,t,1); if ( fabs(st)<1e-40 || fabs(tt)<1e-40 ) { omega = 0.; } else { omega = st/tt; } blas->Daxpy(blas,N,-omega,t,1,r,1); blas->Daxpy(blas,N,-alpha,d,1,x,1); blas->Daxpy(blas,N,-omega,s,1,x,1); blas->Dcopy(blas,N,s,1,h,1); blas->Daxpy(blas,N,-omega,t,1,h,1); beta=(alpha/omega)/rTh; rTh=blas->Ddot(blas,N,rT,1,h,1); beta*=rTh; blas->Dscal(blas,N,beta,d,1); blas->Daxpy(blas,N,1.,h,1,d,1); blas->Daxpy(blas,N,-beta*omega,Ad,1,d,1); rTr=blas->Ddot(blas,N,r,1,r,1); ++its; } NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, rT); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, d); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, h); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, u); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, Ad); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, t); blas->sq_bnorm = b_square; blas->sq_rnorm = rTr; return its; } static NLuint nlSolveSystem_PRE_BICGSTAB( NLBlas_t blas, NLMatrix M, NLMatrix P, NLdouble* b, NLdouble* x, double eps, NLuint max_iter ) { NLint N = (NLint)M->n; NLdouble *rT = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *d = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *h = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *u = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *Sd = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *t = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *aux = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); NLdouble *s = h; NLdouble rTh, rTSd, rTr, alpha, beta, omega, st, tt; NLuint its=0; NLdouble b_square = blas->Ddot(blas,N,b,1,b,1); NLdouble err = eps*eps*b_square; NLdouble *r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N); nlMultMatrixVector(M,x,r); blas->Daxpy(blas,N,-1.,b,1,r,1); nlMultMatrixVector(P,r,d); blas->Dcopy(blas,N,d,1,h,1); blas->Dcopy(blas,N,h,1,rT,1); nl_assert( blas->Ddot(blas,N,rT,1,rT,1)>1e-40 ); rTh=blas->Ddot(blas,N,rT,1,h,1); rTr=blas->Ddot(blas,N,r,1,r,1); while ( rTr>err && its < max_iter) { if(nlCurrentContext != NULL) { if(nlCurrentContext->progress_func != NULL) { nlCurrentContext->progress_func(its, max_iter, rTr, err); } if( (nlCurrentContext->verbose) && !(its % 100)) { nl_printf ( "%d : %.10e -- %.10e\n", its, rTr, err ); } } nlMultMatrixVector(M,d,aux); nlMultMatrixVector(P,aux,Sd); rTSd=blas->Ddot(blas,N,rT,1,Sd,1); nl_assert( fabs(rTSd)>1e-40 ); alpha=rTh/rTSd; blas->Daxpy(blas,N,-alpha,aux,1,r,1); blas->Dcopy(blas,N,h,1,s,1); blas->Daxpy(blas,N,-alpha,Sd,1,s,1); nlMultMatrixVector(M,s,aux); nlMultMatrixVector(P,aux,t); blas->Daxpy(blas,N,1.,t,1,u,1); 
blas->Dscal(blas,N,alpha,u,1); st=blas->Ddot(blas,N,s,1,t,1); tt=blas->Ddot(blas,N,t,1,t,1); if ( fabs(st)<1e-40 || fabs(tt)<1e-40 ) { omega = 0.; } else { omega = st/tt; } blas->Daxpy(blas,N,-omega,aux,1,r,1); blas->Daxpy(blas,N,-alpha,d,1,x,1); blas->Daxpy(blas,N,-omega,s,1,x,1); blas->Dcopy(blas,N,s,1,h,1); blas->Daxpy(blas,N,-omega,t,1,h,1); beta=(alpha/omega)/rTh; rTh=blas->Ddot(blas,N,rT,1,h,1); beta*=rTh; blas->Dscal(blas,N,beta,d,1); blas->Daxpy(blas,N,1.,h,1,d,1); blas->Daxpy(blas,N,-beta*omega,Sd,1,d,1); rTr=blas->Ddot(blas,N,r,1,r,1); ++its; } NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, rT); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, d); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, h); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, u); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, Sd); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, t); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, aux); blas->sq_bnorm = b_square; blas->sq_rnorm = rTr; return its; } /* * Note: this one cannot be executed on device (GPU) * because it directly manipulates the vectors. */ static NLuint nlSolveSystem_GMRES( NLBlas_t blas, NLMatrix M, NLdouble* b, NLdouble* x, double eps, NLuint max_iter, NLuint inner_iter ) { NLint n = (NLint)M->n; NLint m = (NLint)inner_iter; typedef NLdouble *NLdoubleP; NLdouble *V = NL_NEW_ARRAY(NLdouble, n*(m+1) ); NLdouble *U = NL_NEW_ARRAY(NLdouble, m*(m+1)/2 ); NLdouble *r = NL_NEW_ARRAY(NLdouble, n ); NLdouble *y = NL_NEW_ARRAY(NLdouble, m+1 ); NLdouble *c = NL_NEW_ARRAY(NLdouble, m ); NLdouble *s = NL_NEW_ARRAY(NLdouble, m ); NLdouble **v = NL_NEW_ARRAY(NLdoubleP, m+1 ); NLint i, j, io, uij, u0j; NLint its = -1; NLdouble beta, h, rd, dd, nrm2b; /* * The way it is written, this routine will not * work on the GPU since it directly modifies the * vectors. 
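 * It implements restarted GMRES(m): the inner loop builds an Arnoldi basis
 * of at most m = inner_iter vectors (the columns of V), maintains the
 * packed upper-triangular factor U together with the Givens rotation
 * coefficients c and s, and the outer loop restarts from the current
 * iterate until the residual estimate fabs(y[j]) falls below eps * ||b||.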
*/ nl_assert(nlBlasHasUnifiedMemory(blas)); for ( i=0; i<=m; ++i ){ v[i]=V+i*n; } nrm2b=blas->Dnrm2(blas,n,b,1); io=0; do { /* outer loop */ ++io; nlMultMatrixVector(M,x,r); blas->Daxpy(blas,n,-1.,b,1,r,1); beta=blas->Dnrm2(blas,n,r,1); blas->Dcopy(blas,n,r,1,v[0],1); blas->Dscal(blas,n,1./beta,v[0],1); y[0]=beta; j=0; uij=0; do { /* inner loop: j=0,...,m-1 */ u0j=uij; nlMultMatrixVector(M,v[j],v[j+1]); blas->Dgemv( blas,Transpose,n,j+1,1.,V,n,v[j+1],1,0.,U+u0j,1 ); blas->Dgemv( blas,NoTranspose,n,j+1,-1.,V,n,U+u0j,1,1.,v[j+1],1 ); h=blas->Dnrm2(blas,n,v[j+1],1); blas->Dscal(blas,n,1./h,v[j+1],1);
for (i=0; i<j; ++i) { /* apply the previous Givens rotations to the new column of U */ double tmp = c[i]*U[uij]-s[i]*U[uij+1]; U[uij+1] = s[i]*U[uij]+c[i]*U[uij+1]; U[uij] = tmp; ++uij; } { /* compute the new Givens rotation that eliminates h */ rd = U[uij]; dd = sqrt(rd*rd+h*h); c[j] = rd/dd; s[j] = -h/dd; U[uij] = dd; ++uij; } { /* rotate the right-hand side y (y[j+1] was zero) */ y[j+1] = s[j]*y[j]; y[j] = c[j]*y[j]; } ++j; } while ( j<m && fabs(y[j])>=eps*nrm2b );
{ /* minimize with respect to y */ blas->Dtpsv( blas, UpperTriangle, NoTranspose, NotUnitTriangular, j,U,y,1 ); /* correct X */ blas->Dgemv(blas,NoTranspose,n,j,-1.,V,n,y,1,1.,x,1); } } while ( fabs(y[j])>=eps*nrm2b && (m*(io-1)+j) < (NLint)max_iter); /* Count the inner iterations */ its = m*(io-1)+j; blas->sq_bnorm = nrm2b*nrm2b; blas->sq_rnorm = y[j]*y[j]; NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, V); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, U); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, r); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, y); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, c); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, s); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, v); return (NLuint)its; }
/* Main driver routine */ NLuint nlSolveSystemIterative( NLBlas_t blas, NLMatrix M, NLMatrix P, NLdouble* b_in, NLdouble* x_in, NLenum solver, double eps, NLuint max_iter, NLuint inner_iter ) { NLuint N = M->n; NLuint result=0; NLdouble rnorm=0.0; NLdouble bnorm=0.0; double* b = b_in; double* x = x_in; nl_assert(M->m == M->n); if(!nlBlasHasUnifiedMemory(blas)) { b = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n); blas->Memcpy( blas, b, NL_DEVICE_MEMORY, b_in, NL_HOST_MEMORY, (size_t)N*sizeof(double) ); x = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n); blas->Memcpy( blas, x, NL_DEVICE_MEMORY, x_in, NL_HOST_MEMORY, (size_t)N*sizeof(double) ); } switch(solver) { case NL_CG: if(P == NULL) { result = nlSolveSystem_CG(blas,M,b,x,eps,max_iter); } else { result = nlSolveSystem_PRE_CG(blas,M,P,b,x,eps,max_iter); } break; case NL_BICGSTAB: if(P == NULL) { result = nlSolveSystem_BICGSTAB(blas,M,b,x,eps,max_iter); } else { result = nlSolveSystem_PRE_BICGSTAB(blas,M,P,b,x,eps,max_iter); } break; case NL_GMRES: result = nlSolveSystem_GMRES(blas,M,b,x,eps,max_iter,inner_iter); break; default: nl_assert_not_reached; }
/* Get residual norm and rhs norm from BLAS context */ if(nlCurrentContext != NULL) { bnorm = sqrt(blas->sq_bnorm); rnorm = sqrt(blas->sq_rnorm); if(bnorm == 0.0) { nlCurrentContext->error = rnorm; if(nlCurrentContext->verbose) { nl_printf("in OpenNL : ||Ax-b|| = %e\n",nlCurrentContext->error); } } else { nlCurrentContext->error = rnorm/bnorm; if(nlCurrentContext->verbose) { nl_printf("in OpenNL : ||Ax-b||/||b|| = %e\n", nlCurrentContext->error ); } } nlCurrentContext->used_iterations = result; } if(!nlBlasHasUnifiedMemory(blas)) { blas->Memcpy( blas, x_in, NL_HOST_MEMORY, x, NL_DEVICE_MEMORY, (size_t)N*sizeof(double) ); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n, x); NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n, b); } return result; }
/******* extracted from nl_preconditioners.c *******/ typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLdouble* diag_inv; } NLJacobiPreconditioner; static void nlJacobiPreconditionerDestroy(NLJacobiPreconditioner* M) { NL_DELETE_ARRAY(M->diag_inv); } static
void nlJacobiPreconditionerMult( NLJacobiPreconditioner* M, const double* x, double* y ) { NLuint i; for(i=0; i<M->n; ++i) { y[i] = x[i] * M->diag_inv[i]; } nlHostBlas()->flops += (NLulong)(M->n); } NLMatrix nlNewJacobiPreconditioner(NLMatrix M_in) { NLSparseMatrix* M = NULL; NLJacobiPreconditioner* result = NULL; NLuint i; nl_assert(M_in->type == NL_MATRIX_SPARSE_DYNAMIC); nl_assert(M_in->m == M_in->n); M = (NLSparseMatrix*)M_in; result = NL_NEW(NLJacobiPreconditioner); result->m = M->m; result->n = M->n; result->type = NL_MATRIX_OTHER; result->destroy_func = (NLDestroyMatrixFunc)nlJacobiPreconditionerDestroy; result->mult_func = (NLMultMatrixVectorFunc)nlJacobiPreconditionerMult; result->diag_inv = NL_NEW_ARRAY(double, M->n); for(i=0; i<M->n; ++i) { result->diag_inv[i] = (M->diag[i] == 0.0) ? 1.0 : 1.0/M->diag[i]; } return (NLMatrix)result; }
typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; NLSparseMatrix* M; double omega; NLdouble* work; } NLSSORPreconditioner; static void nlSSORPreconditionerDestroy(NLSSORPreconditioner* M) { NL_DELETE_ARRAY(M->work); }
static void nlSparseMatrixMultLowerInverse( NLSparseMatrix* A, const NLdouble* x, NLdouble* y, double omega ) { NLuint n = A->n; NLdouble* diag = A->diag; NLuint i; NLuint ij; NLCoeff* c = NULL; NLdouble S; nl_assert(A->storage & NL_MATRIX_STORE_SYMMETRIC); nl_assert(A->storage & NL_MATRIX_STORE_ROWS); for(i=0; i<n; i++) { NLRowColumn* Ri = &(A->row[i]); S = 0; for(ij=0; ij < Ri->size; ij++) { c = &(Ri->coeff[ij]); nl_parano_assert(c->index <= i); if(c->index != i) { S += c->value * y[c->index]; } } nlHostBlas()->flops += (NLulong)(2*Ri->size); y[i] = (x[i] - S) * omega / diag[i]; } nlHostBlas()->flops += (NLulong)(n*3); }
static void nlSparseMatrixMultUpperInverse( NLSparseMatrix* A, const NLdouble* x, NLdouble* y, NLdouble omega ) { NLuint n = A->n; NLdouble* diag = A->diag; NLint i; NLuint ij; NLCoeff* c = NULL; NLdouble S; nl_assert(A->storage & NL_MATRIX_STORE_SYMMETRIC); nl_assert(A->storage & NL_MATRIX_STORE_COLUMNS); for(i=(NLint)(n-1); i>=0; i--) { NLRowColumn* Ci = &(A->column[i]); S = 0; for(ij=0; ij < Ci->size; ij++) { c = &(Ci->coeff[ij]); nl_parano_assert(c->index >= i); if((NLint)(c->index) != i) { S += c->value * y[c->index]; } } nlHostBlas()->flops += (NLulong)(2*Ci->size); y[i] = (x[i] - S) * omega / diag[i]; } nlHostBlas()->flops += (NLulong)(n*3); }
static void nlSSORPreconditionerMult( NLSSORPreconditioner* P, const double* x, double* y ) { NLdouble* diag = P->M->diag; NLuint i; nlSparseMatrixMultLowerInverse( P->M, x, P->work, P->omega ); for(i=0; i<P->n; i++) { P->work[i] *= (diag[i] / P->omega); } nlHostBlas()->flops += (NLulong)(P->n); nlSparseMatrixMultUpperInverse( P->M, P->work, y, P->omega ); nlHostBlas()->Dscal(nlHostBlas(),(NLint)P->n, 2.0 - P->omega, y, 1); }
NLMatrix nlNewSSORPreconditioner(NLMatrix M_in, double omega) { NLSparseMatrix* M = NULL; NLSSORPreconditioner* result = NULL; nl_assert(M_in->type == NL_MATRIX_SPARSE_DYNAMIC); nl_assert(M_in->m == M_in->n); M = (NLSparseMatrix*)M_in; result = NL_NEW(NLSSORPreconditioner); result->m = M->m; result->n = M->n; result->type = NL_MATRIX_OTHER; result->destroy_func = (NLDestroyMatrixFunc)nlSSORPreconditionerDestroy; result->mult_func = (NLMultMatrixVectorFunc)nlSSORPreconditionerMult; result->M = M; result->work = NL_NEW_ARRAY(NLdouble, result->n); result->omega = omega; return (NLMatrix)result; } /******* extracted from nl_superlu.c *******/ #ifdef NL_OS_UNIX # ifdef NL_OS_APPLE # define SUPERLU_LIB_NAME "libsuperlu_5.dylib" #
else # define SUPERLU_LIB_NAME "libsuperlu.so" # endif #else # define SUPERLU_LIB_NAME "libsuperlu.xxx" #endif typedef enum { SLU_NC, /* column-wise, no supernode */ SLU_NCP, /* column-wise, column-permuted, no supernode (The consecutive columns of nonzeros, after permutation, may not be stored contiguously.) */ SLU_NR, /* row-wize, no supernode */ SLU_SC, /* column-wise, supernode */ SLU_SCP, /* supernode, column-wise, permuted */ SLU_SR, /* row-wise, supernode */ SLU_DN, /* Fortran style column-wise storage for dense matrix */ SLU_NR_loc /* distributed compressed row format */ } Stype_t; typedef enum { SLU_S, /* single */ SLU_D, /* double */ SLU_C, /* single complex */ SLU_Z /* double complex */ } Dtype_t; typedef enum { SLU_GE, /* general */ SLU_TRLU, /* lower triangular, unit diagonal */ SLU_TRUU, /* upper triangular, unit diagonal */ SLU_TRL, /* lower triangular */ SLU_TRU, /* upper triangular */ SLU_SYL, /* symmetric, store lower half */ SLU_SYU, /* symmetric, store upper half */ SLU_HEL, /* Hermitian, store lower half */ SLU_HEU /* Hermitian, store upper half */ } Mtype_t; typedef int int_t; typedef struct { int_t nnz; /* number of nonzeros in the matrix */ void *nzval; /* pointer to array of nonzero values, packed by raw */ int_t *colind; /* pointer to array of columns indices of the nonzeros */ int_t *rowptr; /* pointer to array of beginning of rows in nzval[] and colind[] */ /* Note: Zero-based indexing is used; rowptr[] has nrow+1 entries, the last one pointing beyond the last row, so that rowptr[nrow] = nnz. */ } NRformat; typedef struct { Stype_t Stype; /* Storage type: interprets the storage structure pointed to by *Store. */ Dtype_t Dtype; /* Data type. */ Mtype_t Mtype; /* Matrix type: describes the mathematical property of the matrix. 
*/ int_t nrow; /* number of rows */ int_t ncol; /* number of columns */ void *Store; /* pointer to the actual storage of the matrix */ } SuperMatrix; /* Stype == SLU_DN */ typedef struct { int_t lda; /* leading dimension */ void *nzval; /* array of size lda*ncol to represent a dense matrix */ } DNformat; typedef enum {NO, YES} yes_no_t; typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t; typedef enum {NOROWPERM, LargeDiag, MY_PERMR} rowperm_t; typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD, METIS_AT_PLUS_A, PARMETIS, ZOLTAN, MY_PERMC} colperm_t; typedef enum {NOTRANS, TRANS, CONJ} trans_t; typedef enum {NOEQUIL, ROW, COL, BOTH} DiagScale_t; typedef enum {NOREFINE, SLU_SINGLE=1, SLU_DOUBLE, SLU_EXTRA} IterRefine_t; typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; typedef enum {HEAD, TAIL} stack_end_t; typedef enum {SYSTEM, USER} LU_space_t; typedef enum {ONE_NORM, TWO_NORM, INF_NORM} norm_t; typedef enum {SILU, SMILU_1, SMILU_2, SMILU_3} milu_t; typedef struct { fact_t Fact; yes_no_t Equil; colperm_t ColPerm; trans_t Trans; IterRefine_t IterRefine; double DiagPivotThresh; yes_no_t SymmetricMode; yes_no_t PivotGrowth; yes_no_t ConditionNumber; rowperm_t RowPerm; int ILU_DropRule; double ILU_DropTol; /* threshold for dropping */ double ILU_FillFactor; /* gamma in the secondary dropping */ norm_t ILU_Norm; /* infinity-norm, 1-norm, or 2-norm */ double ILU_FillTol; /* threshold for zero pivot perturbation */ milu_t ILU_MILU; double ILU_MILU_Dim; /* Dimension of PDE (if available) */ yes_no_t ParSymbFact; yes_no_t ReplaceTinyPivot; /* used in SuperLU_DIST */ yes_no_t SolveInitialized; yes_no_t RefineInitialized; yes_no_t PrintStat; int nnzL, nnzU; /* used to store nnzs for now */ int num_lookaheads; /* num of levels in look-ahead */ yes_no_t lookahead_etree; /* use etree computed from the serial symbolic factorization */ yes_no_t SymPattern; /* symmetric factorization */ } superlu_options_t; typedef void* superlu_options_ptr; typedef float flops_t; typedef unsigned char Logical; typedef struct { int *panel_histo; /* histogram of panel size distribution */ double *utime; /* running time at various phases */ flops_t *ops; /* operation count at various phases */ int TinyPivots; /* number of tiny pivots */ int RefineSteps; /* number of iterative refinement steps */ int expansions; /* number of memory expansions (SuperLU4) */ } SuperLUStat_t; /*! 
\brief Headers for 4 types of dynamatically managed memory */ typedef struct e_node { int size; /* length of the memory that has been used */ void *mem; /* pointer to the new malloc'd store */ } ExpHeader; typedef struct { int size; int used; int top1; /* grow upward, relative to &array[0] */ int top2; /* grow downward */ void *array; } LU_stack_t; typedef struct { int *xsup; /* supernode and column mapping */ int *supno; int *lsub; /* compressed L subscripts */ int *xlsub; void *lusup; /* L supernodes */ int *xlusup; void *ucol; /* U columns */ int *usub; int *xusub; int nzlmax; /* current max size of lsub */ int nzumax; /* " " " ucol */ int nzlumax; /* " " " lusup */ int n; /* number of columns in the matrix */ LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */ int num_expansions; ExpHeader *expanders; /* Array of pointers to 4 types of memory */ LU_stack_t stack; /* use user supplied memory */ } GlobalLU_t; typedef void (*FUNPTR_set_default_options)(superlu_options_ptr options); typedef void (*FUNPTR_ilu_set_default_options)(superlu_options_ptr options); typedef void (*FUNPTR_StatInit)(SuperLUStat_t *); typedef void (*FUNPTR_StatFree)(SuperLUStat_t *); typedef void (*FUNPTR_dCreate_CompCol_Matrix)( SuperMatrix *, int, int, int, const double *, const int *, const int *, Stype_t, Dtype_t, Mtype_t); typedef void (*FUNPTR_dCreate_Dense_Matrix)( SuperMatrix *, int, int, const double *, int, Stype_t, Dtype_t, Mtype_t); typedef void (*FUNPTR_Destroy_SuperNode_Matrix)(SuperMatrix *); typedef void (*FUNPTR_Destroy_CompCol_Matrix)(SuperMatrix *); typedef void (*FUNPTR_Destroy_CompCol_Permuted)(SuperMatrix *); typedef void (*FUNPTR_Destroy_SuperMatrix_Store)(SuperMatrix *); typedef void (*FUNPTR_dgssv)( superlu_options_ptr, SuperMatrix *, int *, int *, SuperMatrix *, SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int * ); typedef void (*FUNPTR_dgstrs)( trans_t, SuperMatrix *, SuperMatrix *, int *, int *, SuperMatrix *, SuperLUStat_t*, int * ); typedef void (*FUNPTR_get_perm_c)(int, SuperMatrix *, int *); typedef void (*FUNPTR_sp_preorder)( superlu_options_t *, SuperMatrix*, int*, int*, SuperMatrix* ); typedef int (*FUNPTR_sp_ienv)(int); typedef int (*FUNPTR_input_error)(const char *, int *); typedef void (*FUNPTR_dgstrf) (superlu_options_t *options, SuperMatrix *A, int relax, int panel_size, int *etree, void *work, int lwork, int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, GlobalLU_t *Glu, /* persistent to facilitate multiple factorizations */ SuperLUStat_t *stat, int *info ); typedef struct { FUNPTR_set_default_options set_default_options; FUNPTR_ilu_set_default_options ilu_set_default_options; FUNPTR_StatInit StatInit; FUNPTR_StatFree StatFree; FUNPTR_dCreate_CompCol_Matrix dCreate_CompCol_Matrix; FUNPTR_dCreate_Dense_Matrix dCreate_Dense_Matrix; FUNPTR_Destroy_SuperNode_Matrix Destroy_SuperNode_Matrix; FUNPTR_Destroy_CompCol_Matrix Destroy_CompCol_Matrix; FUNPTR_Destroy_CompCol_Permuted Destroy_CompCol_Permuted; FUNPTR_Destroy_SuperMatrix_Store Destroy_SuperMatrix_Store; FUNPTR_dgssv dgssv; FUNPTR_dgstrs dgstrs; FUNPTR_get_perm_c get_perm_c; FUNPTR_sp_preorder sp_preorder; FUNPTR_sp_ienv sp_ienv; FUNPTR_dgstrf dgstrf; FUNPTR_input_error input_error; NLdll DLL_handle; } SuperLUContext; static SuperLUContext* SuperLU() { static SuperLUContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_SUPERLU() { return SuperLU()->DLL_handle != NULL && 
SuperLU()->set_default_options != NULL && SuperLU()->ilu_set_default_options != NULL && SuperLU()->StatInit != NULL && SuperLU()->StatFree != NULL && SuperLU()->dCreate_CompCol_Matrix != NULL && SuperLU()->dCreate_Dense_Matrix != NULL && SuperLU()->Destroy_SuperNode_Matrix != NULL && SuperLU()->Destroy_CompCol_Matrix != NULL && SuperLU()->Destroy_CompCol_Permuted != NULL && SuperLU()->Destroy_SuperMatrix_Store != NULL && SuperLU()->dgssv != NULL && SuperLU()->dgstrs != NULL && SuperLU()->get_perm_c != NULL && SuperLU()->sp_preorder != NULL && SuperLU()->sp_ienv != NULL && SuperLU()->dgstrf != NULL && SuperLU()->input_error != NULL; } static void nlTerminateExtension_SUPERLU(void) { if(SuperLU()->DLL_handle != NULL) { nlCloseDLL(SuperLU()->DLL_handle); SuperLU()->DLL_handle = NULL; } } #define find_superlu_func(name) \ if( \ ( \ SuperLU()->name = \ (FUNPTR_##name)nlFindFunction(SuperLU()->DLL_handle,#name) \ ) == NULL \ ) { \ nlError("nlInitExtension_SUPERLU","function not found"); \ nlError("nlInitExtension_SUPERLU",#name); \ return NL_FALSE; \ } NLboolean nlInitExtension_SUPERLU(void) { NLenum flags = NL_LINK_NOW | NL_LINK_USE_FALLBACK; if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(SuperLU()->DLL_handle != NULL) { return nlExtensionIsInitialized_SUPERLU(); } SuperLU()->DLL_handle = nlOpenDLL(SUPERLU_LIB_NAME, flags); if(SuperLU()->DLL_handle == NULL) { return NL_FALSE; } find_superlu_func(set_default_options); find_superlu_func(ilu_set_default_options); find_superlu_func(StatInit); find_superlu_func(StatFree); find_superlu_func(dCreate_CompCol_Matrix); find_superlu_func(dCreate_Dense_Matrix); find_superlu_func(Destroy_SuperNode_Matrix); find_superlu_func(Destroy_CompCol_Matrix); find_superlu_func(Destroy_CompCol_Permuted); find_superlu_func(Destroy_SuperMatrix_Store); find_superlu_func(dgssv); find_superlu_func(dgstrs); find_superlu_func(get_perm_c); find_superlu_func(sp_preorder); find_superlu_func(sp_ienv); find_superlu_func(dgstrf); find_superlu_func(input_error); atexit(nlTerminateExtension_SUPERLU); return NL_TRUE; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; SuperMatrix L; SuperMatrix U; int* perm_r; int* perm_c; trans_t trans; } NLSuperLUFactorizedMatrix; static void nlSuperLUFactorizedMatrixDestroy(NLSuperLUFactorizedMatrix* M) { SuperLU()->Destroy_SuperNode_Matrix(&M->L); SuperLU()->Destroy_CompCol_Matrix(&M->U); NL_DELETE_ARRAY(M->perm_r); NL_DELETE_ARRAY(M->perm_c); } static void nlSuperLUFactorizedMatrixMult( NLSuperLUFactorizedMatrix* M, const double* x, double* y ) { SuperMatrix B; SuperLUStat_t stat; int info = 0; NLuint i; /* Create vector */ SuperLU()->dCreate_Dense_Matrix( &B, (int)(M->n), 1, y, (int)(M->n), SLU_DN, /* Fortran-type column-wise storage */ SLU_D, /* doubles */ SLU_GE /* general */ ); /* copy rhs onto y (superLU matrix-vector product expects it here */ for(i = 0; i < M->n; i++){ y[i] = x[i]; } /* Call SuperLU triangular solve */ SuperLU()->StatInit(&stat) ; SuperLU()->dgstrs( M->trans, &M->L, &M->U, M->perm_c, M->perm_r, &B, &stat, &info ); SuperLU()->StatFree(&stat) ; /* Only the "store" structure needs to be * deallocated (the array has been allocated * by client code). */ SuperLU()->Destroy_SuperMatrix_Store(&B) ; } /* * Copied from SUPERLU/dgssv.c, removed call to linear solve. 
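 * The factorization (column ordering, elimination tree, dgstrf) is done once
 * here; the triangular solves are performed later, one per right-hand side,
 * by nlSuperLUFactorizedMatrixMult() above, which calls dgstrs().
 *
 * Usage sketch (illustration only; M, b and x are hypothetical caller data,
 * and the SUPERLU extension must have been initialized successfully with
 * nlInitExtension_SUPERLU()): factorize once, then each call to
 * nlMultMatrixVector(LU, b, x) overwrites x with the solution of M x = b.
 *
 *   NLMatrix LU = nlMatrixFactorize_SUPERLU(M, NL_SUPERLU_EXT);
 *   if(LU != NULL) {
 *       nlMultMatrixVector(LU, b, x);
 *       nlDeleteMatrix(LU);
 *   }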
*/ static void dgssv_factorize_only( superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U, SuperLUStat_t *stat, int *info, trans_t *trans ) { SuperMatrix *AA = NULL; /* A in SLU_NC format used by the factorization routine.*/ SuperMatrix AC; /* Matrix postmultiplied by Pc */ int lwork = 0, *etree, i; GlobalLU_t Glu; /* Not needed on return. */ /* Set default values for some parameters */ int panel_size; /* panel size */ int relax; /* no of columns in a relaxed snodes */ int permc_spec; nl_assert(A->Stype == SLU_NR || A->Stype == SLU_NC); *trans = NOTRANS; if ( options->Fact != DOFACT ) *info = -1; else if ( A->nrow != A->ncol || A->nrow < 0 || (A->Stype != SLU_NC && A->Stype != SLU_NR) || A->Dtype != SLU_D || A->Mtype != SLU_GE ) *info = -2; if ( *info != 0 ) { i = -(*info); SuperLU()->input_error("SUPERLU/OpenNL dgssv_factorize_only", &i); return; } /* Convert A to SLU_NC format when necessary. */ if ( A->Stype == SLU_NR ) { NRformat *Astore = (NRformat*)A->Store; AA = NL_NEW(SuperMatrix); SuperLU()->dCreate_CompCol_Matrix( AA, A->ncol, A->nrow, Astore->nnz, (double*)Astore->nzval, Astore->colind, Astore->rowptr, SLU_NC, A->Dtype, A->Mtype ); *trans = TRANS; } else { if ( A->Stype == SLU_NC ) AA = A; } nl_assert(AA != NULL); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A * permc_spec = MMD_ATA: minimum degree on structure of A'*A * permc_spec = COLAMD: approximate minimum degree column ordering * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] */ permc_spec = options->ColPerm; if ( permc_spec != MY_PERMC && options->Fact == DOFACT ) SuperLU()->get_perm_c(permc_spec, AA, perm_c); etree = NL_NEW_ARRAY(int,A->ncol); SuperLU()->sp_preorder(options, AA, perm_c, etree, &AC); panel_size = SuperLU()->sp_ienv(1); relax = SuperLU()->sp_ienv(2); SuperLU()->dgstrf(options, &AC, relax, panel_size, etree, NULL, lwork, perm_c, perm_r, L, U, &Glu, stat, info); NL_DELETE_ARRAY(etree); SuperLU()->Destroy_CompCol_Permuted(&AC); if ( A->Stype == SLU_NR ) { SuperLU()->Destroy_SuperMatrix_Store(AA); NL_DELETE(AA); } } NLMatrix nlMatrixFactorize_SUPERLU( NLMatrix M, NLenum solver ) { NLSuperLUFactorizedMatrix* LU = NULL; NLCRSMatrix* CRS = NULL; SuperMatrix superM; NLuint n = M->n; superlu_options_t options; SuperLUStat_t stat; NLint info = 0; /* status code */ nl_assert(M->m == M->n); if(M->type == NL_MATRIX_CRS) { CRS = (NLCRSMatrix*)M; } else if(M->type == NL_MATRIX_SPARSE_DYNAMIC) { CRS = (NLCRSMatrix*)nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)M); } nl_assert(!(CRS->symmetric_storage)); LU = NL_NEW(NLSuperLUFactorizedMatrix); LU->m = M->m; LU->n = M->n; LU->type = NL_MATRIX_OTHER; LU->destroy_func = (NLDestroyMatrixFunc)(nlSuperLUFactorizedMatrixDestroy); LU->mult_func = (NLMultMatrixVectorFunc)(nlSuperLUFactorizedMatrixMult); LU->perm_c = NL_NEW_ARRAY(int, n); LU->perm_r = NL_NEW_ARRAY(int, n); SuperLU()->dCreate_CompCol_Matrix( &superM, (int)n, (int)n, (int)nlCRSMatrixNNZ(CRS), CRS->val, (int*)CRS->colind, (int*)CRS->rowptr, SLU_NR, /* Row_wise, no supernode */ SLU_D, /* doubles */ CRS->symmetric_storage ? 
SLU_SYL : SLU_GE ); SuperLU()->set_default_options(&options); switch(solver) { case NL_SUPERLU_EXT: { options.ColPerm = NATURAL; } break; case NL_PERM_SUPERLU_EXT: { options.ColPerm = COLAMD; } break; case NL_SYMMETRIC_SUPERLU_EXT: { options.ColPerm = MMD_AT_PLUS_A; options.SymmetricMode = YES; } break; default: nl_assert_not_reached; } SuperLU()->StatInit(&stat); dgssv_factorize_only( &options, &superM, LU->perm_c, LU->perm_r, &LU->L, &LU->U, &stat, &info, &LU->trans ); SuperLU()->StatFree(&stat); /* * Only the "store" structure needs to be deallocated * (the arrays have been allocated by us, they are in CRS). */ SuperLU()->Destroy_SuperMatrix_Store(&superM); if((NLMatrix)CRS != M) { nlDeleteMatrix((NLMatrix)CRS); } if(info != 0) { NL_DELETE(LU); LU = NULL; } return (NLMatrix)LU; } /******* extracted from nl_cholmod.c *******/ #ifdef NL_OS_UNIX # ifdef NL_OS_APPLE # define CHOLMOD_LIB_NAME "libcholmod.dylib" # else # define CHOLMOD_LIB_NAME "libcholmod.so" # endif #else # define CHOLMOD_LIB_NAME "libcholmod.xxx" #endif /* Excerpt from cholmod_core.h */ /* A dense matrix in column-oriented form. It has no itype since it contains * no integers. Entry in row i and column j is located in x [i+j*d]. */ typedef struct cholmod_dense_struct { size_t nrow ; /* the matrix is nrow-by-ncol */ size_t ncol ; size_t nzmax ; /* maximum number of entries in the matrix */ size_t d ; /* leading dimension (d >= nrow must hold) */ void *x ; /* size nzmax or 2*nzmax, if present */ void *z ; /* size nzmax, if present */ int xtype ; /* pattern, real, complex, or zomplex */ int dtype ; /* x and z double or float */ } cholmod_dense ; /* A sparse matrix stored in compressed-column form. */ typedef struct cholmod_sparse_struct { size_t nrow ; /* the matrix is nrow-by-ncol */ size_t ncol ; size_t nzmax ; /* maximum number of entries in the matrix */ /* pointers to int or SuiteSparse_long: */ void *p ; /* p [0..ncol], the column pointers */ void *i ; /* i [0..nzmax-1], the row indices */ /* for unpacked matrices only: */ void *nz ; /* nz [0..ncol-1], the # of nonzeros in each col. In * packed form, the nonzero pattern of column j is in * A->i [A->p [j] ... A->p [j+1]-1]. In unpacked form, column j is in * A->i [A->p [j] ... A->p [j]+A->nz[j]-1] instead. In both cases, the * numerical values (if present) are in the corresponding locations in * the array x (or z if A->xtype is CHOLMOD_ZOMPLEX). */ /* pointers to double or float: */ void *x ; /* size nzmax or 2*nzmax, if present */ void *z ; /* size nzmax, if present */ int stype ; /* Describes what parts of the matrix are considered: * * 0: matrix is "unsymmetric": use both upper and lower triangular parts * (the matrix may actually be symmetric in pattern and value, but * both parts are explicitly stored and used). May be square or * rectangular. * >0: matrix is square and symmetric, use upper triangular part. * Entries in the lower triangular part are ignored. * <0: matrix is square and symmetric, use lower triangular part. * Entries in the upper triangular part are ignored. * * Note that stype>0 and stype<0 are different for cholmod_sparse and * cholmod_triplet. See the cholmod_triplet data structure for more * details. */ int itype ; /* CHOLMOD_INT: p, i, and nz are int. * CHOLMOD_INTLONG: p is SuiteSparse_long, * i and nz are int. 
* CHOLMOD_LONG: p, i, and nz are SuiteSparse_long */ int xtype ; /* pattern, real, complex, or zomplex */ int dtype ; /* x and z are double or float */ int sorted ; /* TRUE if columns are sorted, FALSE otherwise */ int packed ; /* TRUE if packed (nz ignored), FALSE if unpacked * (nz is required) */ } cholmod_sparse ; typedef void* cholmod_common_ptr; typedef cholmod_dense* cholmod_dense_ptr; typedef cholmod_sparse* cholmod_sparse_ptr; typedef void* cholmod_factor_ptr; typedef enum cholmod_xtype_enum { CHOLMOD_PATTERN =0, CHOLMOD_REAL =1, CHOLMOD_COMPLEX =2, CHOLMOD_ZOMPLEX =3 } cholmod_xtype; typedef enum cholmod_solve_type_enum { CHOLMOD_A =0, CHOLMOD_LDLt =1, CHOLMOD_LD =2, CHOLMOD_DLt =3, CHOLMOD_L =4, CHOLMOD_Lt =5, CHOLMOD_D =6, CHOLMOD_P =7, CHOLMOD_Pt =8 } cholmod_solve_type; typedef int cholmod_stype; typedef void (*FUNPTR_cholmod_start)(cholmod_common_ptr); typedef cholmod_sparse_ptr (*FUNPTR_cholmod_allocate_sparse)( size_t m, size_t n, size_t nnz, int sorted, int packed, int stype, int xtype, cholmod_common_ptr ); typedef cholmod_dense_ptr (*FUNPTR_cholmod_allocate_dense)( size_t m, size_t n, size_t d, int xtype, cholmod_common_ptr ); typedef cholmod_factor_ptr (*FUNPTR_cholmod_analyze)( cholmod_sparse_ptr A, cholmod_common_ptr ); typedef int (*FUNPTR_cholmod_factorize)( cholmod_sparse_ptr A, cholmod_factor_ptr L, cholmod_common_ptr ); typedef cholmod_dense_ptr (*FUNPTR_cholmod_solve)( int solve_type, cholmod_factor_ptr, cholmod_dense_ptr, cholmod_common_ptr ); typedef void (*FUNPTR_cholmod_free_factor)( cholmod_factor_ptr*, cholmod_common_ptr ); typedef void (*FUNPTR_cholmod_free_dense)( cholmod_dense_ptr*, cholmod_common_ptr ); typedef void (*FUNPTR_cholmod_free_sparse)( cholmod_sparse_ptr*, cholmod_common_ptr ); typedef void (*FUNPTR_cholmod_finish)(cholmod_common_ptr); typedef struct { char cholmod_common[16384]; FUNPTR_cholmod_start cholmod_start; FUNPTR_cholmod_allocate_sparse cholmod_allocate_sparse; FUNPTR_cholmod_allocate_dense cholmod_allocate_dense; FUNPTR_cholmod_analyze cholmod_analyze; FUNPTR_cholmod_factorize cholmod_factorize; FUNPTR_cholmod_solve cholmod_solve; FUNPTR_cholmod_free_factor cholmod_free_factor; FUNPTR_cholmod_free_sparse cholmod_free_sparse; FUNPTR_cholmod_free_dense cholmod_free_dense; FUNPTR_cholmod_finish cholmod_finish; NLdll DLL_handle; } CHOLMODContext; static CHOLMODContext* CHOLMOD() { static CHOLMODContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_CHOLMOD() { return CHOLMOD()->DLL_handle != NULL && CHOLMOD()->cholmod_start != NULL && CHOLMOD()->cholmod_allocate_sparse != NULL && CHOLMOD()->cholmod_allocate_dense != NULL && CHOLMOD()->cholmod_analyze != NULL && CHOLMOD()->cholmod_factorize != NULL && CHOLMOD()->cholmod_solve != NULL && CHOLMOD()->cholmod_free_factor != NULL && CHOLMOD()->cholmod_free_sparse != NULL && CHOLMOD()->cholmod_free_dense != NULL && CHOLMOD()->cholmod_finish != NULL ; } #define find_cholmod_func(name) \ if( \ ( \ CHOLMOD()->name = \ (FUNPTR_##name)nlFindFunction(CHOLMOD()->DLL_handle,#name) \ ) == NULL \ ) { \ nlError("nlInitExtension_CHOLMOD","function not found"); \ return NL_FALSE; \ } static void nlTerminateExtension_CHOLMOD(void) { if(CHOLMOD()->DLL_handle != NULL) { CHOLMOD()->cholmod_finish(&CHOLMOD()->cholmod_common); nlCloseDLL(CHOLMOD()->DLL_handle); CHOLMOD()->DLL_handle = NULL; } } NLboolean nlInitExtension_CHOLMOD(void) { NLenum flags = NL_LINK_NOW | NL_LINK_USE_FALLBACK; 
if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(CHOLMOD()->DLL_handle != NULL) { return nlExtensionIsInitialized_CHOLMOD(); } /* * MKL has a built-in CHOLMOD that conflicts with * the CHOLMOD used by OpenNL (to be fixed). For now * we simply output a warning message and deactivate the * CHOLMOD extension if the MKL extension was initialized * before. */ if(NLMultMatrixVector_MKL != NULL) { nl_fprintf( stderr, "CHOLMOD extension incompatible with MKL (deactivating)" ); return NL_FALSE; } CHOLMOD()->DLL_handle = nlOpenDLL(CHOLMOD_LIB_NAME,flags); if(CHOLMOD()->DLL_handle == NULL) { return NL_FALSE; } find_cholmod_func(cholmod_start); find_cholmod_func(cholmod_allocate_sparse); find_cholmod_func(cholmod_allocate_dense); find_cholmod_func(cholmod_analyze); find_cholmod_func(cholmod_factorize); find_cholmod_func(cholmod_solve); find_cholmod_func(cholmod_free_factor); find_cholmod_func(cholmod_free_sparse); find_cholmod_func(cholmod_free_dense); find_cholmod_func(cholmod_finish); CHOLMOD()->cholmod_start(&CHOLMOD()->cholmod_common); atexit(nlTerminateExtension_CHOLMOD); return NL_TRUE; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; cholmod_factor_ptr L; } NLCholmodFactorizedMatrix; static void nlCholmodFactorizedMatrixDestroy(NLCholmodFactorizedMatrix* M) { CHOLMOD()->cholmod_free_factor(&M->L, &CHOLMOD()->cholmod_common); } static void nlCholmodFactorizedMatrixMult( NLCholmodFactorizedMatrix* M, const double* x, double* y ) { /* * TODO: see whether CHOLDMOD can use user-allocated vectors * (and avoid copy) */ cholmod_dense_ptr X=CHOLMOD()->cholmod_allocate_dense( M->n, 1, M->n, CHOLMOD_REAL, &CHOLMOD()->cholmod_common ); cholmod_dense_ptr Y=NULL; memcpy(X->x, x, M->n*sizeof(double)); Y = CHOLMOD()->cholmod_solve( CHOLMOD_A, M->L, X, &CHOLMOD()->cholmod_common ); memcpy(y, Y->x, M->n*sizeof(double)); CHOLMOD()->cholmod_free_dense(&X, &CHOLMOD()->cholmod_common); CHOLMOD()->cholmod_free_dense(&Y, &CHOLMOD()->cholmod_common); } NLMatrix nlMatrixFactorize_CHOLMOD( NLMatrix M, NLenum solver ) { NLCholmodFactorizedMatrix* LLt = NULL; NLCRSMatrix* CRS = NULL; cholmod_sparse_ptr cM= NULL; NLuint nnz, cur, i, j, jj; int* rowptr = NULL; int* colind = NULL; double* val = NULL; NLuint n = M->n; nl_assert(solver == NL_CHOLMOD_EXT); nl_assert(M->m == M->n); if(M->type == NL_MATRIX_CRS) { CRS = (NLCRSMatrix*)M; } else if(M->type == NL_MATRIX_SPARSE_DYNAMIC) { /* * Note: since we convert once again into symmetric storage, * we could also directly read the NLSparseMatrix there instead * of copying once more... */ CRS = (NLCRSMatrix*)nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)M); } LLt = NL_NEW(NLCholmodFactorizedMatrix); LLt->m = M->m; LLt->n = M->n; LLt->type = NL_MATRIX_OTHER; LLt->destroy_func = (NLDestroyMatrixFunc)(nlCholmodFactorizedMatrixDestroy); LLt->mult_func = (NLMultMatrixVectorFunc)(nlCholmodFactorizedMatrixMult); /* * Compute required nnz, if matrix is not already with symmetric storage, * ignore entries in the upper triangular part. 
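 * For illustration: if row i = 1 of the CRS matrix holds the entries
 * (j=0, 2.0), (j=1, 4.0) and (j=3, 1.0), only the first two satisfy
 * j <= i and are counted; the (j=3) entry lies in the upper triangle
 * and is recovered by CHOLMOD from the symmetric storage convention.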
*/ nnz=0; for(i=0; i<n; ++i) { for(jj=CRS->rowptr[i]; jj<CRS->rowptr[i+1]; ++jj) { j=CRS->colind[jj]; if(j <= i) { ++nnz; } } } /* * Copy CRS matrix into CHOLMOD matrix (and ignore upper triangular part) */ cM = CHOLMOD()->cholmod_allocate_sparse( n, n, nnz, /* Dimensions and number of non-zeros */ NL_FALSE, /* Sorted = false */ NL_TRUE, /* Packed = true */ 1, /* stype (-1 = lower triangular, 1 = upper triangular) */ CHOLMOD_REAL, /* Entries are real numbers */ &CHOLMOD()->cholmod_common ); rowptr = (int*)cM->p; colind = (int*)cM->i; val = (double*)cM->x; cur = 0; for(i=0; i<n; ++i) { rowptr[i] = (int)cur; for(jj=CRS->rowptr[i]; jj<CRS->rowptr[i+1]; ++jj) { j = CRS->colind[jj]; if(j <= i) { val[cur] = CRS->val[jj]; colind[cur] = (int)j; ++cur; } } } rowptr[n] = (int)cur; nl_assert(cur==nnz); LLt->L = CHOLMOD()->cholmod_analyze(cM, &CHOLMOD()->cholmod_common); if(!CHOLMOD()->cholmod_factorize(cM, LLt->L, &CHOLMOD()->cholmod_common)) { CHOLMOD()->cholmod_free_factor(&LLt->L, &CHOLMOD()->cholmod_common); NL_DELETE(LLt); } CHOLMOD()->cholmod_free_sparse(&cM, &CHOLMOD()->cholmod_common); if((NLMatrix)CRS != M) { nlDeleteMatrix((NLMatrix)CRS); } return (NLMatrix)(LLt); }
/******* extracted from nl_arpack.c *******/ #ifdef NL_OS_UNIX # ifdef NL_OS_APPLE # define ARPACK_LIB_NAME "libarpack.dylib" # else # define ARPACK_LIB_NAME "libarpack.so" # endif #else # define ARPACK_LIB_NAME "libarpack.dll" #endif typedef int ARint; typedef int ARlogical;
/* double precision symmetric routines */ typedef void (*FUNPTR_dsaupd)( ARint *ido, char *bmat, ARint *n, char *which, ARint *nev, double *tol, double *resid, ARint *ncv, double *V, ARint *ldv, ARint *iparam, ARint *ipntr, double *workd, double *workl, ARint *lworkl, ARint *info ); typedef void (*FUNPTR_dseupd)( ARlogical *rvec, char *HowMny, ARlogical *select, double *d, double *Z, ARint *ldz, double *sigma, char *bmat, ARint *n, char *which, ARint *nev, double *tol, double *resid, ARint *ncv, double *V, ARint *ldv, ARint *iparam, ARint *ipntr, double *workd, double *workl, ARint *lworkl, ARint *info );
/* double precision nonsymmetric routines */ typedef void (*FUNPTR_dnaupd)( ARint *ido, char *bmat, ARint *n, char *which, ARint *nev, double *tol, double *resid, ARint *ncv, double *V, ARint *ldv, ARint *iparam, ARint *ipntr, double *workd, double *workl, ARint *lworkl, ARint *info ); typedef void (*FUNPTR_dneupd)( ARlogical *rvec, char *HowMny, ARlogical *select, double *dr, double *di, double *Z, ARint *ldz, double *sigmar, double *sigmai, double *workev, char *bmat, ARint *n, char *which, ARint *nev, double *tol, double *resid, ARint *ncv, double *V, ARint *ldv, ARint *iparam, ARint *ipntr, double *workd, double *workl, ARint *lworkl, ARint *info );
typedef struct { FUNPTR_dsaupd dsaupd; FUNPTR_dseupd dseupd; FUNPTR_dnaupd dnaupd; FUNPTR_dneupd dneupd; NLdll DLL_handle; } ARPACKContext; static ARPACKContext* ARPACK() { static ARPACKContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_ARPACK() { return ARPACK()->DLL_handle != NULL && ARPACK()->dsaupd != NULL && ARPACK()->dseupd != NULL && ARPACK()->dnaupd != NULL && ARPACK()->dneupd != NULL; } static void nlTerminateExtension_ARPACK(void) { if(ARPACK()->DLL_handle != NULL) { nlCloseDLL(ARPACK()->DLL_handle); ARPACK()->DLL_handle = NULL; } } static char* u(const char* str) { static char buff[1000]; sprintf(buff, "%s_", str); return buff; } #define find_arpack_func(name) \ if( \ ( \ ARPACK()->name = \ (FUNPTR_##name)nlFindFunction(ARPACK()->DLL_handle,u(#name)) \ )
== NULL \ ) { \ nlError("nlInitExtension_ARPACK","function not found"); \ nlError("nlInitExtension_ARPACK",u(#name)); \ return NL_FALSE; \ } NLboolean nlInitExtension_ARPACK(void) { NLenum flags = NL_LINK_NOW | NL_LINK_USE_FALLBACK; if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(ARPACK()->DLL_handle != NULL) { return nlExtensionIsInitialized_ARPACK(); } ARPACK()->DLL_handle = nlOpenDLL(ARPACK_LIB_NAME, flags); if(ARPACK()->DLL_handle == NULL) { return NL_FALSE; } find_arpack_func(dsaupd); find_arpack_func(dseupd); find_arpack_func(dnaupd); find_arpack_func(dneupd); atexit(nlTerminateExtension_ARPACK); return NL_TRUE; } static NLMatrix create_OP(NLboolean symmetric) { NLuint n = nlCurrentContext->M->n; NLuint i; NLMatrix result = NULL; if(nlCurrentContext->eigen_shift != 0.0) { /* * A = M */ NLSparseMatrix* A = NL_NEW(NLSparseMatrix); nlSparseMatrixConstruct(A, n, n, NL_MATRIX_STORE_ROWS); nlSparseMatrixAddMatrix(A, 1.0, nlCurrentContext->M); if(nlCurrentContext->B == NULL) { /* * A = A - shift * Id */ for(i=0; ieigen_shift); } } else { /* * A = A - shift * B */ nlSparseMatrixAddMatrix( A, -nlCurrentContext->eigen_shift, nlCurrentContext->B ); } /* * OP = A^{-1} */ if(nlCurrentContext->verbose) { nl_printf("Factorizing matrix...\n"); } result = nlMatrixFactorize( (NLMatrix)A, symmetric ? NL_SYMMETRIC_SUPERLU_EXT : NL_PERM_SUPERLU_EXT ); if(nlCurrentContext->verbose) { nl_printf("Matrix factorized\n"); } nlDeleteMatrix((NLMatrix)A); } else { /* * OP = M^{-1} */ if(nlCurrentContext->verbose) { nl_printf("Factorizing matrix...\n"); } result = nlMatrixFactorize( nlCurrentContext->M, symmetric ? NL_SYMMETRIC_SUPERLU_EXT : NL_PERM_SUPERLU_EXT ); if(nlCurrentContext->verbose) { nl_printf("Matrix factorized\n"); } } if(nlCurrentContext->B != NULL) { /* * OP = OP * B */ result = nlMatrixNewFromProduct( result, NL_TRUE, /* mem. ownership transferred */ nlCurrentContext->B, NL_FALSE /* mem. ownership kept by context */ ); } return result; } static int eigencompare(const void* pi, const void* pj) { NLuint i = *(const NLuint*)pi; NLuint j = *(const NLuint*)pj; double vali = fabs(nlCurrentContext->temp_eigen_value[i]); double valj = fabs(nlCurrentContext->temp_eigen_value[j]); if(vali == valj) { return 0; } return vali < valj ? -1 : 1; } void nlEigenSolve_ARPACK(void) { NLboolean symmetric = nlCurrentContext->symmetric && (nlCurrentContext->B == NULL); int n = (int)nlCurrentContext->M->n; /* Dimension of the matrix */ int nev = /* Number of eigenvectors requested */ (int)nlCurrentContext->nb_systems; NLMatrix OP = create_OP(symmetric); int ncv = (int)(nev * 2.5); /* Length of Arnoldi factorization */ /* Rule of thumb in ARPACK documentation: ncv > 2 * nev */ int* iparam = NULL; int* ipntr = NULL; NLdouble* resid = NULL; NLdouble* workev = NULL; NLdouble* workd = NULL; NLdouble* workl = NULL; NLdouble* v = NULL; NLdouble* d = NULL; ARlogical* select = NULL; ARlogical rvec = 1; double sigmar = 0.0; double sigmai = 0.0; int ierr; int i,k,kk; int ldv = (int)n; char* bmat = (char*)"I"; /*Standard problem */ char* which = (char*)"LM"; /*Largest eigenvalues, but we invert->smallest */ char* howmny = (char*)"A"; /*which eigens should be computed: all */ double tol = nlCurrentContext->threshold; int ido = 0; /* reverse communication variable (which operation ?) 
*/ int info = 1; /* start with initial value of resid */ int lworkl; /* size of work array */ NLboolean converged = NL_FALSE; NLdouble value; int index; int* sorted; /* indirection array for sorting eigenpairs */ if(ncv > n) { ncv = n; } if(nev > n) { nev = n; } if(nev + 2 > ncv) { nev = ncv - 2; } if(symmetric) { lworkl = ncv * (ncv + 8) ; } else { lworkl = 3*ncv*ncv + 6*ncv ; } iparam = NL_NEW_ARRAY(int, 11); ipntr = NL_NEW_ARRAY(int, 14); iparam[1-1] = 1; /* ARPACK chooses the shifts */ iparam[3-1] = (int)nlCurrentContext->max_iterations; iparam[7-1] = 1; /* Normal mode (we do not use shift-invert (3) since we do our own shift-invert */ workev = NL_NEW_ARRAY(NLdouble, 3*ncv); workd = NL_NEW_ARRAY(NLdouble, 3*n); resid = NL_NEW_ARRAY(NLdouble, n); for(i=0; iverbose) { if(symmetric) { nl_printf("calling dsaupd()\n"); } else { nl_printf("calling dnaupd()\n"); } } while(!converged) { /* if(nlCurrentContext->verbose) { fprintf(stderr, "."); fflush(stderr); } */ if(symmetric) { ARPACK()->dsaupd( &ido, bmat, &n, which, &nev, &tol, resid, &ncv, v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info ); } else { ARPACK()->dnaupd( &ido, bmat, &n, which, &nev, &tol, resid, &ncv, v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info ); } if(ido == 1) { nlMultMatrixVector( OP, workd+ipntr[1-1]-1, /*The "-1"'s are for FORTRAN-to-C conversion */ workd+ipntr[2-1]-1 /*to keep the same indices as in ARPACK doc */ ); } else { converged = NL_TRUE; } } if(info < 0) { if(symmetric) { nl_fprintf(stderr, "\nError with dsaupd(): %d\n", info); } else { nl_fprintf(stderr, "\nError with dnaupd(): %d\n", info); } } else { if(nlCurrentContext->verbose) { fprintf(stderr, "\nconverged\n"); } select = NL_NEW_ARRAY(ARlogical, ncv); for(i=0; iverbose) { if(symmetric) { nl_printf("calling dseupd()\n"); } else { nl_printf("calling dneupd()\n"); } } if(symmetric) { ARPACK()->dseupd( &rvec, howmny, select, d, v, &ldv, &sigmar, bmat, &n, which, &nev, &tol, resid, &ncv, v, &ldv, iparam, ipntr, workd, workl, &lworkl, &ierr ); } else { ARPACK()->dneupd( &rvec, howmny, select, d, d+ncv, v, &ldv, &sigmar, &sigmai, workev, bmat, &n, which, &nev, &tol, resid, &ncv, v, &ldv, iparam, ipntr, workd, workl, &lworkl, &ierr ) ; } if(nlCurrentContext->verbose) { if(ierr != 0) { if(symmetric) { nl_fprintf(stderr, "Error with dseupd(): %d\n", ierr); } else { nl_fprintf(stderr, "Error with dneupd(): %d\n", ierr); } } else { if(symmetric) { nl_printf("dseupd() OK, nconv= %d\n", iparam[3-1]); } else { nl_printf("dneupd() OK, nconv= %d\n", iparam[3-1]); } } } NL_DELETE_ARRAY(select); } for(i=0; ieigen_shift ; } /* Make it visible to the eigen_compare function */ nlCurrentContext->temp_eigen_value = d; sorted = NL_NEW_ARRAY(int, nev); for(i=0; itemp_eigen_value = NULL; for(k=0; keigen_value[k] = d[kk]; for(i=0; i<(int)nlCurrentContext->nb_variables; ++i) { if(!nlCurrentContext->variable_is_locked[i]) { index = (int)nlCurrentContext->variable_index[i]; nl_assert(index < n); value = v[kk*n+index]; NL_BUFFER_ITEM( nlCurrentContext->variable_buffer[k],(NLuint)i ) = value; } } } NL_DELETE_ARRAY(sorted); NL_DELETE_ARRAY(workl); NL_DELETE_ARRAY(d); NL_DELETE_ARRAY(v); NL_DELETE_ARRAY(resid); NL_DELETE_ARRAY(workd); NL_DELETE_ARRAY(workev); nlDeleteMatrix(OP); NL_DELETE_ARRAY(iparam); NL_DELETE_ARRAY(ipntr); } /******* extracted from nl_mkl.c *******/ typedef unsigned int MKL_INT; typedef void (*FUNPTR_mkl_cspblas_dcsrgemv)( const char *transa, const MKL_INT *m, const double *a, const MKL_INT *ia, const MKL_INT *ja, const double *x, double *y ); typedef void 
(*FUNPTR_mkl_cspblas_dcsrsymv)( const char *transa, const MKL_INT *m, const double *a, const MKL_INT *ia, const MKL_INT *ja, const double *x, double *y ); typedef struct { NLdll DLL_mkl_intel_lp64; NLdll DLL_mkl_intel_thread; NLdll DLL_mkl_core; NLdll DLL_iomp5; FUNPTR_mkl_cspblas_dcsrgemv mkl_cspblas_dcsrgemv; FUNPTR_mkl_cspblas_dcsrsymv mkl_cspblas_dcsrsymv; } MKLContext; static MKLContext* MKL() { static MKLContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_MKL() { if( MKL()->DLL_iomp5 == NULL || MKL()->DLL_mkl_core == NULL || MKL()->DLL_mkl_intel_thread == NULL || MKL()->DLL_mkl_intel_lp64 == NULL || MKL()->mkl_cspblas_dcsrgemv == NULL || MKL()->mkl_cspblas_dcsrsymv == NULL ) { return NL_FALSE; } return NL_TRUE; } #define find_mkl_func(name) \ if( \ ( \ MKL()->name = \ (FUNPTR_##name)nlFindFunction( \ MKL()->DLL_mkl_intel_lp64,#name \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_MKL","function not found"); \ return NL_FALSE; \ } static void nlTerminateExtension_MKL(void) { if(!nlExtensionIsInitialized_MKL()) { return; } nlCloseDLL(MKL()->DLL_mkl_intel_lp64); nlCloseDLL(MKL()->DLL_mkl_intel_thread); nlCloseDLL(MKL()->DLL_mkl_core); nlCloseDLL(MKL()->DLL_iomp5); } NLMultMatrixVectorFunc NLMultMatrixVector_MKL = NULL; static void NLMultMatrixVector_MKL_impl(NLMatrix M_in, const double* x, double* y) { NLCRSMatrix* M = (NLCRSMatrix*)(M_in); nl_debug_assert(M_in->type == NL_MATRIX_CRS); if(M->symmetric_storage) { MKL()->mkl_cspblas_dcsrsymv( "N", /* No transpose */ &M->m, M->val, M->rowptr, M->colind, x, y ); } else { MKL()->mkl_cspblas_dcsrgemv( "N", /* No transpose */ &M->m, M->val, M->rowptr, M->colind, x, y ); } } #define INTEL_PREFIX "/opt/intel/" #define LIB_DIR "lib/intel64/" #define MKL_PREFIX INTEL_PREFIX "mkl/" LIB_DIR NLboolean nlInitExtension_MKL(void) { NLenum flags = NL_LINK_LAZY | NL_LINK_GLOBAL; if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(MKL()->DLL_mkl_intel_lp64 != NULL) { return nlExtensionIsInitialized_MKL(); } MKL()->DLL_iomp5 = nlOpenDLL( INTEL_PREFIX LIB_DIR "libiomp5.so", flags ); MKL()->DLL_mkl_core = nlOpenDLL( MKL_PREFIX "libmkl_core.so", flags ); MKL()->DLL_mkl_intel_thread = nlOpenDLL( MKL_PREFIX "libmkl_intel_thread.so", flags ); MKL()->DLL_mkl_intel_lp64 = nlOpenDLL( MKL_PREFIX "libmkl_intel_lp64.so", flags ); if( MKL()->DLL_iomp5 == NULL || MKL()->DLL_mkl_core == NULL || MKL()->DLL_mkl_intel_thread == NULL || MKL()->DLL_mkl_intel_lp64 == NULL ) { return NL_FALSE; } find_mkl_func(mkl_cspblas_dcsrgemv); find_mkl_func(mkl_cspblas_dcsrsymv); if(nlExtensionIsInitialized_MKL()) { NLMultMatrixVector_MKL = NLMultMatrixVector_MKL_impl; } atexit(nlTerminateExtension_MKL); return NL_TRUE; } /******* extracted from nl_cuda.c *******/ /* CUDA structures and functions */ /* Repeated here so that one can compile OpenNL without */ /* requiring CUDA to be installed in the system. 
*/ struct cudaDeviceProp { char name[256]; size_t totalGlobalMem; size_t sharedMemPerBlock; int regsPerBlock; int warpSize; size_t memPitch; int maxThreadsPerBlock; int maxThreadsDim[3]; int maxGridSize[3]; int clockRate; size_t totalConstMem; int major; int minor; size_t textureAlignment; size_t texturePitchAlignment; int deviceOverlap; int multiProcessorCount; int kernelExecTimeoutEnabled; int integrated; int canMapHostMemory; int computeMode; int maxTexture1D; int maxTexture1DMipmap; int maxTexture1DLinear; int maxTexture2D[2]; int maxTexture2DMipmap[2]; int maxTexture2DLinear[3]; int maxTexture2DGather[2]; int maxTexture3D[3]; int maxTexture3DAlt[3]; int maxTextureCubemap; int maxTexture1DLayered[2]; int maxTexture2DLayered[3]; int maxTextureCubemapLayered[2]; int maxSurface1D; int maxSurface2D[2]; int maxSurface3D[3]; int maxSurface1DLayered[2]; int maxSurface2DLayered[3]; int maxSurfaceCubemap; int maxSurfaceCubemapLayered[2]; size_t surfaceAlignment; int concurrentKernels; int ECCEnabled; int pciBusID; int pciDeviceID; int pciDomainID; int tccDriver; int asyncEngineCount; int unifiedAddressing; int memoryClockRate; int memoryBusWidth; int l2CacheSize; int maxThreadsPerMultiProcessor; int streamPrioritiesSupported; int globalL1CacheSupported; int localL1CacheSupported; size_t sharedMemPerMultiprocessor; int regsPerMultiprocessor; int managedMemSupported; int isMultiGpuBoard; int multiGpuBoardGroupID; int singleToDoublePrecisionPerfRatio; int pageableMemoryAccess; int concurrentManagedAccess; char padding[1024]; /* More room for future evolutions */ }; enum cudaComputeMode { cudaComputeModeDefault = 0, cudaComputeModeExclusive = 1, cudaComputeModeProhibited = 2, cudaComputeModeExclusiveProcess = 3 }; enum cudaMemcpyKind { cudaMemcpyHostToHost = 0, cudaMemcpyHostToDevice = 1, cudaMemcpyDeviceToHost = 2, cudaMemcpyDeviceToDevice = 3, cudaMemcpyDefault = 4 }; typedef int cudaError_t; typedef cudaError_t (*FUNPTR_cudaGetDeviceCount)(int* device_count); typedef cudaError_t (*FUNPTR_cudaGetDeviceProperties)( struct cudaDeviceProp *props, int device ); typedef cudaError_t (*FUNPTR_cudaDeviceReset)(void); typedef cudaError_t (*FUNPTR_cudaMalloc)(void **devPtr, size_t size); typedef cudaError_t (*FUNPTR_cudaFree)(void* devPtr); typedef cudaError_t (*FUNPTR_cudaMemcpy)( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind ); #define find_cuda_func(name) \ if( \ ( \ CUDA()->name = \ (FUNPTR_##name)nlFindFunction( \ CUDA()->DLL_cudart,#name \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_CUDA: function not found", #name); \ return NL_FALSE; \ } /* CUBLAS structures and functions */ struct cublasContext; typedef struct cublasContext *cublasHandle_t; typedef int cublasStatus_t; typedef enum { CUBLAS_SIDE_LEFT =0, CUBLAS_SIDE_RIGHT=1 } cublasSideMode_t; typedef enum { CUBLAS_FILL_MODE_LOWER=0, CUBLAS_FILL_MODE_UPPER=1 } cublasFillMode_t; typedef enum { CUBLAS_OP_N=0, CUBLAS_OP_T=1, CUBLAS_OP_C=2 } cublasOperation_t; typedef enum { CUBLAS_DIAG_NON_UNIT=0, CUBLAS_DIAG_UNIT=1 } cublasDiagType_t; typedef cublasStatus_t (*FUNPTR_cublasCreate)(cublasHandle_t* handle); typedef cublasStatus_t (*FUNPTR_cublasDestroy)(cublasHandle_t handle); typedef cublasStatus_t (*FUNPTR_cublasGetVersion)( cublasHandle_t handle, int* version ); typedef cublasStatus_t (*FUNPTR_cublasDdot)( cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result ); typedef cublasStatus_t (*FUNPTR_cublasDcopy)( cublasHandle_t handle, int n, const double *x, int incx, const double *y, 
int incy ); typedef cublasStatus_t (*FUNPTR_cublasDaxpy)( cublasHandle_t handle, int n, const double* alpha, const double *x, int incx, const double *y, int incy ); typedef cublasStatus_t (*FUNPTR_cublasDscal)( cublasHandle_t handle, int n, const double* alpha, const double *x, int incx ); typedef cublasStatus_t (*FUNPTR_cublasDnrm2)( cublasHandle_t handle, int n, const double *x, int incx, double* result ); typedef cublasStatus_t (*FUNPTR_cublasDdgmm)( cublasHandle_t handle, cublasSideMode_t mode, int m, int n, const double* A, int lda, const double* x, int incx, double* C, int ldc ); typedef cublasStatus_t (*FUNPTR_cublasDgemv)( cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double *alpha, const double *A, int lda, const double *x, int incx, const double *beta, double *y, int incy ); typedef cublasStatus_t (*FUNPTR_cublasDtpsv)( cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, cublasDiagType_t diag, int n, const double *AP, double* x, int incx ); #define find_cublas_func(name) \ if( \ ( \ CUDA()->name = \ (FUNPTR_##name)nlFindFunction( \ CUDA()->DLL_cublas,#name "_v2" \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_CUDA: function not found", #name); \ return NL_FALSE; \ } #define find_cublas_func_v1(name) \ if( \ ( \ CUDA()->name = \ (FUNPTR_##name)nlFindFunction( \ CUDA()->DLL_cublas,#name \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_CUDA: function not found", #name); \ return NL_FALSE; \ } /* CUSPARSE structures and functions */ struct cusparseContext; typedef struct cusparseContext *cusparseHandle_t; typedef int cusparseStatus_t; struct cusparseMatDescr; typedef struct cusparseMatDescr *cusparseMatDescr_t; typedef enum { CUSPARSE_MATRIX_TYPE_GENERAL = 0, CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1, CUSPARSE_MATRIX_TYPE_HERMITIAN = 2, CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 } cusparseMatrixType_t; typedef enum { CUSPARSE_INDEX_BASE_ZERO = 0, CUSPARSE_INDEX_BASE_ONE = 1 } cusparseIndexBase_t; typedef enum { CUSPARSE_OPERATION_NON_TRANSPOSE = 0, CUSPARSE_OPERATION_TRANSPOSE = 1, CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 } cusparseOperation_t; struct cusparseHybMat; typedef struct cusparseHybMat *cusparseHybMat_t; typedef enum { CUSPARSE_HYB_PARTITION_AUTO = 0, CUSPARSE_HYB_PARTITION_USER = 1, CUSPARSE_HYB_PARTITION_MAX = 2 } cusparseHybPartition_t; typedef cusparseStatus_t (*FUNPTR_cusparseCreate)(cusparseHandle_t* handle); typedef cusparseStatus_t (*FUNPTR_cusparseDestroy)(cusparseHandle_t handle); typedef cusparseStatus_t (*FUNPTR_cusparseGetVersion)( cusparseHandle_t handle, int* version ); typedef cusparseStatus_t (*FUNPTR_cusparseCreateMatDescr)( cusparseMatDescr_t* descr ); typedef cusparseStatus_t (*FUNPTR_cusparseDestroyMatDescr)( cusparseMatDescr_t descr ); typedef cusparseStatus_t (*FUNPTR_cusparseSetMatType)( cusparseMatDescr_t descr, cusparseMatrixType_t mtype ); typedef cusparseStatus_t (*FUNPTR_cusparseSetMatIndexBase)( cusparseMatDescr_t descr, cusparseIndexBase_t ibase ); typedef cusparseStatus_t (*FUNPTR_cusparseDcsrmv)( cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, int nnz, const double *alpha, const cusparseMatDescr_t descrA, const double *csrSortedValA, const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *x, const double *beta, double *y ); typedef cusparseStatus_t (*FUNPTR_cusparseCreateHybMat)( cusparseHybMat_t *hybA ); typedef cusparseStatus_t (*FUNPTR_cusparseDestroyHybMat)( cusparseHybMat_t hybA ); typedef cusparseStatus_t (*FUNPTR_cusparseDcsr2hyb)( cusparseHandle_t handle, int m, int n, 
const cusparseMatDescr_t descrA, const double *csrSortedValA, const int *csrSortedRowPtrA, const int *csrSortedColIndA, cusparseHybMat_t hybA, int userEllWidth, cusparseHybPartition_t partitionType ); typedef cusparseStatus_t (*FUNPTR_cusparseDhybmv)( cusparseHandle_t handle, cusparseOperation_t transA, const double *alpha, const cusparseMatDescr_t descrA, const cusparseHybMat_t hybA, const double *x, const double *beta, double *y ); #define find_cusparse_func(name) \ if( \ ( \ CUDA()->name = \ (FUNPTR_##name)nlFindFunction( \ CUDA()->DLL_cusparse,#name \ ) \ ) == NULL \ ) { \ nlError("nlInitExtension_CUDA : function not found", #name); \ return NL_FALSE; \ } typedef struct { NLdll DLL_cudart; FUNPTR_cudaGetDeviceCount cudaGetDeviceCount; FUNPTR_cudaGetDeviceProperties cudaGetDeviceProperties; FUNPTR_cudaDeviceReset cudaDeviceReset; FUNPTR_cudaMalloc cudaMalloc; FUNPTR_cudaFree cudaFree; FUNPTR_cudaMemcpy cudaMemcpy; NLdll DLL_cublas; cublasHandle_t HNDL_cublas; FUNPTR_cublasCreate cublasCreate; FUNPTR_cublasDestroy cublasDestroy; FUNPTR_cublasGetVersion cublasGetVersion; FUNPTR_cublasDdot cublasDdot; FUNPTR_cublasDcopy cublasDcopy; FUNPTR_cublasDaxpy cublasDaxpy; FUNPTR_cublasDscal cublasDscal; FUNPTR_cublasDnrm2 cublasDnrm2; FUNPTR_cublasDdgmm cublasDdgmm; FUNPTR_cublasDgemv cublasDgemv; FUNPTR_cublasDtpsv cublasDtpsv; NLdll DLL_cusparse; cusparseHandle_t HNDL_cusparse; FUNPTR_cusparseCreate cusparseCreate; FUNPTR_cusparseDestroy cusparseDestroy; FUNPTR_cusparseGetVersion cusparseGetVersion; FUNPTR_cusparseCreateMatDescr cusparseCreateMatDescr; FUNPTR_cusparseDestroyMatDescr cusparseDestroyMatDescr; FUNPTR_cusparseSetMatType cusparseSetMatType; FUNPTR_cusparseSetMatIndexBase cusparseSetMatIndexBase; FUNPTR_cusparseDcsrmv cusparseDcsrmv; FUNPTR_cusparseCreateHybMat cusparseCreateHybMat; FUNPTR_cusparseDestroyHybMat cusparseDestroyHybMat; FUNPTR_cusparseDcsr2hyb cusparseDcsr2hyb; FUNPTR_cusparseDhybmv cusparseDhybmv; int devID; } CUDAContext; static CUDAContext* CUDA() { static CUDAContext context; static NLboolean init = NL_FALSE; if(!init) { init = NL_TRUE; memset(&context, 0, sizeof(context)); } return &context; } NLboolean nlExtensionIsInitialized_CUDA() { if( CUDA()->DLL_cudart == NULL || CUDA()->cudaGetDeviceCount == NULL || CUDA()->cudaGetDeviceProperties == NULL || CUDA()->cudaDeviceReset == NULL || CUDA()->cudaMalloc == NULL || CUDA()->cudaFree == NULL || CUDA()->cudaMemcpy == NULL || CUDA()->DLL_cublas == NULL || CUDA()->HNDL_cublas == NULL || CUDA()->cublasCreate == NULL || CUDA()->cublasDestroy == NULL || CUDA()->cublasGetVersion == NULL || CUDA()->cublasDdot == NULL || CUDA()->cublasDcopy == NULL || CUDA()->cublasDaxpy == NULL || CUDA()->cublasDscal == NULL || CUDA()->cublasDnrm2 == NULL || CUDA()->cublasDdgmm == NULL || CUDA()->DLL_cusparse == NULL || CUDA()->HNDL_cusparse == NULL || CUDA()->cusparseCreate == NULL || CUDA()->cusparseDestroy == NULL || CUDA()->cusparseGetVersion == NULL || CUDA()->cusparseCreateMatDescr == NULL || CUDA()->cusparseDestroyMatDescr == NULL || CUDA()->cusparseSetMatType == NULL || CUDA()->cusparseSetMatIndexBase == NULL || CUDA()->cusparseDcsrmv == NULL || CUDA()->cusparseCreateHybMat == NULL || CUDA()->cusparseDestroyHybMat == NULL || CUDA()->cusparseDcsr2hyb == NULL || CUDA()->cusparseDhybmv == NULL ) { return NL_FALSE; } return NL_TRUE; } static void nlTerminateExtension_CUDA(void) { if(!nlExtensionIsInitialized_CUDA()) { return; } CUDA()->cusparseDestroy(CUDA()->HNDL_cusparse); nlCloseDLL(CUDA()->DLL_cusparse); 
CUDA()->cublasDestroy(CUDA()->HNDL_cublas); nlCloseDLL(CUDA()->DLL_cublas); CUDA()->cudaDeviceReset(); nlCloseDLL(CUDA()->DLL_cudart); } static int ConvertSMVer2Cores(int major, int minor) { /* Defines for GPU Architecture types (using the SM version to determine the # of cores per SM */ typedef struct { int SM; /* 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version */ int Cores; } sSMtoCores; sSMtoCores nGpuArchCoresPerSM[] = { { 0x10, 8 }, /* Tesla Generation (SM 1.0) G80 class */ { 0x11, 8 }, /* Tesla Generation (SM 1.1) G8x class */ { 0x12, 8 }, /* Tesla Generation (SM 1.2) G9x class */ { 0x13, 8 }, /* Tesla Generation (SM 1.3) GT200 class */ { 0x20, 32 }, /* Fermi Generation (SM 2.0) GF100 class */ { 0x21, 48 }, /* Fermi Generation (SM 2.1) GF10x class */ { 0x30, 192}, /* Kepler Generation (SM 3.0) GK10x class */ { 0x35, 192}, /* Kepler Generation (SM 3.5) GK11x class */ { 0x50, 128}, /* Maxwell Generation (SM 5.0) GM10x class (yes, #cores smaller than with 3.x) */ { 0x52, 128}, /* Maxwell Generation (SM 5.2) GM20x class */ { 0x60, 64 }, /* Pascal Generation (SM 6.0) GP100,GP102 (yes, 64, but GP100 has superfast double precision) */ { 0x61, 128}, /* Pascal Generation (SM 6.1) GP104 class (but FP64 runs as 1/32 FP32 speed) */ { -1, -1 } }; int index = 0; while (nGpuArchCoresPerSM[index].SM != -1) { if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { return nGpuArchCoresPerSM[index].Cores; } index++; } /* If we don't find the values, we default use the previous one to run properly */ nl_printf( "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[8].Cores ); return nGpuArchCoresPerSM[8].Cores; } static int getBestDeviceID() { int current_device = 0, sm_per_multiproc = 0; int max_compute_perf = 0, max_perf_device = 0; int device_count = 0, best_SM_arch = 0; int compute_perf = 0; struct cudaDeviceProp deviceProp; CUDA()->cudaGetDeviceCount(&device_count); /* Find the best major SM Architecture GPU device */ while (current_device < device_count) { CUDA()->cudaGetDeviceProperties(&deviceProp, current_device); /* If this GPU is not running on Compute Mode prohibited, then we can add it to the list */ if (deviceProp.computeMode != cudaComputeModeProhibited) { if (deviceProp.major > 0 && deviceProp.major < 9999) { best_SM_arch = MAX(best_SM_arch, deviceProp.major); } } current_device++; } /* Find the best CUDA capable GPU device */ current_device = 0; while (current_device < device_count) { CUDA()->cudaGetDeviceProperties(&deviceProp, current_device); /* If this GPU is not running on Compute Mode prohibited, then we can add it to the list */ if (deviceProp.computeMode != cudaComputeModeProhibited) { if (deviceProp.major == 9999 && deviceProp.minor == 9999) { sm_per_multiproc = 1; } else { sm_per_multiproc = ConvertSMVer2Cores( deviceProp.major, deviceProp.minor ); } compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; if (compute_perf > max_compute_perf) { /* If we find GPU with SM major > 2, search only these */ if (best_SM_arch > 2) { /* If our device==dest_SM_arch, choose this, or else pass */ if (deviceProp.major == best_SM_arch) { max_compute_perf = compute_perf; max_perf_device = current_device; } } else { max_compute_perf = compute_perf; max_perf_device = current_device; } } } ++current_device; } return max_perf_device; } #ifdef NL_OS_UNIX # define LIBPREFIX "lib" # ifdef NL_OS_APPLE # define LIBEXTENSION ".dylib" # else # define LIBEXTENSION ".so" # endif #else # 
define LIBPREFIX # define LIBEXTENSION ".dll" #endif NLboolean nlInitExtension_CUDA(void) { struct cudaDeviceProp deviceProp; int cublas_version; int cusparse_version; NLenum flags = NL_LINK_LAZY | NL_LINK_GLOBAL; if(nlCurrentContext == NULL || !nlCurrentContext->verbose) { flags |= NL_LINK_QUIET; } if(nlExtensionIsInitialized_CUDA()) { return NL_TRUE; } CUDA()->DLL_cudart = nlOpenDLL( LIBPREFIX "cudart" LIBEXTENSION, flags ); find_cuda_func(cudaGetDeviceCount); find_cuda_func(cudaGetDeviceProperties); find_cuda_func(cudaDeviceReset); find_cuda_func(cudaMalloc); find_cuda_func(cudaFree); find_cuda_func(cudaMemcpy); CUDA()->devID = getBestDeviceID(); if(CUDA()->cudaGetDeviceProperties(&deviceProp, CUDA()->devID)) { nl_fprintf(stderr,"OpenNL CUDA: could not find a CUDA device\n"); return NL_FALSE; } nl_printf("OpenNL CUDA: Device ID = %d\n", CUDA()->devID); nl_printf("OpenNL CUDA: Device name=%s\n", deviceProp.name); nl_printf( "OpenNL CUDA: Device has %d Multi-Processors, " "%d cores per Multi-Processor, SM %d.%d compute capabilities\n", deviceProp.multiProcessorCount, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), deviceProp.major, deviceProp.minor ); nl_printf( "OpenNL CUDA: %d kB shared mem. per block, %d per MP\n", (int)(deviceProp.sharedMemPerBlock / 1024), (int)(deviceProp.sharedMemPerMultiprocessor / 1024) ); nl_printf( "OpenNL CUDA: %d regs. per block, %d per MP\n", deviceProp.regsPerBlock, deviceProp.regsPerMultiprocessor ); nl_printf( "OpenNL CUDA: warpsize=%d\n", deviceProp.warpSize ); if ((deviceProp.major * 0x10 + deviceProp.minor) < 0x11) { nl_fprintf(stderr, "OpenNL CUDA requires a minimum CUDA compute 1.1 capability\n"); CUDA()->cudaDeviceReset(); return NL_FALSE; } CUDA()->DLL_cublas = nlOpenDLL( LIBPREFIX "cublas" LIBEXTENSION, flags ); find_cublas_func(cublasCreate); find_cublas_func(cublasDestroy); find_cublas_func(cublasGetVersion); find_cublas_func(cublasDdot); find_cublas_func(cublasDaxpy); find_cublas_func(cublasDcopy); find_cublas_func(cublasDscal); find_cublas_func(cublasDnrm2); find_cublas_func(cublasDgemv); find_cublas_func(cublasDtpsv); find_cublas_func_v1(cublasDdgmm); if(CUDA()->cublasCreate(&CUDA()->HNDL_cublas)) { return NL_FALSE; } if(CUDA()->cublasGetVersion(CUDA()->HNDL_cublas, &cublas_version)) { return NL_FALSE; } nl_printf("OpenNL CUDA: cublas version = %d\n", cublas_version); CUDA()->DLL_cusparse = nlOpenDLL( LIBPREFIX "cusparse" LIBEXTENSION, flags ); find_cusparse_func(cusparseCreate); find_cusparse_func(cusparseDestroy); find_cusparse_func(cusparseGetVersion); find_cusparse_func(cusparseCreateMatDescr); find_cusparse_func(cusparseDestroyMatDescr); find_cusparse_func(cusparseSetMatType); find_cusparse_func(cusparseSetMatIndexBase); find_cusparse_func(cusparseDcsrmv); find_cusparse_func(cusparseCreateHybMat); find_cusparse_func(cusparseDestroyHybMat); find_cusparse_func(cusparseDcsr2hyb); find_cusparse_func(cusparseDhybmv); if(CUDA()->cusparseCreate(&CUDA()->HNDL_cusparse)) { return NL_FALSE; } if(CUDA()->cusparseGetVersion(CUDA()->HNDL_cusparse, &cusparse_version)) { return NL_FALSE; } nl_printf("OpenNL CUDA: cusparse version = %d\n", cusparse_version); if(!nlExtensionIsInitialized_CUDA()) { return NL_FALSE; } atexit(nlTerminateExtension_CUDA); return NL_TRUE; } static void nlCUDACheckImpl(int status, int line) { if(status != 0) { nl_fprintf(stderr,"nl_cuda.c:%d fatal error %d\n",line, status); CUDA()->cudaDeviceReset(); exit(-1); } } #define nlCUDACheck(status) nlCUDACheckImpl(status, __LINE__) typedef struct { NLuint m; NLuint n; NLenum 
type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; cusparseMatDescr_t descr; NLuint nnz; int* colind; int* rowptr; double* val; cusparseHybMat_t hyb; } NLCUDASparseMatrix; static void nlCRSMatrixCUDADestroyCRS(NLCUDASparseMatrix* Mcuda) { if(Mcuda->colind != NULL) { nlCUDACheck(CUDA()->cudaFree(Mcuda->colind)); Mcuda->colind = NULL; } if(Mcuda->rowptr != NULL) { nlCUDACheck(CUDA()->cudaFree(Mcuda->rowptr)); Mcuda->rowptr = NULL; } if(Mcuda->val != NULL) { nlCUDACheck(CUDA()->cudaFree(Mcuda->val)); Mcuda->val = NULL; } } static void nlCRSMatrixCUDADestroy(NLCUDASparseMatrix* Mcuda) { if(Mcuda->hyb != NULL) { nlCUDACheck(CUDA()->cusparseDestroyHybMat(Mcuda->hyb)); } nlCRSMatrixCUDADestroyCRS(Mcuda); nlCUDACheck(CUDA()->cusparseDestroyMatDescr(Mcuda->descr)); memset(Mcuda, 0, sizeof(*Mcuda)); } static void nlCRSMatrixCUDAMult( NLCUDASparseMatrix* Mcuda, const double* x, double* y ) { const double one = 1; const double zero = 0; if(Mcuda->hyb != NULL) { nlCUDACheck( CUDA()->cusparseDhybmv( CUDA()->HNDL_cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, Mcuda->descr, Mcuda->hyb, x, &zero, y ) ); } else { nlCUDACheck( CUDA()->cusparseDcsrmv( CUDA()->HNDL_cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE, (int)Mcuda->m, (int)Mcuda->n, (int)Mcuda->nnz, &one, Mcuda->descr, Mcuda->val, Mcuda->rowptr, Mcuda->colind, x, &zero, y ) ); } nlCUDABlas()->flops += (NLulong)(2*Mcuda->nnz); } NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M_in) { NLCUDASparseMatrix* Mcuda = NL_NEW(NLCUDASparseMatrix); NLCRSMatrix* M = (NLCRSMatrix*)(M_in); size_t colind_sz, rowptr_sz, val_sz; nl_assert(M_in->type == NL_MATRIX_CRS); nlCUDACheck(CUDA()->cusparseCreateMatDescr(&Mcuda->descr)); if(M->symmetric_storage) { nlCUDACheck(CUDA()->cusparseSetMatType( Mcuda->descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC) ); } else { nlCUDACheck(CUDA()->cusparseSetMatType( Mcuda->descr, CUSPARSE_MATRIX_TYPE_GENERAL) ); } nlCUDACheck(CUDA()->cusparseSetMatIndexBase( Mcuda->descr, CUSPARSE_INDEX_BASE_ZERO) ); Mcuda->m = M->m; Mcuda->n = M->n; Mcuda->nnz = nlCRSMatrixNNZ(M); colind_sz = (size_t)Mcuda->nnz*sizeof(int); rowptr_sz = (size_t)(Mcuda->m+1)*sizeof(int); val_sz = (size_t)Mcuda->nnz*sizeof(double); nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->colind,colind_sz)); nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->rowptr,rowptr_sz)); nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->val,val_sz)); nlCUDACheck(CUDA()->cudaMemcpy( Mcuda->colind, M->colind, colind_sz, cudaMemcpyHostToDevice) ); nlCUDACheck(CUDA()->cudaMemcpy( Mcuda->rowptr, M->rowptr, rowptr_sz, cudaMemcpyHostToDevice) ); nlCUDACheck(CUDA()->cudaMemcpy( Mcuda->val, M->val, val_sz, cudaMemcpyHostToDevice) ); Mcuda->hyb=NULL; if(!M->symmetric_storage) { nlCUDACheck(CUDA()->cusparseCreateHybMat(&Mcuda->hyb)); nlCUDACheck(CUDA()->cusparseDcsr2hyb( CUDA()->HNDL_cusparse, (int)M->m, (int)M->n, Mcuda->descr, Mcuda->val, Mcuda->rowptr, Mcuda->colind, Mcuda->hyb, 0, CUSPARSE_HYB_PARTITION_AUTO )); /* We no longer need the CRS part */ nlCRSMatrixCUDADestroyCRS(Mcuda); } Mcuda->type=NL_MATRIX_OTHER; Mcuda->destroy_func=(NLDestroyMatrixFunc)nlCRSMatrixCUDADestroy; Mcuda->mult_func=(NLMultMatrixVectorFunc)nlCRSMatrixCUDAMult; return (NLMatrix)Mcuda; } typedef struct { NLuint m; NLuint n; NLenum type; NLDestroyMatrixFunc destroy_func; NLMultMatrixVectorFunc mult_func; double* val; } NLDiagonalMatrixCUDA; static void nlDiagonalMatrixCUDADestroy(NLDiagonalMatrixCUDA* Mcuda) { nlCUDACheck(CUDA()->cudaFree(Mcuda->val)); memset(Mcuda, 0, sizeof(*Mcuda)); } static void 
nlDiagonalMatrixCUDAMult( NLDiagonalMatrixCUDA* Mcuda, const double* x, double* y ) { int N = (int)Mcuda->n; /* * vector x vector component-wise product implemented * using diagonal matrix x matrix function. */ nlCUDACheck(CUDA()->cublasDdgmm( CUDA()->HNDL_cublas, CUBLAS_SIDE_LEFT, N, 1, x, N, Mcuda->val, 1, y, N )); nlCUDABlas()->flops += (NLulong)N; } static NLMatrix nlDiagonalMatrixCUDANew(const double* diag, NLuint n) { NLDiagonalMatrixCUDA* Mcuda = NL_NEW(NLDiagonalMatrixCUDA); Mcuda->m = n; Mcuda->n = n; Mcuda->type = NL_MATRIX_OTHER; nlCUDACheck(CUDA()->cudaMalloc( (void**)&Mcuda->val, n*sizeof(double)) ); nlCUDACheck(CUDA()->cudaMemcpy( Mcuda->val, diag, n*sizeof(double), cudaMemcpyHostToDevice) ); Mcuda->destroy_func=(NLDestroyMatrixFunc)nlDiagonalMatrixCUDADestroy; Mcuda->mult_func=(NLMultMatrixVectorFunc)nlDiagonalMatrixCUDAMult; return (NLMatrix)Mcuda; } NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M_in) { NLuint N = M_in->n; NLuint i,jj; double* diag = NULL; NLMatrix result = NULL; NLCRSMatrix* M = (NLCRSMatrix*)(M_in); nl_assert(M_in->type == NL_MATRIX_CRS); diag = NL_NEW_ARRAY(double,N); for(i=0; i<N; ++i) { for(jj=M->rowptr[i]; jj<M->rowptr[i+1]; ++jj) { if(M->colind[jj] == i) { diag[i] = M->val[jj]; } } } for(i=0; i<N; ++i) { diag[i] = ((diag[i] == 0.0) ? 1.0 : 1.0 / diag[i]); } result = nlDiagonalMatrixCUDANew(diag, N); NL_DELETE_ARRAY(diag); return result; } static void* cuda_blas_malloc( NLBlas_t blas, NLmemoryType type, size_t size ) { void* result = NULL; blas->used_ram[type] += (NLulong)size; blas->max_used_ram[type] = MAX( blas->max_used_ram[type],blas->used_ram[type] ); if(type == NL_HOST_MEMORY) { result = malloc(size); } else { nlCUDACheck(CUDA()->cudaMalloc(&result,size)); } return result; } static void cuda_blas_free( NLBlas_t blas, NLmemoryType type, size_t size, void* ptr ) { blas->used_ram[type] -= (NLulong)size; if(type == NL_HOST_MEMORY) { free(ptr); } else { nlCUDACheck(CUDA()->cudaFree(ptr)); } } static void cuda_blas_memcpy( NLBlas_t blas, void* to, NLmemoryType to_type, void* from, NLmemoryType from_type, size_t size ) { enum cudaMemcpyKind kind = cudaMemcpyDefault; nl_arg_used(blas); if(from_type == NL_HOST_MEMORY) { if(to_type == NL_HOST_MEMORY) { kind = cudaMemcpyHostToHost; } else { kind = cudaMemcpyHostToDevice; } } else { if(to_type == NL_HOST_MEMORY) { kind = cudaMemcpyDeviceToHost; } else { kind = cudaMemcpyDeviceToDevice; } } nlCUDACheck(CUDA()->cudaMemcpy(to, from, size, kind)); } static void cuda_blas_dcopy( NLBlas_t blas, int n, const double *x, int incx, double *y, int incy ) { nl_arg_used(blas); CUDA()->cublasDcopy(CUDA()->HNDL_cublas,n,x,incx,y,incy); } static double cuda_blas_ddot( NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy ) { double result = 0.0; blas->flops += (NLulong)(2*n); CUDA()->cublasDdot(CUDA()->HNDL_cublas,n,x,incx,y,incy,&result); return result; } static double cuda_blas_dnrm2( NLBlas_t blas, int n, const double *x, int incx ) { double result = 0.0; blas->flops += (NLulong)(2*n); CUDA()->cublasDnrm2(CUDA()->HNDL_cublas,n,x,incx,&result); return result; } static void cuda_blas_daxpy( NLBlas_t blas, int n, double a, const double *x, int incx, double *y, int incy ) { blas->flops += (NLulong)(2*n); CUDA()->cublasDaxpy(CUDA()->HNDL_cublas,n,&a,x,incx,y,incy); } static void cuda_blas_dscal( NLBlas_t blas, int n, double a, double *x, int incx ) { blas->flops += (NLulong)n; CUDA()->cublasDscal(CUDA()->HNDL_cublas,n,&a,x,incx); } static void cuda_blas_dgemv( NLBlas_t blas, MatrixTranspose trans, int m, int n, double alpha, const double *A, int ldA, const double *x, int incx, double beta, double *y, int incy ) { nl_arg_used(blas); /* TODO: update FLOPS */ CUDA()->cublasDgemv( CUDA()->HNDL_cublas, (cublasOperation_t)trans, m, n, &alpha, A, ldA, x, incx, &beta, y,
incy ); } static void cuda_blas_dtpsv( NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans, MatrixUnitTriangular diag, int n, const double *AP, double *x, int incx ) { nl_arg_used(blas); /* TODO: update FLOPS */ CUDA()->cublasDtpsv( CUDA()->HNDL_cublas, (cublasFillMode_t)uplo, (cublasOperation_t)trans, (cublasDiagType_t)diag, n, AP, x, incx ); } NLBlas_t nlCUDABlas() { static NLboolean initialized = NL_FALSE; static struct NLBlas blas; if(!initialized) { memset(&blas, 0, sizeof(blas)); blas.has_unified_memory = NL_FALSE; blas.Malloc = cuda_blas_malloc; blas.Free = cuda_blas_free; blas.Memcpy = cuda_blas_memcpy; blas.Dcopy = cuda_blas_dcopy; blas.Ddot = cuda_blas_ddot; blas.Dnrm2 = cuda_blas_dnrm2; blas.Daxpy = cuda_blas_daxpy; blas.Dscal = cuda_blas_dscal; blas.Dgemv = cuda_blas_dgemv; blas.Dtpsv = cuda_blas_dtpsv; nlBlasResetStats(&blas); initialized = NL_TRUE; } return &blas; } /******* extracted from nl_api.c *******/ static NLSparseMatrix* nlGetCurrentSparseMatrix() { NLSparseMatrix* result = NULL; switch(nlCurrentContext->matrix_mode) { case NL_STIFFNESS_MATRIX: { nl_assert(nlCurrentContext->M != NULL); nl_assert(nlCurrentContext->M->type == NL_MATRIX_SPARSE_DYNAMIC); result = (NLSparseMatrix*)(nlCurrentContext->M); } break; case NL_MASS_MATRIX: { nl_assert(nlCurrentContext->B != NULL); nl_assert(nlCurrentContext->B->type == NL_MATRIX_SPARSE_DYNAMIC); result = (NLSparseMatrix*)(nlCurrentContext->B); } break; default: nl_assert_not_reached; } return result; } NLboolean nlInitExtension(const char* extension) { if(!strcmp(extension, "SUPERLU")) { return nlInitExtension_SUPERLU(); } else if(!strcmp(extension, "CHOLMOD")) { return nlInitExtension_CHOLMOD(); } else if(!strcmp(extension, "ARPACK")) { /* * SUPERLU is needed by OpenNL's ARPACK driver * (factorizes the matrix for the shift-invert spectral * transform). */ return nlInitExtension_SUPERLU() && nlInitExtension_ARPACK(); } else if(!strcmp(extension, "MKL")) { return nlInitExtension_MKL(); } else if(!strcmp(extension, "CUDA")) { return nlInitExtension_CUDA(); } return NL_FALSE; } NLboolean nlExtensionIsInitialized(const char* extension) { if(!strcmp(extension, "SUPERLU")) { return nlExtensionIsInitialized_SUPERLU(); } else if(!strcmp(extension, "CHOLMOD")) { return nlExtensionIsInitialized_CHOLMOD(); } else if(!strcmp(extension, "ARPACK")) { /* * SUPERLU is needed by OpenNL's ARPACK driver * (factorizes the matrix for the shift-invert spectral * transform). */ return nlExtensionIsInitialized_SUPERLU() && nlExtensionIsInitialized_ARPACK(); } else if(!strcmp(extension, "MKL")) { return nlExtensionIsInitialized_MKL(); } else if(!strcmp(extension, "CUDA")) { return nlExtensionIsInitialized_CUDA(); } return NL_FALSE; } void nlInitialize(int argc, char** argv) { int i=0; char* ptr=NULL; char extension[255]; /* Find all the arguments with the form: * nl:=true|false * and try to activate the corresponding extensions. 
*/ for(i=1; i 3) && (ptr != NULL)) { strncpy(extension, argv[i]+3, (size_t)(ptr-argv[i]-3)); extension[(size_t)(ptr-argv[i]-3)] = '\0'; if(nlInitExtension(extension)) { nl_fprintf(stdout,"OpenNL %s: initialized\n", extension); } else { nl_fprintf(stderr,"OpenNL %s: could not initialize\n", extension); } } } } /* Get/Set parameters */ void nlSolverParameterd(NLenum pname, NLdouble param) { nlCheckState(NL_STATE_INITIAL); switch(pname) { case NL_THRESHOLD: { nl_assert(param >= 0); nlCurrentContext->threshold = (NLdouble)param; nlCurrentContext->threshold_defined = NL_TRUE; } break; case NL_OMEGA: { nl_range_assert(param,1.0,2.0); nlCurrentContext->omega = (NLdouble)param; } break; default: { nlError("nlSolverParameterd","Invalid parameter"); nl_assert_not_reached; } } } void nlSolverParameteri(NLenum pname, NLint param) { nlCheckState(NL_STATE_INITIAL); switch(pname) { case NL_SOLVER: { nlCurrentContext->solver = (NLenum)param; } break; case NL_NB_VARIABLES: { nl_assert(param > 0); nlCurrentContext->nb_variables = (NLuint)param; } break; case NL_NB_SYSTEMS: { nl_assert(param > 0); nlCurrentContext->nb_systems = (NLuint)param; } break; case NL_LEAST_SQUARES: { nlCurrentContext->least_squares = (NLboolean)param; } break; case NL_MAX_ITERATIONS: { nl_assert(param > 0); nlCurrentContext->max_iterations = (NLuint)param; nlCurrentContext->max_iterations_defined = NL_TRUE; } break; case NL_SYMMETRIC: { nlCurrentContext->symmetric = (NLboolean)param; } break; case NL_INNER_ITERATIONS: { nl_assert(param > 0); nlCurrentContext->inner_iterations = (NLuint)param; } break; case NL_PRECONDITIONER: { nlCurrentContext->preconditioner = (NLuint)param; nlCurrentContext->preconditioner_defined = NL_TRUE; } break; default: { nlError("nlSolverParameteri","Invalid parameter"); nl_assert_not_reached; } } } void nlGetBooleanv(NLenum pname, NLboolean* params) { switch(pname) { case NL_LEAST_SQUARES: { *params = nlCurrentContext->least_squares; } break; case NL_SYMMETRIC: { *params = nlCurrentContext->symmetric; } break; default: { nlError("nlGetBooleanv","Invalid parameter"); nl_assert_not_reached; } } } void nlGetDoublev(NLenum pname, NLdouble* params) { switch(pname) { case NL_THRESHOLD: { *params = nlCurrentContext->threshold; } break; case NL_OMEGA: { *params = nlCurrentContext->omega; } break; case NL_ERROR: { *params = nlCurrentContext->error; } break; case NL_ELAPSED_TIME: { *params = nlCurrentContext->elapsed_time; } break; case NL_GFLOPS: { if(nlCurrentContext->elapsed_time == 0) { *params = 0.0; } else { *params = (NLdouble)(nlCurrentContext->flops) / (nlCurrentContext->elapsed_time * 1e9); } } break; default: { nlError("nlGetDoublev","Invalid parameter"); nl_assert_not_reached; } } } void nlGetIntegerv(NLenum pname, NLint* params) { switch(pname) { case NL_SOLVER: { *params = (NLint)(nlCurrentContext->solver); } break; case NL_NB_VARIABLES: { *params = (NLint)(nlCurrentContext->nb_variables); } break; case NL_NB_SYSTEMS: { *params = (NLint)(nlCurrentContext->nb_systems); } break; case NL_LEAST_SQUARES: { *params = (NLint)(nlCurrentContext->least_squares); } break; case NL_MAX_ITERATIONS: { *params = (NLint)(nlCurrentContext->max_iterations); } break; case NL_SYMMETRIC: { *params = (NLint)(nlCurrentContext->symmetric); } break; case NL_USED_ITERATIONS: { *params = (NLint)(nlCurrentContext->used_iterations); } break; case NL_PRECONDITIONER: { *params = (NLint)(nlCurrentContext->preconditioner); } break; case NL_NNZ: { *params = (NLint)(nlMatrixNNZ(nlCurrentContext->M)); } break; default: { 
nlError("nlGetIntegerv","Invalid parameter"); nl_assert_not_reached; } } } /* Enable / Disable */ void nlEnable(NLenum pname) { switch(pname) { case NL_NORMALIZE_ROWS: { nl_assert(nlCurrentContext->state != NL_STATE_ROW); nlCurrentContext->normalize_rows = NL_TRUE; } break; case NL_VERBOSE: { nlCurrentContext->verbose = NL_TRUE; } break; case NL_VARIABLES_BUFFER: { nlCurrentContext->user_variable_buffers = NL_TRUE; } break; default: { nlError("nlEnable","Invalid parameter"); nl_assert_not_reached; } } } void nlDisable(NLenum pname) { switch(pname) { case NL_NORMALIZE_ROWS: { nl_assert(nlCurrentContext->state != NL_STATE_ROW); nlCurrentContext->normalize_rows = NL_FALSE; } break; case NL_VERBOSE: { nlCurrentContext->verbose = NL_FALSE; } break; case NL_VARIABLES_BUFFER: { nlCurrentContext->user_variable_buffers = NL_FALSE; } break; default: { nlError("nlDisable","Invalid parameter"); nl_assert_not_reached; } } } NLboolean nlIsEnabled(NLenum pname) { NLboolean result = NL_FALSE; switch(pname) { case NL_NORMALIZE_ROWS: { result = nlCurrentContext->normalize_rows; } break; case NL_VERBOSE: { result = nlCurrentContext->verbose; } break; case NL_VARIABLES_BUFFER: { result = nlCurrentContext->user_variable_buffers; } break; default: { nlError("nlIsEnables","Invalid parameter"); nl_assert_not_reached; } } return result; } /* NL functions */ void nlSetFunction(NLenum pname, NLfunc param) { switch(pname) { case NL_FUNC_SOLVER: nlCurrentContext->solver_func = (NLSolverFunc)(param); nlCurrentContext->solver = NL_SOLVER_USER; break; case NL_FUNC_MATRIX: nlDeleteMatrix(nlCurrentContext->M); nlCurrentContext->M = nlMatrixNewFromFunction( nlCurrentContext->n, nlCurrentContext->n, (NLMatrixFunc)param ); break; case NL_FUNC_PRECONDITIONER: nlDeleteMatrix(nlCurrentContext->P); nlCurrentContext->P = nlMatrixNewFromFunction( nlCurrentContext->n, nlCurrentContext->n, (NLMatrixFunc)param ); nlCurrentContext->preconditioner = NL_PRECOND_USER; break; case NL_FUNC_PROGRESS: nlCurrentContext->progress_func = (NLProgressFunc)(param); break; default: nlError("nlSetFunction","Invalid parameter"); nl_assert_not_reached; } } void nlGetFunction(NLenum pname, NLfunc* param) { switch(pname) { case NL_FUNC_SOLVER: *param = (NLfunc)(nlCurrentContext->solver_func); break; case NL_FUNC_MATRIX: *param = (NLfunc)(nlMatrixGetFunction(nlCurrentContext->M)); break; case NL_FUNC_PRECONDITIONER: *param = (NLfunc)(nlMatrixGetFunction(nlCurrentContext->P)); break; default: nlError("nlGetFunction","Invalid parameter"); nl_assert_not_reached; } } /* Get/Set Lock/Unlock variables */ void nlSetVariable(NLuint index, NLdouble value) { nlCheckState(NL_STATE_SYSTEM); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[0],index) = value; } void nlMultiSetVariable(NLuint index, NLuint system, NLdouble value) { nlCheckState(NL_STATE_SYSTEM); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables-1); nl_debug_range_assert(system, 0, nlCurrentContext->nb_systems-1); NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[system],index) = value; } NLdouble nlGetVariable(NLuint index) { nl_assert(nlCurrentContext->state != NL_STATE_INITIAL); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); return NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[0],index); } NLdouble nlMultiGetVariable(NLuint index, NLuint system) { nl_assert(nlCurrentContext->state != NL_STATE_INITIAL); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables-1); nl_debug_range_assert(system, 
0, nlCurrentContext->nb_systems-1); return NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[system],index); } void nlLockVariable(NLuint index) { nlCheckState(NL_STATE_SYSTEM); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); nlCurrentContext->variable_is_locked[index] = NL_TRUE; } void nlUnlockVariable(NLuint index) { nlCheckState(NL_STATE_SYSTEM); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); nlCurrentContext->variable_is_locked[index] = NL_FALSE; } NLboolean nlVariableIsLocked(NLuint index) { nl_assert(nlCurrentContext->state != NL_STATE_INITIAL); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); return nlCurrentContext->variable_is_locked[index]; } /* System construction */ static void nlVariablesToVector() { NLuint n=nlCurrentContext->n; NLuint k,i,index; NLdouble value; nl_assert(nlCurrentContext->x != NULL); for(k=0; k<nlCurrentContext->nb_systems; ++k) { for(i=0; i<nlCurrentContext->nb_variables; ++i) { if(!nlCurrentContext->variable_is_locked[i]) { index = nlCurrentContext->variable_index[i]; nl_assert(index < nlCurrentContext->n); value = NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],i); nlCurrentContext->x[index+k*n] = value; } } } } static void nlVectorToVariables() { NLuint n=nlCurrentContext->n; NLuint k,i,index; NLdouble value; nl_assert(nlCurrentContext->x != NULL); for(k=0; k<nlCurrentContext->nb_systems; ++k) { for(i=0; i<nlCurrentContext->nb_variables; ++i) { if(!nlCurrentContext->variable_is_locked[i]) { index = nlCurrentContext->variable_index[i]; nl_assert(index < nlCurrentContext->n); value = nlCurrentContext->x[index+k*n]; NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],i) = value; } } } } static void nlBeginSystem() { NLuint k; nlTransition(NL_STATE_INITIAL, NL_STATE_SYSTEM); nl_assert(nlCurrentContext->nb_variables > 0); nlCurrentContext->variable_buffer = NL_NEW_ARRAY( NLBufferBinding, nlCurrentContext->nb_systems ); if(nlCurrentContext->user_variable_buffers) { nlCurrentContext->variable_value = NULL; } else { nlCurrentContext->variable_value = NL_NEW_ARRAY( NLdouble, nlCurrentContext->nb_variables * nlCurrentContext->nb_systems ); for(k=0; k<nlCurrentContext->nb_systems; ++k) { nlCurrentContext->variable_buffer[k].base_address = nlCurrentContext->variable_value + k * nlCurrentContext->nb_variables; nlCurrentContext->variable_buffer[k].stride = sizeof(NLdouble); } } nlCurrentContext->variable_is_locked = NL_NEW_ARRAY( NLboolean, nlCurrentContext->nb_variables ); nlCurrentContext->variable_index = NL_NEW_ARRAY( NLuint, nlCurrentContext->nb_variables ); } static void nlEndSystem() { nlTransition(NL_STATE_MATRIX_CONSTRUCTED, NL_STATE_SYSTEM_CONSTRUCTED); } static void nlInitializeM() { NLuint i; NLuint n = 0; NLenum storage = NL_MATRIX_STORE_ROWS; for(i=0; i<nlCurrentContext->nb_variables; i++) { if(!nlCurrentContext->variable_is_locked[i]) { nlCurrentContext->variable_index[i] = n; n++; } else { nlCurrentContext->variable_index[i] = (NLuint)~0; } } nlCurrentContext->n = n; /* * If the user trusts OpenNL and has left the solver as NL_SOLVER_DEFAULT, * then we set up reasonable parameters for them.
*/ if(nlCurrentContext->solver == NL_SOLVER_DEFAULT) { if(nlCurrentContext->least_squares || nlCurrentContext->symmetric) { nlCurrentContext->solver = NL_CG; if(!nlCurrentContext->preconditioner_defined) { nlCurrentContext->preconditioner = NL_PRECOND_JACOBI; } } else { nlCurrentContext->solver = NL_BICGSTAB; } if(!nlCurrentContext->max_iterations_defined) { nlCurrentContext->max_iterations = n*5; } if(!nlCurrentContext->threshold_defined) { nlCurrentContext->threshold = 1e-6; } } /* SSOR preconditioner requires rows and columns */ if(nlCurrentContext->preconditioner == NL_PRECOND_SSOR) { storage = (storage | NL_MATRIX_STORE_COLUMNS); } /* a least squares problem results in a symmetric matrix */ if(nlCurrentContext->least_squares) { nlCurrentContext->symmetric = NL_TRUE; } if( nlCurrentContext->symmetric && nlCurrentContext->preconditioner == NL_PRECOND_SSOR ) { /* * For now, only used with the SSOR preconditioner, because * for other modes it is either unsupported (SUPERLU) or * causes a performance loss (non-parallel sparse SpMV) */ storage = (storage | NL_MATRIX_STORE_SYMMETRIC); } nlCurrentContext->M = (NLMatrix)(NL_NEW(NLSparseMatrix)); nlSparseMatrixConstruct( (NLSparseMatrix*)(nlCurrentContext->M), n, n, storage ); nlCurrentContext->x = NL_NEW_ARRAY( NLdouble, n*nlCurrentContext->nb_systems ); nlCurrentContext->b = NL_NEW_ARRAY( NLdouble, n*nlCurrentContext->nb_systems ); nlVariablesToVector(); nlRowColumnConstruct(&nlCurrentContext->af); nlRowColumnConstruct(&nlCurrentContext->al); nlCurrentContext->right_hand_side = NL_NEW_ARRAY( double, nlCurrentContext->nb_systems ); nlCurrentContext->current_row = 0; } static void nlEndMatrix() { nlTransition(NL_STATE_MATRIX, NL_STATE_MATRIX_CONSTRUCTED); nlRowColumnClear(&nlCurrentContext->af); nlRowColumnClear(&nlCurrentContext->al); if(!nlCurrentContext->least_squares) { nl_assert( nlCurrentContext->ij_coefficient_called || ( nlCurrentContext->current_row == nlCurrentContext->n ) ); } } static void nlBeginRow() { nlTransition(NL_STATE_MATRIX, NL_STATE_ROW); nlRowColumnZero(&nlCurrentContext->af); nlRowColumnZero(&nlCurrentContext->al); } static void nlScaleRow(NLdouble s) { NLRowColumn* af = &nlCurrentContext->af; NLRowColumn* al = &nlCurrentContext->al; NLuint nf = af->size; NLuint nl = al->size; NLuint i,k; for(i=0; i<nf; ++i) { af->coeff[i].value *= s; } for(i=0; i<nl; ++i) { al->coeff[i].value *= s; } for(k=0; k<nlCurrentContext->nb_systems; ++k) { nlCurrentContext->right_hand_side[k] *= s; } } static void nlNormalizeRow(NLdouble weight) { NLRowColumn* af = &nlCurrentContext->af; NLRowColumn* al = &nlCurrentContext->al; NLuint nf = af->size; NLuint nl = al->size; NLuint i; NLdouble norm = 0.0; for(i=0; i<nf; ++i) { norm += af->coeff[i].value * af->coeff[i].value; } for(i=0; i<nl; ++i) { norm += al->coeff[i].value * al->coeff[i].value; } norm = sqrt(norm); nlScaleRow(weight / norm); } static void nlEndRow() { NLRowColumn* af = &nlCurrentContext->af; NLRowColumn* al = &nlCurrentContext->al; NLSparseMatrix* M = nlGetCurrentSparseMatrix(); NLdouble* b = nlCurrentContext->b; NLuint nf = af->size; NLuint nl = al->size; NLuint n = nlCurrentContext->n; NLuint current_row = nlCurrentContext->current_row; NLuint i,j,jj; NLdouble S; NLuint k; nlTransition(NL_STATE_ROW, NL_STATE_MATRIX); if(nlCurrentContext->normalize_rows) { nlNormalizeRow(nlCurrentContext->row_scaling); } else if(nlCurrentContext->row_scaling != 1.0) { nlScaleRow(nlCurrentContext->row_scaling); } /* * if least_squares : we want to solve * A'A x = A'b */ if(nlCurrentContext->least_squares) { for(i=0; i<nf; ++i) { for(j=0; j<nf; ++j) { nlSparseMatrixAdd( M, af->coeff[i].index, af->coeff[j].index, af->coeff[i].value * af->coeff[j].value ); } }
for(k=0; k<nlCurrentContext->nb_systems; ++k) { S = -nlCurrentContext->right_hand_side[k]; for(jj=0; jj<nl; ++jj) { j = al->coeff[jj].index; S += al->coeff[jj].value * NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],j); } for(jj=0; jj<nf; ++jj) { b[ k*n+af->coeff[jj].index ] -= af->coeff[jj].value * S; } } } else { for(jj=0; jj<nf; ++jj) { nlSparseMatrixAdd( M, current_row, af->coeff[jj].index, af->coeff[jj].value ); } for(k=0; k<nlCurrentContext->nb_systems; ++k) { b[k*n+current_row] = nlCurrentContext->right_hand_side[k]; for(jj=0; jj<nl; ++jj) { j = al->coeff[jj].index; b[k*n+current_row] -= al->coeff[jj].value * NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],j); } } } nlCurrentContext->current_row++; for(k=0; k<nlCurrentContext->nb_systems; ++k) { nlCurrentContext->right_hand_side[k] = 0.0; } nlCurrentContext->row_scaling = 1.0; } void nlCoefficient(NLuint index, NLdouble value) { nlCheckState(NL_STATE_ROW); nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1); if(nlCurrentContext->variable_is_locked[index]) { /* * Note: in al, indices are NLvariable indices, * within [0..nb_variables-1] */ nlRowColumnAppend(&(nlCurrentContext->al), index, value); } else { /* * Note: in af, indices are system indices, * within [0..n-1] */ nlRowColumnAppend( &(nlCurrentContext->af), nlCurrentContext->variable_index[index], value ); } } void nlAddIJCoefficient(NLuint i, NLuint j, NLdouble value) { NLSparseMatrix* M = nlGetCurrentSparseMatrix(); nlCheckState(NL_STATE_MATRIX); nl_debug_range_assert(i, 0, nlCurrentContext->nb_variables - 1); nl_debug_range_assert(j, 0, nlCurrentContext->nb_variables - 1); #ifdef NL_DEBUG for(NLuint i=0; i<nlCurrentContext->nb_variables; ++i) { nl_debug_assert(!nlCurrentContext->variable_is_locked[i]); } #endif nlSparseMatrixAdd(M, i, j, value); nlCurrentContext->ij_coefficient_called = NL_TRUE; } void nlAddIRightHandSide(NLuint i, NLdouble value) { nlCheckState(NL_STATE_MATRIX); nl_debug_range_assert(i, 0, nlCurrentContext->nb_variables - 1); #ifdef NL_DEBUG for(NLuint i=0; i<nlCurrentContext->nb_variables; ++i) { nl_debug_assert(!nlCurrentContext->variable_is_locked[i]); } #endif nlCurrentContext->b[i] += value; nlCurrentContext->ij_coefficient_called = NL_TRUE; } void nlMultiAddIRightHandSide(NLuint i, NLuint k, NLdouble value) { NLuint n = nlCurrentContext->n; nlCheckState(NL_STATE_MATRIX); nl_debug_range_assert(i, 0, nlCurrentContext->nb_variables - 1); nl_debug_range_assert(k, 0, nlCurrentContext->nb_systems - 1); #ifdef NL_DEBUG for(NLuint i=0; i<nlCurrentContext->nb_variables; ++i) { nl_debug_assert(!nlCurrentContext->variable_is_locked[i]); } #endif nlCurrentContext->b[i + k*n] += value; nlCurrentContext->ij_coefficient_called = NL_TRUE; } void nlRightHandSide(NLdouble value) { nlCurrentContext->right_hand_side[0] = value; } void nlMultiRightHandSide(NLuint k, NLdouble value) { nl_debug_range_assert(k, 0, nlCurrentContext->nb_systems - 1); nlCurrentContext->right_hand_side[k] = value; } void nlRowScaling(NLdouble value) { nlCheckState(NL_STATE_MATRIX); nlCurrentContext->row_scaling = value; } void nlBegin(NLenum prim) { switch(prim) { case NL_SYSTEM: { nlBeginSystem(); } break; case NL_MATRIX: { nlTransition(NL_STATE_SYSTEM, NL_STATE_MATRIX); if( nlCurrentContext->matrix_mode == NL_STIFFNESS_MATRIX && nlCurrentContext->M == NULL ) { nlInitializeM(); } } break; case NL_ROW: { nlBeginRow(); } break; default: { nl_assert_not_reached; } } } void nlEnd(NLenum prim) { switch(prim) { case NL_SYSTEM: { nlEndSystem(); } break; case NL_MATRIX: { nlEndMatrix(); } break; case NL_ROW: { nlEndRow(); } break; default: { nl_assert_not_reached; } } } /* nlSolve() driver routine */ NLboolean nlSolve() { NLboolean result; nlCheckState(NL_STATE_SYSTEM_CONSTRUCTED);
nlCurrentContext->start_time = nlCurrentTime(); nlCurrentContext->elapsed_time = 0.0; nlCurrentContext->flops = 0; result = nlCurrentContext->solver_func(); nlVectorToVariables(); nlCurrentContext->elapsed_time = nlCurrentTime() - nlCurrentContext->start_time; nlTransition(NL_STATE_SYSTEM_CONSTRUCTED, NL_STATE_SOLVED); return result; } void nlUpdateRightHandSide(NLdouble* values) { /* * If we are in the solved state, get back to the * constructed state. */ nl_assert(nlCurrentContext->nb_systems == 1); if(nlCurrentContext->state == NL_STATE_SOLVED) { nlTransition(NL_STATE_SOLVED, NL_STATE_SYSTEM_CONSTRUCTED); } nlCheckState(NL_STATE_SYSTEM_CONSTRUCTED); memcpy(nlCurrentContext->x, values, nlCurrentContext->n * sizeof(double)); } /* Buffers management */ void nlBindBuffer( NLenum buffer, NLuint k, void* addr, NLuint stride ) { nlCheckState(NL_STATE_SYSTEM); nl_assert(nlIsEnabled(buffer)); nl_assert(buffer == NL_VARIABLES_BUFFER); nl_assert(knb_systems); if(stride == 0) { stride = sizeof(NLdouble); } nlCurrentContext->variable_buffer[k].base_address = addr; nlCurrentContext->variable_buffer[k].stride = stride; } /* Eigen solver */ void nlMatrixMode(NLenum matrix) { NLuint n = 0; NLuint i; nl_assert( nlCurrentContext->state == NL_STATE_SYSTEM || nlCurrentContext->state == NL_STATE_MATRIX_CONSTRUCTED ); nlCurrentContext->state = NL_STATE_SYSTEM; nlCurrentContext->matrix_mode = matrix; nlCurrentContext->current_row = 0; nlCurrentContext->ij_coefficient_called = NL_FALSE; switch(matrix) { case NL_STIFFNESS_MATRIX: { /* Stiffness matrix is already constructed. */ } break ; case NL_MASS_MATRIX: { if(nlCurrentContext->B == NULL) { for(i=0; inb_variables; ++i) { if(!nlCurrentContext->variable_is_locked[i]) { ++n; } } nlCurrentContext->B = (NLMatrix)(NL_NEW(NLSparseMatrix)); nlSparseMatrixConstruct( (NLSparseMatrix*)(nlCurrentContext->B), n, n, NL_MATRIX_STORE_ROWS ); } } break ; default: nl_assert_not_reached; } } void nlEigenSolverParameterd( NLenum pname, NLdouble val ) { switch(pname) { case NL_EIGEN_SHIFT: { nlCurrentContext->eigen_shift = val; } break; case NL_EIGEN_THRESHOLD: { nlSolverParameterd(pname, val); } break; default: nl_assert_not_reached; } } void nlEigenSolverParameteri( NLenum pname, NLint val ) { switch(pname) { case NL_EIGEN_SOLVER: { nlCurrentContext->eigen_solver = (NLenum)val; } break; case NL_SYMMETRIC: case NL_NB_VARIABLES: case NL_NB_EIGENS: case NL_EIGEN_MAX_ITERATIONS: { nlSolverParameteri(pname, val); } break; default: nl_assert_not_reached; } } void nlEigenSolve() { if(nlCurrentContext->eigen_value == NULL) { nlCurrentContext->eigen_value = NL_NEW_ARRAY( NLdouble,nlCurrentContext->nb_systems ); } nlMatrixCompress(&nlCurrentContext->M); if(nlCurrentContext->B != NULL) { nlMatrixCompress(&nlCurrentContext->B); } switch(nlCurrentContext->eigen_solver) { case NL_ARPACK_EXT: nlEigenSolve_ARPACK(); break; default: nl_assert_not_reached; } } double nlGetEigenValue(NLuint i) { nl_debug_assert(i < nlCurrentContext->nb_variables); return nlCurrentContext->eigen_value[i]; }
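/*
 * Illustrative usage sketch (added by the editor, not part of the OpenNL
 * distribution). It shows how the construction API defined above
 * (nlBegin/nlEnd, nlSetVariable, nlLockVariable, nlCoefficient,
 * nlRightHandSide, nlSolve, nlGetVariable) fits together for a tiny
 * least-squares system. nlNewContext(), nlDeleteContext() and the NLContext
 * type are assumed to be declared in OpenNL_psm.h; the NL_PSM_USAGE_EXAMPLE
 * guard is hypothetical and never defined by OpenNL, it only keeps this
 * example out of normal builds.
 */
#ifdef NL_PSM_USAGE_EXAMPLE
#include <stdio.h>
static void nl_psm_example_least_squares(void) {
    NLContext context = nlNewContext();       /* assumed public API */
    nlSolverParameteri(NL_NB_VARIABLES, 2);
    nlSolverParameteri(NL_LEAST_SQUARES, NL_TRUE);
    nlBegin(NL_SYSTEM);
    nlSetVariable(0, 1.0);                    /* give variable 0 a value... */
    nlLockVariable(0);                        /* ...and keep it fixed       */
    nlBegin(NL_MATRIX);
    nlBegin(NL_ROW);                          /* row 1: x0 + x1 = 1         */
    nlCoefficient(0, 1.0);
    nlCoefficient(1, 1.0);
    nlRightHandSide(1.0);
    nlEnd(NL_ROW);
    nlBegin(NL_ROW);                          /* row 2: x1 = 2              */
    nlCoefficient(1, 1.0);
    nlRightHandSide(2.0);
    nlEnd(NL_ROW);
    nlEnd(NL_MATRIX);
    nlEnd(NL_SYSTEM);
    nlSolve();
    printf("x0=%f (locked) x1=%f\n", nlGetVariable(0), nlGetVariable(1));
    nlDeleteContext(context);                 /* assumed public API */
}
#endif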
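/*
 * Eigen-solver usage sketch (added by the editor, not part of the OpenNL
 * distribution): a minimal, hedged example of driving nlEigenSolve(), and
 * thus the ARPACK driver nlEigenSolve_ARPACK() defined above, on a symmetric
 * 1D Laplacian assembled with nlAddIJCoefficient(). It assumes the ARPACK
 * and SUPERLU dynamic libraries are available at run time and that
 * NL_NB_EIGENS resolves as declared in OpenNL_psm.h; the guard macro is
 * hypothetical, as above.
 */
#ifdef NL_PSM_USAGE_EXAMPLE
#include <stdio.h>
static void nl_psm_example_eigen(int n, int nb_eigens) {
    int i, k;
    NLContext context;
    if(!nlInitExtension("ARPACK")) {          /* loads SUPERLU + ARPACK */
        return;
    }
    context = nlNewContext();                 /* assumed public API */
    nlSolverParameteri(NL_NB_VARIABLES, n);
    nlSolverParameteri(NL_SYMMETRIC, NL_TRUE);
    nlEigenSolverParameteri(NL_EIGEN_SOLVER, NL_ARPACK_EXT);
    nlEigenSolverParameteri(NL_NB_EIGENS, nb_eigens);
    nlEigenSolverParameterd(NL_EIGEN_SHIFT, 0.0);
    nlBegin(NL_SYSTEM);
    nlBegin(NL_MATRIX);
    for(i=0; i<n; ++i) {                      /* 1D Laplacian stencil */
        nlAddIJCoefficient((NLuint)i, (NLuint)i, 2.0);
        if(i > 0)     { nlAddIJCoefficient((NLuint)i, (NLuint)(i-1), -1.0); }
        if(i < n - 1) { nlAddIJCoefficient((NLuint)i, (NLuint)(i+1), -1.0); }
    }
    nlEnd(NL_MATRIX);
    nlEnd(NL_SYSTEM);
    nlEigenSolve();                           /* dispatches to ARPACK */
    for(k=0; k<nb_eigens; ++k) {
        /* eigenvector k components: nlMultiGetVariable((NLuint)i, (NLuint)k) */
        printf("lambda[%d] = %f\n", k, nlGetEigenValue((NLuint)k));
    }
    nlDeleteContext(context);
}
#endif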