8051 lines
212 KiB
C
8051 lines
212 KiB
C
|
#include "OpenNL_psm.h"
|
||
|
|
||
|
/*
|
||
|
* Copyright (c) 2004-2010, Bruno Levy
|
||
|
* All rights reserved.
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions are met:
|
||
|
*
|
||
|
* * Redistributions of source code must retain the above copyright notice,
|
||
|
* this list of conditions and the following disclaimer.
|
||
|
* * Redistributions in binary form must reproduce the above copyright notice,
|
||
|
* this list of conditions and the following disclaimer in the documentation
|
||
|
* and/or other materials provided with the distribution.
|
||
|
* * Neither the name of the ALICE Project-Team nor the names of its
|
||
|
* contributors may be used to endorse or promote products derived from this
|
||
|
* software without specific prior written permission.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||
|
*
|
||
|
* If you modify this software, you should include a notice giving the
|
||
|
* name of the person performing the modification, the date of modification,
|
||
|
* and the reason for such modification.
|
||
|
*
|
||
|
* Contact: Bruno Levy
|
||
|
*
|
||
|
* levy@loria.fr
|
||
|
*
|
||
|
* ALICE Project
|
||
|
* LORIA, INRIA Lorraine,
|
||
|
* Campus Scientifique, BP 239
|
||
|
* 54506 VANDOEUVRE LES NANCY CEDEX
|
||
|
* FRANCE
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
|
||
|
/*
|
||
|
* This file is a PSM (pluggable software module)
|
||
|
* generated from the distribution of Geogram.
|
||
|
*
|
||
|
* See Geogram documentation on:
|
||
|
* http://alice.loria.fr/software/geogram/doc/html/index.html
|
||
|
*
|
||
|
* See documentation of the functions bundled in this PSM on:
|
||
|
* http://alice.loria.fr/software/geogram/doc/html/nl_8h.html
|
||
|
*/
|
||
|
|
||
|
|
||
|
|
||
|
/******* extracted from nl_private.h *******/
|
||
|
|
||
|
#ifndef OPENNL_PRIVATE_H
|
||
|
#define OPENNL_PRIVATE_H
|
||
|
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
#include <stdio.h>
|
||
|
#include <math.h>
|
||
|
|
||
|
|
||
|
#if defined(__APPLE__) && defined(__MACH__)
|
||
|
#define NL_OS_APPLE
|
||
|
#endif
|
||
|
|
||
|
#if defined(__linux__) || defined(__ANDROID__) || defined(NL_OS_APPLE)
|
||
|
#define NL_OS_UNIX
|
||
|
#endif
|
||
|
|
||
|
|
||
|
#if defined(WIN32) || defined(_WIN64)
|
||
|
#define NL_OS_WINDOWS
|
||
|
#endif
|
||
|
|
||
|
#define nl_arg_used(x) (void)x
|
||
|
|
||
|
|
||
|
#if defined(__clang__) || defined(__GNUC__)
|
||
|
#define NL_NORETURN __attribute__((noreturn))
|
||
|
#else
|
||
|
#define NL_NORETURN
|
||
|
#endif
|
||
|
|
||
|
#if defined(_MSC_VER)
|
||
|
#define NL_NORETURN_DECL __declspec(noreturn)
|
||
|
#else
|
||
|
#define NL_NORETURN_DECL
|
||
|
#endif
|
||
|
|
||
|
NL_NORETURN_DECL void nl_assertion_failed(
|
||
|
const char* cond, const char* file, int line
|
||
|
) NL_NORETURN;
|
||
|
|
||
|
NL_NORETURN_DECL void nl_range_assertion_failed(
|
||
|
double x, double min_val, double max_val, const char* file, int line
|
||
|
) NL_NORETURN;
|
||
|
|
||
|
NL_NORETURN_DECL void nl_should_not_have_reached(
|
||
|
const char* file, int line
|
||
|
) NL_NORETURN;
|
||
|
|
||
|
#define nl_assert(x) { \
|
||
|
if(!(x)) { \
|
||
|
nl_assertion_failed(#x,__FILE__, __LINE__) ; \
|
||
|
} \
|
||
|
}
|
||
|
|
||
|
#define nl_range_assert(x,min_val,max_val) { \
|
||
|
if(((x) < (min_val)) || ((x) > (max_val))) { \
|
||
|
nl_range_assertion_failed(x, min_val, max_val, \
|
||
|
__FILE__, __LINE__ \
|
||
|
) ; \
|
||
|
} \
|
||
|
}
|
||
|
|
||
|
#define nl_assert_not_reached { \
|
||
|
nl_should_not_have_reached(__FILE__, __LINE__) ; \
|
||
|
}
|
||
|
|
||
|
#ifdef NL_DEBUG
|
||
|
#define nl_debug_assert(x) nl_assert(x)
|
||
|
#define nl_debug_range_assert(x,min_val,max_val) \
|
||
|
nl_range_assert(x,min_val,max_val)
|
||
|
#else
|
||
|
#define nl_debug_assert(x)
|
||
|
#define nl_debug_range_assert(x,min_val,max_val)
|
||
|
#endif
|
||
|
|
||
|
#ifdef NL_PARANOID
|
||
|
#define nl_parano_assert(x) nl_assert(x)
|
||
|
#define nl_parano_range_assert(x,min_val,max_val) \
|
||
|
nl_range_assert(x,min_val,max_val)
|
||
|
#else
|
||
|
#define nl_parano_assert(x)
|
||
|
#define nl_parano_range_assert(x,min_val,max_val)
|
||
|
#endif
|
||
|
|
||
|
|
||
|
void nlError(const char* function, const char* message) ;
|
||
|
|
||
|
void nlWarning(const char* function, const char* message) ;
|
||
|
|
||
|
|
||
|
NLdouble nlCurrentTime(void);
|
||
|
|
||
|
typedef void* NLdll;
|
||
|
|
||
|
|
||
|
#define NL_LINK_NOW 1
|
||
|
|
||
|
#define NL_LINK_LAZY 2
|
||
|
|
||
|
#define NL_LINK_GLOBAL 4
|
||
|
|
||
|
#define NL_LINK_QUIET 8
|
||
|
|
||
|
#define NL_LINK_USE_FALLBACK 16
|
||
|
|
||
|
NLdll nlOpenDLL(const char* filename, NLenum flags);
|
||
|
|
||
|
void nlCloseDLL(NLdll handle);
|
||
|
|
||
|
NLfunc nlFindFunction(NLdll handle, const char* funcname);
|
||
|
|
||
|
|
||
|
/* classic macros */
|
||
|
|
||
|
#ifndef MIN
|
||
|
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
|
||
|
#endif
|
||
|
|
||
|
#ifndef MAX
|
||
|
#define MAX(x,y) (((x) > (y)) ? (x) : (y))
|
||
|
#endif
|
||
|
|
||
|
|
||
|
|
||
|
#define NL_NEW(T) (T*)(calloc(1, sizeof(T)))
|
||
|
|
||
|
#define NL_NEW_ARRAY(T,NB) (T*)(calloc((size_t)(NB),sizeof(T)))
|
||
|
|
||
|
#define NL_RENEW_ARRAY(T,x,NB) (T*)(realloc(x,(size_t)(NB)*sizeof(T)))
|
||
|
|
||
|
#define NL_DELETE(x) free(x); x = NULL
|
||
|
|
||
|
#define NL_DELETE_ARRAY(x) free(x); x = NULL
|
||
|
|
||
|
#define NL_CLEAR(T, x) memset(x, 0, sizeof(T))
|
||
|
|
||
|
#define NL_CLEAR_ARRAY(T,x,NB) memset(x, 0, (size_t)(NB)*sizeof(T))
|
||
|
|
||
|
|
||
|
#define NL_UINT_MAX 0xffffffff
|
||
|
|
||
|
#define NL_USHORT_MAX 0xffff
|
||
|
|
||
|
|
||
|
extern NLprintfFunc nl_printf;
|
||
|
|
||
|
extern NLfprintfFunc nl_fprintf;
|
||
|
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_blas.h *******/
|
||
|
|
||
|
|
||
|
|
||
|
#ifndef OPENNL_BLAS_H
|
||
|
#define OPENNL_BLAS_H
|
||
|
|
||
|
struct NLBlas;
|
||
|
|
||
|
typedef struct NLBlas* NLBlas_t;
|
||
|
|
||
|
typedef enum {
|
||
|
NoTranspose=0, Transpose=1, ConjugateTranspose=2
|
||
|
} MatrixTranspose ;
|
||
|
|
||
|
typedef enum {
|
||
|
UpperTriangle=0, LowerTriangle=1
|
||
|
} MatrixTriangle ;
|
||
|
|
||
|
typedef enum {
|
||
|
UnitTriangular=0, NotUnitTriangular=1
|
||
|
} MatrixUnitTriangular ;
|
||
|
|
||
|
typedef enum {
|
||
|
NL_HOST_MEMORY, NL_DEVICE_MEMORY
|
||
|
} NLmemoryType;
|
||
|
|
||
|
typedef void* (*FUNPTR_malloc)(
|
||
|
NLBlas_t blas, NLmemoryType type, size_t size
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_free)(
|
||
|
NLBlas_t blas, NLmemoryType type, size_t size, void* ptr
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_memcpy)(
|
||
|
NLBlas_t blas,
|
||
|
void* to, NLmemoryType to_type,
|
||
|
void* from, NLmemoryType from_type,
|
||
|
size_t size
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_dcopy)(
|
||
|
NLBlas_t blas, int n, const double *x, int incx, double *y, int incy
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_dscal)(
|
||
|
NLBlas_t blas, int n, double a, double *x, int incx
|
||
|
);
|
||
|
|
||
|
|
||
|
typedef double (*FUNPTR_ddot)(
|
||
|
NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy
|
||
|
);
|
||
|
|
||
|
typedef double (*FUNPTR_dnrm2)(NLBlas_t blas, int n, const double *x, int incx);
|
||
|
|
||
|
typedef void (*FUNPTR_daxpy)(
|
||
|
NLBlas_t blas, int n,
|
||
|
double a, const double *x, int incx, double *y, int incy
|
||
|
);
|
||
|
|
||
|
|
||
|
typedef void (*FUNPTR_dgemv)(
|
||
|
NLBlas_t blas, MatrixTranspose trans, int m, int n, double alpha,
|
||
|
const double *A, int ldA, const double *x, int incx,
|
||
|
double beta, double *y, int incy
|
||
|
);
|
||
|
|
||
|
|
||
|
typedef void (*FUNPTR_dtpsv)(
|
||
|
NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans,
|
||
|
MatrixUnitTriangular diag, int n, const double *AP,
|
||
|
double *x, int incx
|
||
|
);
|
||
|
|
||
|
struct NLBlas {
|
||
|
FUNPTR_malloc Malloc;
|
||
|
FUNPTR_free Free;
|
||
|
FUNPTR_memcpy Memcpy;
|
||
|
|
||
|
FUNPTR_dcopy Dcopy;
|
||
|
FUNPTR_dscal Dscal;
|
||
|
FUNPTR_ddot Ddot;
|
||
|
FUNPTR_dnrm2 Dnrm2;
|
||
|
FUNPTR_daxpy Daxpy;
|
||
|
FUNPTR_dgemv Dgemv;
|
||
|
FUNPTR_dtpsv Dtpsv;
|
||
|
|
||
|
NLboolean has_unified_memory;
|
||
|
double start_time;
|
||
|
NLulong flops;
|
||
|
NLulong used_ram[2];
|
||
|
NLulong max_used_ram[2];
|
||
|
|
||
|
/*
|
||
|
* Used for stats of the linear solver
|
||
|
* (a bit ugly, should not be here, but
|
||
|
* more convenient for now...)
|
||
|
*/
|
||
|
double sq_rnorm;
|
||
|
double sq_bnorm;
|
||
|
};
|
||
|
|
||
|
NLboolean nlBlasHasUnifiedMemory(NLBlas_t blas);
|
||
|
|
||
|
void nlBlasResetStats(NLBlas_t blas);
|
||
|
|
||
|
double nlBlasGFlops(NLBlas_t blas);
|
||
|
|
||
|
NLulong nlBlasUsedRam(NLBlas_t blas, NLmemoryType type);
|
||
|
|
||
|
NLulong nlBlasMaxUsedRam(NLBlas_t blas, NLmemoryType type);
|
||
|
|
||
|
NLBlas_t nlHostBlas(void);
|
||
|
|
||
|
#define NL_NEW_VECTOR(blas, memtype, dim) \
|
||
|
(double*)blas->Malloc(blas,memtype,(size_t)(dim)*sizeof(double))
|
||
|
|
||
|
#define NL_DELETE_VECTOR(blas, memtype, dim, ptr) \
|
||
|
blas->Free(blas,memtype,(size_t)(dim)*sizeof(double),ptr)
|
||
|
|
||
|
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_matrix.h *******/
|
||
|
|
||
|
|
||
|
#ifndef OPENNL_MATRIX_H
|
||
|
#define OPENNL_MATRIX_H
|
||
|
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
extern "C" {
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/* Abstract matrix interface */
|
||
|
|
||
|
struct NLMatrixStruct;
|
||
|
typedef struct NLMatrixStruct* NLMatrix;
|
||
|
|
||
|
typedef void(*NLDestroyMatrixFunc)(NLMatrix M);
|
||
|
|
||
|
typedef void(*NLMultMatrixVectorFunc)(NLMatrix M, const double* x, double* y);
|
||
|
|
||
|
#define NL_MATRIX_SPARSE_DYNAMIC 0x1001
|
||
|
#define NL_MATRIX_CRS 0x1002
|
||
|
#define NL_MATRIX_SUPERLU_EXT 0x1003
|
||
|
#define NL_MATRIX_CHOLMOD_EXT 0x1004
|
||
|
#define NL_MATRIX_FUNCTION 0x1005
|
||
|
#define NL_MATRIX_OTHER 0x1006
|
||
|
|
||
|
struct NLMatrixStruct {
|
||
|
NLuint m;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
NLenum type;
|
||
|
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
};
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlDeleteMatrix(NLMatrix M);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlMultMatrixVector(
|
||
|
NLMatrix M, const double* x, double* y
|
||
|
);
|
||
|
|
||
|
|
||
|
/* Dynamic arrays for sparse row/columns */
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint index;
|
||
|
|
||
|
NLdouble value;
|
||
|
} NLCoeff;
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint size;
|
||
|
|
||
|
NLuint capacity;
|
||
|
|
||
|
NLCoeff* coeff;
|
||
|
} NLRowColumn;
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlRowColumnConstruct(NLRowColumn* c);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlRowColumnDestroy(NLRowColumn* c);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlRowColumnGrow(NLRowColumn* c);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlRowColumnAdd(
|
||
|
NLRowColumn* c, NLuint index, NLdouble value
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlRowColumnAppend(
|
||
|
NLRowColumn* c, NLuint index, NLdouble value
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlRowColumnZero(NLRowColumn* c);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlRowColumnClear(NLRowColumn* c);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlRowColumnSort(NLRowColumn* c);
|
||
|
|
||
|
|
||
|
/* Compressed Row Storage */
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
NLenum type;
|
||
|
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
|
||
|
NLdouble* val;
|
||
|
|
||
|
NLuint* rowptr;
|
||
|
|
||
|
NLuint* colind;
|
||
|
|
||
|
NLuint nslices;
|
||
|
|
||
|
NLuint* sliceptr;
|
||
|
|
||
|
NLboolean symmetric_storage;
|
||
|
} NLCRSMatrix;
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlCRSMatrixConstruct(
|
||
|
NLCRSMatrix* M, NLuint m, NLuint n, NLuint nnz, NLuint nslices
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlCRSMatrixConstructSymmetric(
|
||
|
NLCRSMatrix* M, NLuint n, NLuint nnz
|
||
|
);
|
||
|
|
||
|
NLAPI NLboolean NLAPIENTRY nlCRSMatrixLoad(
|
||
|
NLCRSMatrix* M, const char* filename
|
||
|
);
|
||
|
|
||
|
NLAPI NLboolean NLAPIENTRY nlCRSMatrixSave(
|
||
|
NLCRSMatrix* M, const char* filename
|
||
|
);
|
||
|
|
||
|
NLAPI NLuint NLAPIENTRY nlCRSMatrixNNZ(NLCRSMatrix* M);
|
||
|
|
||
|
|
||
|
/* SparseMatrix data structure */
|
||
|
|
||
|
#define NL_MATRIX_STORE_ROWS 1
|
||
|
|
||
|
#define NL_MATRIX_STORE_COLUMNS 2
|
||
|
|
||
|
#define NL_MATRIX_STORE_SYMMETRIC 4
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
NLenum type;
|
||
|
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
|
||
|
|
||
|
NLuint diag_size;
|
||
|
|
||
|
NLuint diag_capacity;
|
||
|
|
||
|
NLenum storage;
|
||
|
|
||
|
NLRowColumn* row;
|
||
|
|
||
|
NLRowColumn* column;
|
||
|
|
||
|
NLdouble* diag;
|
||
|
|
||
|
NLuint row_capacity;
|
||
|
|
||
|
NLuint column_capacity;
|
||
|
|
||
|
} NLSparseMatrix;
|
||
|
|
||
|
|
||
|
NLAPI NLMatrix NLAPIENTRY nlSparseMatrixNew(
|
||
|
NLuint m, NLuint n, NLenum storage
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixConstruct(
|
||
|
NLSparseMatrix* M, NLuint m, NLuint n, NLenum storage
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixDestroy(NLSparseMatrix* M);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixMult(
|
||
|
NLSparseMatrix* A, const NLdouble* x, NLdouble* y
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixAdd(
|
||
|
NLSparseMatrix* M, NLuint i, NLuint j, NLdouble value
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixAddMatrix(
|
||
|
NLSparseMatrix* M, double mul, const NLMatrix N
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixZero( NLSparseMatrix* M);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixClear( NLSparseMatrix* M);
|
||
|
|
||
|
NLAPI NLuint NLAPIENTRY nlSparseMatrixNNZ( NLSparseMatrix* M);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixSort( NLSparseMatrix* M);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixAddRow( NLSparseMatrix* M);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixAddColumn( NLSparseMatrix* M);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixMAddRow(
|
||
|
NLSparseMatrix* M, NLuint i1, double s, NLuint i2
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixScaleRow(
|
||
|
NLSparseMatrix* M, NLuint i, double s
|
||
|
);
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlSparseMatrixZeroRow(
|
||
|
NLSparseMatrix* M, NLuint i
|
||
|
);
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
NLAPI NLMatrix NLAPIENTRY nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix* M);
|
||
|
|
||
|
NLAPI NLMatrix NLAPIENTRY nlCRSMatrixNewFromSparseMatrixSymmetric(
|
||
|
NLSparseMatrix* M
|
||
|
);
|
||
|
|
||
|
|
||
|
NLAPI void NLAPIENTRY nlMatrixCompress(NLMatrix* M);
|
||
|
|
||
|
NLAPI NLuint NLAPIENTRY nlMatrixNNZ(NLMatrix M);
|
||
|
|
||
|
NLAPI NLMatrix NLAPIENTRY nlMatrixFactorize(NLMatrix M, NLenum solver);
|
||
|
|
||
|
|
||
|
|
||
|
typedef void(*NLMatrixFunc)(const double* x, double* y);
|
||
|
|
||
|
NLAPI NLMatrix NLAPIENTRY nlMatrixNewFromFunction(
|
||
|
NLuint m, NLuint n, NLMatrixFunc func
|
||
|
);
|
||
|
|
||
|
NLAPI NLMatrixFunc NLAPIENTRY nlMatrixGetFunction(NLMatrix M);
|
||
|
|
||
|
|
||
|
|
||
|
NLAPI NLMatrix NLAPIENTRY nlMatrixNewFromProduct(
|
||
|
NLMatrix M, NLboolean product_owns_M,
|
||
|
NLMatrix N, NLboolean product_owns_N
|
||
|
);
|
||
|
|
||
|
|
||
|
|
||
|
#ifdef __cplusplus
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_context.h *******/
|
||
|
|
||
|
#ifndef OPENNL_CONTEXT_H
|
||
|
#define OPENNL_CONTEXT_H
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/* NLContext data structure */
|
||
|
|
||
|
|
||
|
typedef NLboolean(*NLSolverFunc)(void);
|
||
|
|
||
|
typedef void(*NLProgressFunc)(
|
||
|
NLuint cur_iter, NLuint max_iter, double cur_err, double max_err
|
||
|
);
|
||
|
|
||
|
#define NL_STATE_INITIAL 0
|
||
|
#define NL_STATE_SYSTEM 1
|
||
|
#define NL_STATE_MATRIX 2
|
||
|
#define NL_STATE_ROW 3
|
||
|
#define NL_STATE_MATRIX_CONSTRUCTED 4
|
||
|
#define NL_STATE_SYSTEM_CONSTRUCTED 5
|
||
|
#define NL_STATE_SOLVED 6
|
||
|
|
||
|
typedef struct {
|
||
|
void* base_address;
|
||
|
NLuint stride;
|
||
|
} NLBufferBinding;
|
||
|
|
||
|
#define NL_BUFFER_ITEM(B,i) \
|
||
|
*(double*)((void*)((char*)((B).base_address)+((i)*(B).stride)))
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLenum state;
|
||
|
|
||
|
NLboolean user_variable_buffers;
|
||
|
|
||
|
NLBufferBinding* variable_buffer;
|
||
|
|
||
|
NLdouble* variable_value;
|
||
|
|
||
|
NLboolean* variable_is_locked;
|
||
|
|
||
|
NLuint* variable_index;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
|
||
|
NLenum matrix_mode;
|
||
|
|
||
|
NLMatrix M;
|
||
|
|
||
|
NLMatrix P;
|
||
|
|
||
|
NLMatrix B;
|
||
|
|
||
|
NLRowColumn af;
|
||
|
|
||
|
NLRowColumn al;
|
||
|
|
||
|
NLdouble* x;
|
||
|
|
||
|
NLdouble* b;
|
||
|
|
||
|
NLdouble* right_hand_side;
|
||
|
|
||
|
NLdouble row_scaling;
|
||
|
|
||
|
NLenum solver;
|
||
|
|
||
|
NLenum preconditioner;
|
||
|
|
||
|
NLboolean preconditioner_defined;
|
||
|
|
||
|
NLuint nb_variables;
|
||
|
|
||
|
NLuint nb_systems;
|
||
|
|
||
|
NLboolean ij_coefficient_called;
|
||
|
|
||
|
NLuint current_row;
|
||
|
|
||
|
NLboolean least_squares;
|
||
|
|
||
|
NLboolean symmetric;
|
||
|
|
||
|
NLuint max_iterations;
|
||
|
|
||
|
|
||
|
NLboolean max_iterations_defined;
|
||
|
|
||
|
NLuint inner_iterations;
|
||
|
|
||
|
NLdouble threshold;
|
||
|
|
||
|
NLboolean threshold_defined;
|
||
|
|
||
|
NLdouble omega;
|
||
|
|
||
|
NLboolean normalize_rows;
|
||
|
|
||
|
NLuint used_iterations;
|
||
|
|
||
|
NLdouble error;
|
||
|
|
||
|
|
||
|
NLdouble start_time;
|
||
|
|
||
|
NLdouble elapsed_time;
|
||
|
|
||
|
NLSolverFunc solver_func;
|
||
|
|
||
|
NLProgressFunc progress_func;
|
||
|
|
||
|
NLboolean verbose;
|
||
|
|
||
|
NLulong flops;
|
||
|
|
||
|
NLenum eigen_solver;
|
||
|
|
||
|
NLdouble eigen_shift;
|
||
|
|
||
|
NLboolean eigen_shift_invert;
|
||
|
|
||
|
NLdouble* eigen_value;
|
||
|
|
||
|
NLdouble* temp_eigen_value;
|
||
|
|
||
|
} NLContextStruct;
|
||
|
|
||
|
extern NLContextStruct* nlCurrentContext;
|
||
|
|
||
|
void nlCheckState(NLenum state);
|
||
|
|
||
|
void nlTransition(NLenum from_state, NLenum to_state);
|
||
|
|
||
|
NLboolean nlDefaultSolver(void);
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_iterative_solvers.h *******/
|
||
|
|
||
|
|
||
|
#ifndef OPENNL_ITERATIVE_SOLVERS_H
|
||
|
#define OPENNL_ITERATIVE_SOLVERS_H
|
||
|
|
||
|
|
||
|
|
||
|
NLAPI NLuint NLAPIENTRY nlSolveSystemIterative(
|
||
|
NLBlas_t blas,
|
||
|
NLMatrix M, NLMatrix P, NLdouble* b, NLdouble* x,
|
||
|
NLenum solver,
|
||
|
double eps, NLuint max_iter, NLuint inner_iter
|
||
|
);
|
||
|
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/******* extracted from nl_preconditioners.h *******/
|
||
|
|
||
|
#ifndef OPENNL_PRECONDITIONERS_H
|
||
|
#define OPENNL_PRECONDITIONERS_H
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/* preconditioners */
|
||
|
|
||
|
NLMatrix nlNewJacobiPreconditioner(NLMatrix M);
|
||
|
|
||
|
NLMatrix nlNewSSORPreconditioner(NLMatrix M, double omega);
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_superlu.h *******/
|
||
|
|
||
|
#ifndef OPENNL_SUPERLU_H
|
||
|
#define OPENNL_SUPERLU_H
|
||
|
|
||
|
|
||
|
|
||
|
NLAPI NLMatrix NLAPIENTRY nlMatrixFactorize_SUPERLU(
|
||
|
NLMatrix M, NLenum solver
|
||
|
);
|
||
|
|
||
|
NLboolean nlInitExtension_SUPERLU(void);
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_SUPERLU(void);
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_cholmod.h *******/
|
||
|
|
||
|
#ifndef OPENNL_CHOLMOD_H
|
||
|
#define OPENNL_CHOLMOD_H
|
||
|
|
||
|
|
||
|
|
||
|
NLAPI NLMatrix NLAPIENTRY nlMatrixFactorize_CHOLMOD(
|
||
|
NLMatrix M, NLenum solver
|
||
|
);
|
||
|
|
||
|
NLboolean nlInitExtension_CHOLMOD(void);
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_CHOLMOD(void);
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_arpack.h *******/
|
||
|
|
||
|
#ifndef OPENNL_ARPACK_H
|
||
|
#define OPENNL_ARPACK_H
|
||
|
|
||
|
|
||
|
|
||
|
NLboolean nlInitExtension_ARPACK(void);
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_ARPACK(void);
|
||
|
|
||
|
void nlEigenSolve_ARPACK(void);
|
||
|
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_mkl.h *******/
|
||
|
|
||
|
#ifndef OPENNL_MKL_H
|
||
|
#define OPENNL_MKL_H
|
||
|
|
||
|
|
||
|
|
||
|
NLboolean nlInitExtension_MKL(void);
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_MKL(void);
|
||
|
|
||
|
extern NLMultMatrixVectorFunc NLMultMatrixVector_MKL;
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_cuda.h *******/
|
||
|
|
||
|
#ifndef OPENNL_CUDA_EXT_H
|
||
|
#define OPENNL_CUDA_EXT_H
|
||
|
|
||
|
|
||
|
|
||
|
NLboolean nlInitExtension_CUDA(void);
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_CUDA(void);
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M);
|
||
|
|
||
|
NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M);
|
||
|
|
||
|
NLBlas_t nlCUDABlas(void);
|
||
|
|
||
|
|
||
|
|
||
|
#endif
|
||
|
|
||
|
/******* extracted from nl_os.c *******/
|
||
|
|
||
|
|
||
|
#if (defined (WIN32) || defined(_WIN64))
|
||
|
#include <windows.h>
|
||
|
#else
|
||
|
#include <sys/types.h>
|
||
|
#include <sys/times.h>
|
||
|
#endif
|
||
|
|
||
|
#if defined(GEO_DYNAMIC_LIBS) && defined(NL_OS_UNIX)
|
||
|
#include <dlfcn.h>
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/* Assertions */
|
||
|
|
||
|
|
||
|
void nl_assertion_failed(const char* cond, const char* file, int line) {
|
||
|
nl_fprintf(
|
||
|
stderr,
|
||
|
"OpenNL assertion failed: %s, file:%s, line:%d\n",
|
||
|
cond,file,line
|
||
|
) ;
|
||
|
abort() ;
|
||
|
}
|
||
|
|
||
|
void nl_range_assertion_failed(
|
||
|
double x, double min_val, double max_val, const char* file, int line
|
||
|
) {
|
||
|
nl_fprintf(
|
||
|
stderr,
|
||
|
"OpenNL range assertion failed: "
|
||
|
"%f in [ %f ... %f ], file:%s, line:%d\n",
|
||
|
x, min_val, max_val, file,line
|
||
|
) ;
|
||
|
abort() ;
|
||
|
}
|
||
|
|
||
|
void nl_should_not_have_reached(const char* file, int line) {
|
||
|
nl_fprintf(
|
||
|
stderr,
|
||
|
"OpenNL should not have reached this point: file:%s, line:%d\n",
|
||
|
file,line
|
||
|
) ;
|
||
|
abort() ;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/* Timing */
|
||
|
|
||
|
#ifdef WIN32
|
||
|
NLdouble nlCurrentTime() {
|
||
|
return (NLdouble)GetTickCount() / 1000.0 ;
|
||
|
}
|
||
|
#else
|
||
|
double nlCurrentTime() {
|
||
|
clock_t user_clock ;
|
||
|
struct tms user_tms ;
|
||
|
user_clock = times(&user_tms) ;
|
||
|
return (NLdouble)user_clock / 100.0 ;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/* DLLs/shared objects/dylibs */
|
||
|
|
||
|
#if defined(GEO_DYNAMIC_LIBS)
|
||
|
|
||
|
# if defined(NL_OS_UNIX)
|
||
|
|
||
|
NLdll nlOpenDLL(const char* name, NLenum flags_in) {
|
||
|
void* result = NULL;
|
||
|
int flags = 0;
|
||
|
if((flags_in & NL_LINK_NOW) != 0) {
|
||
|
flags |= RTLD_NOW;
|
||
|
}
|
||
|
if((flags_in & NL_LINK_LAZY) != 0) {
|
||
|
flags |= RTLD_LAZY;
|
||
|
}
|
||
|
if((flags_in & NL_LINK_GLOBAL) != 0) {
|
||
|
flags |= RTLD_GLOBAL;
|
||
|
}
|
||
|
if((flags_in & NL_LINK_QUIET) == 0) {
|
||
|
nl_fprintf(stdout,"Trying to load %s\n", name);
|
||
|
}
|
||
|
result = dlopen(name, flags);
|
||
|
if(result == NULL) {
|
||
|
if((flags_in & NL_LINK_QUIET) == 0) {
|
||
|
nl_fprintf(stderr,"Did not find %s,\n", name);
|
||
|
nl_fprintf(stderr,"Retrying with libgeogram_num_3rdparty.so\n");
|
||
|
}
|
||
|
if((flags_in & NL_LINK_USE_FALLBACK) != 0) {
|
||
|
result=dlopen("libgeogram_num_3rdparty.so", flags);
|
||
|
if(result == NULL) {
|
||
|
if((flags_in & NL_LINK_QUIET) == 0) {
|
||
|
nlError("nlOpenDLL/dlopen",dlerror());
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if((flags_in & NL_LINK_QUIET) == 0 && result != NULL) {
|
||
|
nl_fprintf(stdout,"Loaded %s\n", name);
|
||
|
}
|
||
|
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
void nlCloseDLL(void* handle) {
|
||
|
dlclose(handle);
|
||
|
}
|
||
|
|
||
|
NLfunc nlFindFunction(void* handle, const char* name) {
|
||
|
/*
|
||
|
* It is not legal in modern C to cast a void*
|
||
|
* pointer into a function pointer, thus requiring this
|
||
|
* (quite dirty) function that uses a union.
|
||
|
*/
|
||
|
union {
|
||
|
void* ptr;
|
||
|
NLfunc fptr;
|
||
|
} u;
|
||
|
u.ptr = dlsym(handle, name);
|
||
|
return u.fptr;
|
||
|
}
|
||
|
|
||
|
# elif defined(NL_OS_WINDOWS)
|
||
|
|
||
|
NLdll nlOpenDLL(const char* name, NLenum flags) {
|
||
|
/* Note: NL_LINK_LAZY and NL_LINK_GLOBAL are ignored. */
|
||
|
void* result = LoadLibrary(name);
|
||
|
if(result == NULL && ((flags & NL_LINK_USE_FALLBACK) != 0)) {
|
||
|
if((flags & NL_LINK_QUIET) == 0) {
|
||
|
nl_fprintf(stderr,"Did not find %s,\n", name);
|
||
|
nl_fprintf(stderr,"Retrying with geogram_num_3rdparty\n");
|
||
|
}
|
||
|
result=LoadLibrary("geogram_num_3rdparty.dll");
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
void nlCloseDLL(void* handle) {
|
||
|
FreeLibrary((HMODULE)handle);
|
||
|
}
|
||
|
|
||
|
NLfunc nlFindFunction(void* handle, const char* name) {
|
||
|
return (NLfunc)GetProcAddress((HMODULE)handle, name);
|
||
|
}
|
||
|
|
||
|
# endif
|
||
|
|
||
|
#else
|
||
|
|
||
|
NLdll nlOpenDLL(const char* name, NLenum flags) {
|
||
|
nl_arg_used(name);
|
||
|
nl_arg_used(flags);
|
||
|
#ifdef NL_OS_UNIX
|
||
|
nlError("nlOpenDLL","Was not compiled with dynamic linking enabled");
|
||
|
nlError("nlOpenDLL","(see VORPALINE_BUILD_DYNAMIC in CMakeLists.txt)");
|
||
|
#else
|
||
|
nlError("nlOpenDLL","Not implemented");
|
||
|
#endif
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
void nlCloseDLL(void* handle) {
|
||
|
nl_arg_used(handle);
|
||
|
nlError("nlCloseDLL","Not implemented");
|
||
|
}
|
||
|
|
||
|
NLfunc nlFindFunction(void* handle, const char* name) {
|
||
|
nl_arg_used(handle);
|
||
|
nl_arg_used(name);
|
||
|
nlError("nlFindFunction","Not implemented");
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/* Error-reporting functions */
|
||
|
|
||
|
NLprintfFunc nl_printf = printf;
|
||
|
NLfprintfFunc nl_fprintf = fprintf;
|
||
|
|
||
|
void nlError(const char* function, const char* message) {
|
||
|
nl_fprintf(stderr, "OpenNL error in %s(): %s\n", function, message) ;
|
||
|
}
|
||
|
|
||
|
void nlWarning(const char* function, const char* message) {
|
||
|
nl_fprintf(stderr, "OpenNL warning in %s(): %s\n", function, message) ;
|
||
|
}
|
||
|
|
||
|
void nlPrintfFuncs(NLprintfFunc f1, NLfprintfFunc f2) {
|
||
|
nl_printf = f1;
|
||
|
nl_fprintf = f2;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/******* extracted from nl_matrix.c *******/
|
||
|
|
||
|
|
||
|
/*
|
||
|
Some warnings about const cast in callback for
|
||
|
qsort() function.
|
||
|
*/
|
||
|
|
||
|
#ifdef __clang__
|
||
|
#pragma GCC diagnostic ignored "-Wcast-qual"
|
||
|
#endif
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
void nlDeleteMatrix(NLMatrix M) {
|
||
|
if(M == NULL) {
|
||
|
return;
|
||
|
}
|
||
|
M->destroy_func(M);
|
||
|
NL_DELETE(M);
|
||
|
}
|
||
|
|
||
|
void nlMultMatrixVector(
|
||
|
NLMatrix M, const double* x, double* y
|
||
|
) {
|
||
|
M->mult_func(M,x,y);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
void nlRowColumnConstruct(NLRowColumn* c) {
|
||
|
c->size = 0;
|
||
|
c->capacity = 0;
|
||
|
c->coeff = NULL;
|
||
|
}
|
||
|
|
||
|
void nlRowColumnDestroy(NLRowColumn* c) {
|
||
|
NL_DELETE_ARRAY(c->coeff);
|
||
|
c->size = 0;
|
||
|
c->capacity = 0;
|
||
|
}
|
||
|
|
||
|
void nlRowColumnGrow(NLRowColumn* c) {
|
||
|
if(c->capacity != 0) {
|
||
|
c->capacity = 2 * c->capacity;
|
||
|
c->coeff = NL_RENEW_ARRAY(NLCoeff, c->coeff, c->capacity);
|
||
|
} else {
|
||
|
c->capacity = 4;
|
||
|
c->coeff = NL_NEW_ARRAY(NLCoeff, c->capacity);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlRowColumnAdd(NLRowColumn* c, NLuint index, NLdouble value) {
|
||
|
NLuint i;
|
||
|
for(i=0; i<c->size; i++) {
|
||
|
if(c->coeff[i].index == index) {
|
||
|
c->coeff[i].value += value;
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
if(c->size == c->capacity) {
|
||
|
nlRowColumnGrow(c);
|
||
|
}
|
||
|
c->coeff[c->size].index = index;
|
||
|
c->coeff[c->size].value = value;
|
||
|
c->size++;
|
||
|
}
|
||
|
|
||
|
/* Does not check whether the index already exists */
|
||
|
void nlRowColumnAppend(NLRowColumn* c, NLuint index, NLdouble value) {
|
||
|
if(c->size == c->capacity) {
|
||
|
nlRowColumnGrow(c);
|
||
|
}
|
||
|
c->coeff[c->size].index = index;
|
||
|
c->coeff[c->size].value = value;
|
||
|
c->size++;
|
||
|
}
|
||
|
|
||
|
void nlRowColumnZero(NLRowColumn* c) {
|
||
|
c->size = 0;
|
||
|
}
|
||
|
|
||
|
void nlRowColumnClear(NLRowColumn* c) {
|
||
|
c->size = 0;
|
||
|
c->capacity = 0;
|
||
|
NL_DELETE_ARRAY(c->coeff);
|
||
|
}
|
||
|
|
||
|
static int nlCoeffCompare(const void* p1, const void* p2) {
|
||
|
return (((NLCoeff*)(p2))->index < ((NLCoeff*)(p1))->index);
|
||
|
}
|
||
|
|
||
|
void nlRowColumnSort(NLRowColumn* c) {
|
||
|
qsort(c->coeff, c->size, sizeof(NLCoeff), nlCoeffCompare);
|
||
|
}
|
||
|
|
||
|
|
||
|
/* CRSMatrix data structure */
|
||
|
|
||
|
static void nlCRSMatrixDestroy(NLCRSMatrix* M) {
|
||
|
NL_DELETE_ARRAY(M->val);
|
||
|
NL_DELETE_ARRAY(M->rowptr);
|
||
|
NL_DELETE_ARRAY(M->colind);
|
||
|
NL_DELETE_ARRAY(M->sliceptr);
|
||
|
M->m = 0;
|
||
|
M->n = 0;
|
||
|
M->nslices = 0;
|
||
|
}
|
||
|
|
||
|
|
||
|
NLboolean nlCRSMatrixSave(NLCRSMatrix* M, const char* filename) {
|
||
|
NLuint nnz = M->rowptr[M->m];
|
||
|
FILE* f = fopen(filename, "rb");
|
||
|
if(f == NULL) {
|
||
|
nlError("nlCRSMatrixSave", "Could not open file");
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
fwrite(&M->m, sizeof(NLuint), 1, f);
|
||
|
fwrite(&M->n, sizeof(NLuint), 1, f);
|
||
|
fwrite(&nnz, sizeof(NLuint), 1, f);
|
||
|
|
||
|
fwrite(M->rowptr, sizeof(NLuint), M->m+1, f);
|
||
|
fwrite(M->colind, sizeof(NLuint), nnz, f);
|
||
|
fwrite(M->val, sizeof(double), nnz, f);
|
||
|
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
NLboolean nlCRSMatrixLoad(NLCRSMatrix* M, const char* filename) {
|
||
|
NLuint nnz = 0;
|
||
|
FILE* f = fopen(filename, "rb");
|
||
|
NLboolean truncated = NL_FALSE;
|
||
|
|
||
|
if(f == NULL) {
|
||
|
nlError("nlCRSMatrixLoad", "Could not open file");
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
truncated = truncated || (
|
||
|
fread(&M->m, sizeof(NLuint), 1, f) != 1 ||
|
||
|
fread(&M->n, sizeof(NLuint), 1, f) != 1 ||
|
||
|
fread(&nnz, sizeof(NLuint), 1, f) != 1
|
||
|
);
|
||
|
|
||
|
if(truncated) {
|
||
|
M->rowptr = NULL;
|
||
|
M->colind = NULL;
|
||
|
M->val = NULL;
|
||
|
} else {
|
||
|
M->rowptr = NL_NEW_ARRAY(NLuint, M->m+1);
|
||
|
M->colind = NL_NEW_ARRAY(NLuint, nnz);
|
||
|
M->val = NL_NEW_ARRAY(double, nnz);
|
||
|
truncated = truncated || (
|
||
|
fread(M->rowptr, sizeof(NLuint), M->m+1, f) != M->m+1 ||
|
||
|
fread(M->colind, sizeof(NLuint), nnz, f) != nnz ||
|
||
|
fread(M->val, sizeof(double), nnz, f) != nnz
|
||
|
);
|
||
|
}
|
||
|
|
||
|
if(truncated) {
|
||
|
nlError("nlCRSMatrixSave", "File appears to be truncated");
|
||
|
NL_DELETE_ARRAY(M->rowptr);
|
||
|
NL_DELETE_ARRAY(M->colind);
|
||
|
NL_DELETE_ARRAY(M->val);
|
||
|
return NL_FALSE;
|
||
|
} else {
|
||
|
M->nslices = 1;
|
||
|
M->sliceptr = NL_NEW_ARRAY(NLuint, M->nslices+1);
|
||
|
M->sliceptr[0] = 0;
|
||
|
M->sliceptr[1] = M->m;
|
||
|
}
|
||
|
|
||
|
fclose(f);
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
NLuint nlCRSMatrixNNZ(NLCRSMatrix* M) {
|
||
|
return M->rowptr[M->m];
|
||
|
}
|
||
|
|
||
|
static void nlCRSMatrixMultSlice(
|
||
|
NLCRSMatrix* M, const double* x, double* y, NLuint Ibegin, NLuint Iend
|
||
|
) {
|
||
|
NLuint i,j;
|
||
|
for(i=Ibegin; i<Iend; ++i) {
|
||
|
double sum=0.0;
|
||
|
for(j=M->rowptr[i]; j<M->rowptr[i+1]; ++j) {
|
||
|
sum += M->val[j] * x[M->colind[j]];
|
||
|
}
|
||
|
y[i] = sum;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlCRSMatrixMult(
|
||
|
NLCRSMatrix* M, const double* x, double* y
|
||
|
) {
|
||
|
int slice;
|
||
|
int nslices = (int)(M->nslices);
|
||
|
NLuint i,j,jj;
|
||
|
NLdouble a;
|
||
|
|
||
|
if(M->symmetric_storage) {
|
||
|
for(i=0; i<M->m; ++i) {
|
||
|
y[i] = 0.0;
|
||
|
}
|
||
|
for(i=0; i<M->m; ++i) {
|
||
|
for(jj=M->rowptr[i]; jj<M->rowptr[i+1]; ++jj) {
|
||
|
a = M->val[jj];
|
||
|
j = M->colind[jj];
|
||
|
y[i] += a * x[j];
|
||
|
if(j != i) {
|
||
|
y[j] += a * x[i];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
|
||
|
#if defined(_OPENMP)
|
||
|
#pragma omp parallel for private(slice)
|
||
|
#endif
|
||
|
|
||
|
for(slice=0; slice<nslices; ++slice) {
|
||
|
nlCRSMatrixMultSlice(
|
||
|
M,x,y,M->sliceptr[slice],M->sliceptr[slice+1]
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
nlHostBlas()->flops += (NLulong)(2*nlCRSMatrixNNZ(M));
|
||
|
}
|
||
|
|
||
|
void nlCRSMatrixConstruct(
|
||
|
NLCRSMatrix* M, NLuint m, NLuint n, NLuint nnz, NLuint nslices
|
||
|
) {
|
||
|
M->m = m;
|
||
|
M->n = n;
|
||
|
M->type = NL_MATRIX_CRS;
|
||
|
M->destroy_func = (NLDestroyMatrixFunc)nlCRSMatrixDestroy;
|
||
|
if(NLMultMatrixVector_MKL != NULL) {
|
||
|
M->mult_func = (NLMultMatrixVectorFunc)NLMultMatrixVector_MKL;
|
||
|
} else {
|
||
|
M->mult_func = (NLMultMatrixVectorFunc)nlCRSMatrixMult;
|
||
|
}
|
||
|
M->nslices = nslices;
|
||
|
M->val = NL_NEW_ARRAY(double, nnz);
|
||
|
M->rowptr = NL_NEW_ARRAY(NLuint, m+1);
|
||
|
M->colind = NL_NEW_ARRAY(NLuint, nnz);
|
||
|
M->sliceptr = NL_NEW_ARRAY(NLuint, nslices+1);
|
||
|
M->symmetric_storage = NL_FALSE;
|
||
|
}
|
||
|
|
||
|
void nlCRSMatrixConstructSymmetric(
|
||
|
NLCRSMatrix* M, NLuint n, NLuint nnz
|
||
|
) {
|
||
|
M->m = n;
|
||
|
M->n = n;
|
||
|
M->type = NL_MATRIX_CRS;
|
||
|
M->destroy_func = (NLDestroyMatrixFunc)nlCRSMatrixDestroy;
|
||
|
M->mult_func = (NLMultMatrixVectorFunc)nlCRSMatrixMult;
|
||
|
M->nslices = 0;
|
||
|
M->val = NL_NEW_ARRAY(double, nnz);
|
||
|
M->rowptr = NL_NEW_ARRAY(NLuint, n+1);
|
||
|
M->colind = NL_NEW_ARRAY(NLuint, nnz);
|
||
|
M->sliceptr = NULL;
|
||
|
M->symmetric_storage = NL_TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* SparseMatrix data structure */
|
||
|
|
||
|
|
||
|
static void nlSparseMatrixDestroyRowColumns(NLSparseMatrix* M) {
|
||
|
NLuint i;
|
||
|
if(M->storage & NL_MATRIX_STORE_ROWS) {
|
||
|
for(i=0; i<M->m; i++) {
|
||
|
nlRowColumnDestroy(&(M->row[i]));
|
||
|
}
|
||
|
NL_DELETE_ARRAY(M->row);
|
||
|
}
|
||
|
M->storage = (NLenum)((int)(M->storage) & ~NL_MATRIX_STORE_ROWS);
|
||
|
|
||
|
if(M->storage & NL_MATRIX_STORE_COLUMNS) {
|
||
|
for(i=0; i<M->n; i++) {
|
||
|
nlRowColumnDestroy(&(M->column[i]));
|
||
|
}
|
||
|
NL_DELETE_ARRAY(M->column);
|
||
|
}
|
||
|
M->storage = (NLenum)((int)(M->storage) & ~NL_MATRIX_STORE_COLUMNS);
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixDestroy(NLSparseMatrix* M) {
|
||
|
nl_assert(M->type == NL_MATRIX_SPARSE_DYNAMIC);
|
||
|
nlSparseMatrixDestroyRowColumns(M);
|
||
|
NL_DELETE_ARRAY(M->diag);
|
||
|
#ifdef NL_PARANOID
|
||
|
NL_CLEAR(NLSparseMatrix,M);
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixAdd(NLSparseMatrix* M, NLuint i, NLuint j, NLdouble value) {
|
||
|
nl_parano_range_assert(i, 0, M->m - 1);
|
||
|
nl_parano_range_assert(j, 0, M->n - 1);
|
||
|
if((M->storage & NL_MATRIX_STORE_SYMMETRIC) && (j > i)) {
|
||
|
return;
|
||
|
}
|
||
|
if(i == j) {
|
||
|
M->diag[i] += value;
|
||
|
}
|
||
|
if(M->storage & NL_MATRIX_STORE_ROWS) {
|
||
|
nlRowColumnAdd(&(M->row[i]), j, value);
|
||
|
}
|
||
|
if(M->storage & NL_MATRIX_STORE_COLUMNS) {
|
||
|
nlRowColumnAdd(&(M->column[j]), i, value);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlSparseMatrixAddSparseMatrix(
|
||
|
NLSparseMatrix* M, double mul, const NLSparseMatrix* N
|
||
|
) {
|
||
|
NLuint i,j,ii,jj;
|
||
|
nl_assert(M->m == N->m);
|
||
|
nl_assert(M->n == N->n);
|
||
|
if(N->storage & NL_MATRIX_STORE_SYMMETRIC) {
|
||
|
nl_assert(M->storage & NL_MATRIX_STORE_SYMMETRIC);
|
||
|
}
|
||
|
if(N->storage & NL_MATRIX_STORE_ROWS) {
|
||
|
for(i=0; i<N->m; ++i) {
|
||
|
for(jj=0; jj<N->row[i].size; ++jj) {
|
||
|
nlSparseMatrixAdd(
|
||
|
M,
|
||
|
i, N->row[i].coeff[jj].index,
|
||
|
mul*N->row[i].coeff[jj].value
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
nl_assert(N->storage & NL_MATRIX_STORE_COLUMNS);
|
||
|
for(j=0; j<N->n; ++j) {
|
||
|
for(ii=0; ii<N->column[j].size; ++ii) {
|
||
|
nlSparseMatrixAdd(
|
||
|
M,
|
||
|
N->column[j].coeff[ii].index, j,
|
||
|
mul*N->column[j].coeff[ii].value
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlSparseMatrixAddCRSMatrix(
|
||
|
NLSparseMatrix* M, double mul, const NLCRSMatrix* N
|
||
|
) {
|
||
|
NLuint i,jj;
|
||
|
nl_assert(M->m == N->m);
|
||
|
nl_assert(M->n == N->n);
|
||
|
for(i=0; i<M->m; ++i) {
|
||
|
for(jj=N->rowptr[i]; jj<N->rowptr[i+1]; ++jj) {
|
||
|
nlSparseMatrixAdd(
|
||
|
M,
|
||
|
i,
|
||
|
N->colind[jj],
|
||
|
mul*N->val[jj]
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixAddMatrix(
|
||
|
NLSparseMatrix* M, double mul, const NLMatrix N
|
||
|
) {
|
||
|
nl_assert(M->m == N->m);
|
||
|
nl_assert(M->n == N->n);
|
||
|
if(N->type == NL_MATRIX_SPARSE_DYNAMIC) {
|
||
|
nlSparseMatrixAddSparseMatrix(M, mul, (const NLSparseMatrix*)N);
|
||
|
} else if(N->type == NL_MATRIX_CRS) {
|
||
|
nlSparseMatrixAddCRSMatrix(M, mul, (const NLCRSMatrix*)N);
|
||
|
} else {
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
void nlSparseMatrixZero( NLSparseMatrix* M) {
|
||
|
NLuint i;
|
||
|
if(M->storage & NL_MATRIX_STORE_ROWS) {
|
||
|
for(i=0; i<M->m; i++) {
|
||
|
nlRowColumnZero(&(M->row[i]));
|
||
|
}
|
||
|
}
|
||
|
if(M->storage & NL_MATRIX_STORE_COLUMNS) {
|
||
|
for(i=0; i<M->n; i++) {
|
||
|
nlRowColumnZero(&(M->column[i]));
|
||
|
}
|
||
|
}
|
||
|
NL_CLEAR_ARRAY(NLdouble, M->diag, M->diag_size);
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixClear( NLSparseMatrix* M) {
|
||
|
NLuint i;
|
||
|
if(M->storage & NL_MATRIX_STORE_ROWS) {
|
||
|
for(i=0; i<M->m; i++) {
|
||
|
nlRowColumnClear(&(M->row[i]));
|
||
|
}
|
||
|
}
|
||
|
if(M->storage & NL_MATRIX_STORE_COLUMNS) {
|
||
|
for(i=0; i<M->n; i++) {
|
||
|
nlRowColumnClear(&(M->column[i]));
|
||
|
}
|
||
|
}
|
||
|
NL_CLEAR_ARRAY(NLdouble, M->diag, M->diag_size);
|
||
|
}
|
||
|
|
||
|
/* Returns the number of non-zero coefficients */
|
||
|
NLuint nlSparseMatrixNNZ( NLSparseMatrix* M) {
|
||
|
NLuint nnz = 0;
|
||
|
NLuint i;
|
||
|
if(M->storage & NL_MATRIX_STORE_ROWS) {
|
||
|
for(i = 0; i<M->m; i++) {
|
||
|
nnz += M->row[i].size;
|
||
|
}
|
||
|
} else if (M->storage & NL_MATRIX_STORE_COLUMNS) {
|
||
|
for(i = 0; i<M->n; i++) {
|
||
|
nnz += M->column[i].size;
|
||
|
}
|
||
|
} else {
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
return nnz;
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixSort( NLSparseMatrix* M) {
|
||
|
NLuint i;
|
||
|
if(M->storage & NL_MATRIX_STORE_ROWS) {
|
||
|
for(i = 0; i<M->m; i++) {
|
||
|
nlRowColumnSort(&(M->row[i]));
|
||
|
}
|
||
|
}
|
||
|
if (M->storage & NL_MATRIX_STORE_COLUMNS) {
|
||
|
for(i = 0; i<M->n; i++) {
|
||
|
nlRowColumnSort(&(M->column[i]));
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixMAddRow(
|
||
|
NLSparseMatrix* M, NLuint i1, double s, NLuint i2
|
||
|
) {
|
||
|
NLuint jj;
|
||
|
NLRowColumn* Ri2 = &(M->row[i2]);
|
||
|
NLCoeff* c = NULL;
|
||
|
|
||
|
nl_debug_assert(i1 < M->m);
|
||
|
nl_debug_assert(i2 < M->m);
|
||
|
|
||
|
for(jj=0; jj<Ri2->size; ++jj) {
|
||
|
c = &(Ri2->coeff[jj]);
|
||
|
nlSparseMatrixAdd(M, i1, c->index, s*c->value);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixScaleRow(
|
||
|
NLSparseMatrix* M, NLuint i, double s
|
||
|
) {
|
||
|
NLuint jj;
|
||
|
NLRowColumn* Ri = &(M->row[i]);
|
||
|
NLCoeff* c = NULL;
|
||
|
|
||
|
nl_assert(M->storage & NL_MATRIX_STORE_ROWS);
|
||
|
nl_assert(!(M->storage & NL_MATRIX_STORE_COLUMNS));
|
||
|
nl_debug_assert(i < M->m);
|
||
|
|
||
|
for(jj=0; jj<Ri->size; ++jj) {
|
||
|
c = &(Ri->coeff[jj]);
|
||
|
c->value *= s;
|
||
|
}
|
||
|
if(i < M->diag_size) {
|
||
|
M->diag[i] *= s;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixZeroRow(
|
||
|
NLSparseMatrix* M, NLuint i
|
||
|
) {
|
||
|
NLRowColumn* Ri = &(M->row[i]);
|
||
|
|
||
|
nl_debug_assert(i < M->m);
|
||
|
|
||
|
Ri->size = 0;
|
||
|
if(i < M->diag_size) {
|
||
|
M->diag[i] = 0.0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/* SparseMatrix x Vector routines, internal helper routines */
|
||
|
|
||
|
static void nlSparseMatrix_mult_rows_symmetric(
|
||
|
NLSparseMatrix* A,
|
||
|
const NLdouble* x,
|
||
|
NLdouble* y
|
||
|
) {
|
||
|
NLuint m = A->m;
|
||
|
NLuint i,ij;
|
||
|
NLCoeff* c = NULL;
|
||
|
for(i=0; i<m; i++) {
|
||
|
NLRowColumn* Ri = &(A->row[i]);
|
||
|
y[i] = 0;
|
||
|
for(ij=0; ij<Ri->size; ++ij) {
|
||
|
c = &(Ri->coeff[ij]);
|
||
|
y[i] += c->value * x[c->index];
|
||
|
if(i != c->index) {
|
||
|
y[c->index] += c->value * x[i];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlSparseMatrix_mult_rows(
|
||
|
NLSparseMatrix* A,
|
||
|
const NLdouble* x,
|
||
|
NLdouble* y
|
||
|
) {
|
||
|
/*
|
||
|
* Note: OpenMP does not like unsigned ints
|
||
|
* (causes some floating point exceptions),
|
||
|
* therefore I use here signed ints for all
|
||
|
* indices.
|
||
|
*/
|
||
|
|
||
|
int m = (int)(A->m);
|
||
|
int i,ij;
|
||
|
NLCoeff* c = NULL;
|
||
|
NLRowColumn* Ri = NULL;
|
||
|
|
||
|
#if defined(_OPENMP)
|
||
|
#pragma omp parallel for private(i,ij,c,Ri)
|
||
|
#endif
|
||
|
|
||
|
for(i=0; i<m; i++) {
|
||
|
Ri = &(A->row[i]);
|
||
|
y[i] = 0;
|
||
|
for(ij=0; ij<(int)(Ri->size); ij++) {
|
||
|
c = &(Ri->coeff[ij]);
|
||
|
y[i] += c->value * x[c->index];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlSparseMatrix_mult_cols_symmetric(
|
||
|
NLSparseMatrix* A,
|
||
|
const NLdouble* x,
|
||
|
NLdouble* y
|
||
|
) {
|
||
|
NLuint n = A->n;
|
||
|
NLuint j,ii;
|
||
|
NLCoeff* c = NULL;
|
||
|
for(j=0; j<n; j++) {
|
||
|
NLRowColumn* Cj = &(A->column[j]);
|
||
|
y[j] = 0;
|
||
|
for(ii=0; ii<Cj->size; ii++) {
|
||
|
c = &(Cj->coeff[ii]);
|
||
|
y[c->index] += c->value * x[j];
|
||
|
if(j != c->index) {
|
||
|
y[j] += c->value * x[c->index];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlSparseMatrix_mult_cols(
|
||
|
NLSparseMatrix* A,
|
||
|
const NLdouble* x,
|
||
|
NLdouble* y
|
||
|
) {
|
||
|
NLuint n = A->n;
|
||
|
NLuint j,ii;
|
||
|
NLCoeff* c = NULL;
|
||
|
NL_CLEAR_ARRAY(NLdouble, y, A->m);
|
||
|
for(j=0; j<n; j++) {
|
||
|
NLRowColumn* Cj = &(A->column[j]);
|
||
|
for(ii=0; ii<Cj->size; ii++) {
|
||
|
c = &(Cj->coeff[ii]);
|
||
|
y[c->index] += c->value * x[j];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixMult(
|
||
|
NLSparseMatrix* A, const NLdouble* x, NLdouble* y
|
||
|
) {
|
||
|
nl_assert(A->type == NL_MATRIX_SPARSE_DYNAMIC);
|
||
|
if(A->storage & NL_MATRIX_STORE_ROWS) {
|
||
|
if(A->storage & NL_MATRIX_STORE_SYMMETRIC) {
|
||
|
nlSparseMatrix_mult_rows_symmetric(A, x, y);
|
||
|
} else {
|
||
|
nlSparseMatrix_mult_rows(A, x, y);
|
||
|
}
|
||
|
} else {
|
||
|
if(A->storage & NL_MATRIX_STORE_SYMMETRIC) {
|
||
|
nlSparseMatrix_mult_cols_symmetric(A, x, y);
|
||
|
} else {
|
||
|
nlSparseMatrix_mult_cols(A, x, y);
|
||
|
}
|
||
|
}
|
||
|
nlHostBlas()->flops += (NLulong)(2*nlSparseMatrixNNZ(A));
|
||
|
}
|
||
|
|
||
|
NLMatrix nlSparseMatrixNew(
|
||
|
NLuint m, NLuint n, NLenum storage
|
||
|
) {
|
||
|
NLSparseMatrix* result = NL_NEW(NLSparseMatrix);
|
||
|
nlSparseMatrixConstruct(result, m, n, storage);
|
||
|
return (NLMatrix)result;
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixConstruct(
|
||
|
NLSparseMatrix* M, NLuint m, NLuint n, NLenum storage
|
||
|
) {
|
||
|
NLuint i;
|
||
|
M->m = m;
|
||
|
M->n = n;
|
||
|
M->type = NL_MATRIX_SPARSE_DYNAMIC;
|
||
|
M->destroy_func = (NLDestroyMatrixFunc)nlSparseMatrixDestroy;
|
||
|
M->mult_func = (NLMultMatrixVectorFunc)nlSparseMatrixMult;
|
||
|
M->storage = storage;
|
||
|
if(storage & NL_MATRIX_STORE_ROWS) {
|
||
|
M->row = NL_NEW_ARRAY(NLRowColumn, m);
|
||
|
M->row_capacity = m;
|
||
|
for(i=0; i<n; i++) {
|
||
|
nlRowColumnConstruct(&(M->row[i]));
|
||
|
}
|
||
|
} else {
|
||
|
M->row = NULL;
|
||
|
M->row_capacity = 0;
|
||
|
}
|
||
|
|
||
|
if(storage & NL_MATRIX_STORE_COLUMNS) {
|
||
|
M->column = NL_NEW_ARRAY(NLRowColumn, n);
|
||
|
M->column_capacity = n;
|
||
|
for(i=0; i<n; i++) {
|
||
|
nlRowColumnConstruct(&(M->column[i]));
|
||
|
}
|
||
|
} else {
|
||
|
M->column = NULL;
|
||
|
M->column_capacity = 0;
|
||
|
}
|
||
|
|
||
|
M->diag_size = MIN(m,n);
|
||
|
M->diag_capacity = M->diag_size;
|
||
|
M->diag = NL_NEW_ARRAY(NLdouble, M->diag_size);
|
||
|
}
|
||
|
|
||
|
static void adjust_diag(NLSparseMatrix* M) {
|
||
|
NLuint new_diag_size = MIN(M->m, M->n);
|
||
|
NLuint i;
|
||
|
if(new_diag_size > M->diag_size) {
|
||
|
if(new_diag_size > M->diag_capacity) {
|
||
|
M->diag_capacity *= 2;
|
||
|
if(M->diag_capacity == 0) {
|
||
|
M->diag_capacity = 16;
|
||
|
}
|
||
|
M->diag = NL_RENEW_ARRAY(double, M->diag, M->diag_capacity);
|
||
|
for(i=M->diag_size; i<new_diag_size; ++i) {
|
||
|
M->diag[i] = 0.0;
|
||
|
}
|
||
|
}
|
||
|
M->diag_size= new_diag_size;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixAddRow( NLSparseMatrix* M) {
|
||
|
++M->m;
|
||
|
if(M->storage & NL_MATRIX_STORE_ROWS) {
|
||
|
if(M->m > M->row_capacity) {
|
||
|
M->row_capacity *= 2;
|
||
|
if(M->row_capacity == 0) {
|
||
|
M->row_capacity = 16;
|
||
|
}
|
||
|
M->row = NL_RENEW_ARRAY(
|
||
|
NLRowColumn, M->row, M->row_capacity
|
||
|
);
|
||
|
}
|
||
|
nlRowColumnConstruct(&(M->row[M->m-1]));
|
||
|
}
|
||
|
adjust_diag(M);
|
||
|
}
|
||
|
|
||
|
void nlSparseMatrixAddColumn( NLSparseMatrix* M) {
|
||
|
++M->n;
|
||
|
if(M->storage & NL_MATRIX_STORE_COLUMNS) {
|
||
|
if(M->n > M->column_capacity) {
|
||
|
M->column_capacity *= 2;
|
||
|
if(M->column_capacity == 0) {
|
||
|
M->column_capacity = 16;
|
||
|
}
|
||
|
M->column = NL_RENEW_ARRAY(
|
||
|
NLRowColumn, M->column, M->column_capacity
|
||
|
);
|
||
|
}
|
||
|
nlRowColumnConstruct(&(M->column[M->n-1]));
|
||
|
}
|
||
|
adjust_diag(M);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
NLMatrix nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix* M) {
|
||
|
NLuint nnz = nlSparseMatrixNNZ(M);
|
||
|
NLuint nslices = 8; /* TODO: get number of cores */
|
||
|
NLuint slice, cur_bound, cur_NNZ, cur_row;
|
||
|
NLuint i,ij,k;
|
||
|
NLuint slice_size = nnz / nslices;
|
||
|
NLCRSMatrix* CRS = NL_NEW(NLCRSMatrix);
|
||
|
|
||
|
nl_assert(M->storage & NL_MATRIX_STORE_ROWS);
|
||
|
|
||
|
if(M->storage & NL_MATRIX_STORE_SYMMETRIC) {
|
||
|
nl_assert(M->m == M->n);
|
||
|
nlCRSMatrixConstructSymmetric(CRS, M->n, nnz);
|
||
|
} else {
|
||
|
nlCRSMatrixConstruct(CRS, M->m, M->n, nnz, nslices);
|
||
|
}
|
||
|
|
||
|
nlSparseMatrixSort(M);
|
||
|
/* Convert matrix to CRS format */
|
||
|
k=0;
|
||
|
for(i=0; i<M->m; ++i) {
|
||
|
NLRowColumn* Ri = &(M->row[i]);
|
||
|
CRS->rowptr[i] = k;
|
||
|
for(ij=0; ij<Ri->size; ij++) {
|
||
|
NLCoeff* c = &(Ri->coeff[ij]);
|
||
|
CRS->val[k] = c->value;
|
||
|
CRS->colind[k] = c->index;
|
||
|
++k;
|
||
|
}
|
||
|
}
|
||
|
CRS->rowptr[M->m] = k;
|
||
|
|
||
|
/* Create "slices" to be used by parallel sparse matrix vector product */
|
||
|
if(CRS->sliceptr != NULL) {
|
||
|
cur_bound = slice_size;
|
||
|
cur_NNZ = 0;
|
||
|
cur_row = 0;
|
||
|
CRS->sliceptr[0]=0;
|
||
|
for(slice=1; slice<nslices; ++slice) {
|
||
|
while(cur_NNZ < cur_bound && cur_row < M->m) {
|
||
|
++cur_row;
|
||
|
cur_NNZ += CRS->rowptr[cur_row+1] - CRS->rowptr[cur_row];
|
||
|
}
|
||
|
CRS->sliceptr[slice] = cur_row;
|
||
|
cur_bound += slice_size;
|
||
|
}
|
||
|
CRS->sliceptr[nslices]=M->m;
|
||
|
}
|
||
|
return (NLMatrix)CRS;
|
||
|
}
|
||
|
|
||
|
NLMatrix nlCRSMatrixNewFromSparseMatrixSymmetric(NLSparseMatrix* M) {
|
||
|
NLuint nnz;
|
||
|
NLuint i,j,jj,k;
|
||
|
NLCRSMatrix* CRS = NL_NEW(NLCRSMatrix);
|
||
|
|
||
|
nl_assert(M->storage & NL_MATRIX_STORE_ROWS);
|
||
|
nl_assert(M->m == M->n);
|
||
|
|
||
|
nlSparseMatrixSort(M);
|
||
|
|
||
|
if(M->storage & NL_MATRIX_STORE_SYMMETRIC) {
|
||
|
nnz = nlSparseMatrixNNZ(M);
|
||
|
} else {
|
||
|
nnz = 0;
|
||
|
for(i=0; i<M->n; ++i) {
|
||
|
NLRowColumn* Ri = &M->row[i];
|
||
|
for(jj=0; jj<Ri->size; ++jj) {
|
||
|
j = Ri->coeff[jj].index;
|
||
|
if(j <= i) {
|
||
|
++nnz;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
nlCRSMatrixConstructSymmetric(CRS, M->n, nnz);
|
||
|
|
||
|
k=0;
|
||
|
for(i=0; i<M->m; ++i) {
|
||
|
NLRowColumn* Ri = &(M->row[i]);
|
||
|
CRS->rowptr[i] = k;
|
||
|
for(jj=0; jj<Ri->size; ++jj) {
|
||
|
j = Ri->coeff[jj].index;
|
||
|
if((M->storage & NL_MATRIX_STORE_SYMMETRIC)) {
|
||
|
nl_debug_assert(j <= i);
|
||
|
}
|
||
|
if(j <= i) {
|
||
|
CRS->val[k] = Ri->coeff[jj].value;
|
||
|
CRS->colind[k] = j;
|
||
|
++k;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
CRS->rowptr[M->m] = k;
|
||
|
|
||
|
return (NLMatrix)CRS;
|
||
|
}
|
||
|
|
||
|
|
||
|
void nlMatrixCompress(NLMatrix* M) {
|
||
|
NLMatrix CRS = NULL;
|
||
|
if((*M)->type != NL_MATRIX_SPARSE_DYNAMIC) {
|
||
|
return;
|
||
|
}
|
||
|
CRS = nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)*M);
|
||
|
nlDeleteMatrix(*M);
|
||
|
*M = CRS;
|
||
|
}
|
||
|
|
||
|
NLuint nlMatrixNNZ(NLMatrix M) {
|
||
|
if(M->type == NL_MATRIX_SPARSE_DYNAMIC) {
|
||
|
return nlSparseMatrixNNZ((NLSparseMatrix*)M);
|
||
|
} else if(M->type == NL_MATRIX_CRS) {
|
||
|
return nlCRSMatrixNNZ((NLCRSMatrix*)M);
|
||
|
}
|
||
|
return M->m * M->n;
|
||
|
}
|
||
|
|
||
|
NLMatrix nlMatrixFactorize(NLMatrix M, NLenum solver) {
|
||
|
NLMatrix result = NULL;
|
||
|
switch(solver) {
|
||
|
case NL_SUPERLU_EXT:
|
||
|
case NL_PERM_SUPERLU_EXT:
|
||
|
case NL_SYMMETRIC_SUPERLU_EXT:
|
||
|
result = nlMatrixFactorize_SUPERLU(M,solver);
|
||
|
break;
|
||
|
case NL_CHOLMOD_EXT:
|
||
|
result = nlMatrixFactorize_CHOLMOD(M,solver);
|
||
|
break;
|
||
|
default:
|
||
|
nlError("nlMatrixFactorize","unknown solver");
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
NLenum type;
|
||
|
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
|
||
|
NLMatrixFunc matrix_func;
|
||
|
} NLFunctionMatrix;
|
||
|
|
||
|
static void nlFunctionMatrixDestroy(NLFunctionMatrix* M) {
|
||
|
(void)M; /* to avoid 'unused parameter' warning */
|
||
|
/*
|
||
|
* Nothing special to do,
|
||
|
* there is no dynamic allocated mem.
|
||
|
*/
|
||
|
}
|
||
|
|
||
|
static void nlFunctionMatrixMult(
|
||
|
NLFunctionMatrix* M, const NLdouble* x, NLdouble* y
|
||
|
) {
|
||
|
M->matrix_func(x,y);
|
||
|
}
|
||
|
|
||
|
NLMatrix nlMatrixNewFromFunction(NLuint m, NLuint n, NLMatrixFunc func) {
|
||
|
NLFunctionMatrix* result = NL_NEW(NLFunctionMatrix);
|
||
|
result->m = m;
|
||
|
result->n = n;
|
||
|
result->type = NL_MATRIX_FUNCTION;
|
||
|
result->destroy_func = (NLDestroyMatrixFunc)nlFunctionMatrixDestroy;
|
||
|
result->mult_func = (NLMultMatrixVectorFunc)nlFunctionMatrixMult;
|
||
|
result->matrix_func = func;
|
||
|
return (NLMatrix)result;
|
||
|
}
|
||
|
|
||
|
NLMatrixFunc nlMatrixGetFunction(NLMatrix M) {
|
||
|
if(M == NULL) {
|
||
|
return NULL;
|
||
|
}
|
||
|
if(M->type != NL_MATRIX_FUNCTION) {
|
||
|
return NULL;
|
||
|
}
|
||
|
return ((NLFunctionMatrix*)M)->matrix_func;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
NLenum type;
|
||
|
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
|
||
|
NLMatrixFunc matrix_func;
|
||
|
|
||
|
NLMatrix M;
|
||
|
|
||
|
NLboolean owns_M;
|
||
|
|
||
|
NLMatrix N;
|
||
|
|
||
|
NLboolean owns_N;
|
||
|
|
||
|
NLdouble* work;
|
||
|
} NLMatrixProduct;
|
||
|
|
||
|
|
||
|
static void nlMatrixProductDestroy(NLMatrixProduct* P) {
|
||
|
NL_DELETE_ARRAY(P->work);
|
||
|
if(P->owns_M) {
|
||
|
nlDeleteMatrix(P->M); P->M = NULL;
|
||
|
}
|
||
|
if(P->owns_N) {
|
||
|
nlDeleteMatrix(P->N); P->N = NULL;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlMatrixProductMult(
|
||
|
NLMatrixProduct* P, const NLdouble* x, NLdouble* y
|
||
|
) {
|
||
|
nlMultMatrixVector(P->N, x, P->work);
|
||
|
nlMultMatrixVector(P->M, P->work, y);
|
||
|
}
|
||
|
|
||
|
NLMatrix nlMatrixNewFromProduct(
|
||
|
NLMatrix M, NLboolean owns_M, NLMatrix N, NLboolean owns_N
|
||
|
) {
|
||
|
NLMatrixProduct* result = NL_NEW(NLMatrixProduct);
|
||
|
nl_assert(M->n == N->m);
|
||
|
result->m = M->m;
|
||
|
result->n = N->n;
|
||
|
result->type = NL_MATRIX_OTHER;
|
||
|
result->work = NL_NEW_ARRAY(NLdouble,N->m);
|
||
|
result->destroy_func = (NLDestroyMatrixFunc)nlMatrixProductDestroy;
|
||
|
result->mult_func = (NLMultMatrixVectorFunc)nlMatrixProductMult;
|
||
|
result->M = M;
|
||
|
result->owns_M = owns_M;
|
||
|
result->N = N;
|
||
|
result->owns_N = owns_N;
|
||
|
return (NLMatrix)result;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/******* extracted from nl_context.c *******/
|
||
|
|
||
|
|
||
|
NLContextStruct* nlCurrentContext = NULL;
|
||
|
|
||
|
NLContext nlNewContext() {
|
||
|
NLContextStruct* result = NL_NEW(NLContextStruct);
|
||
|
result->state = NL_STATE_INITIAL;
|
||
|
result->solver = NL_SOLVER_DEFAULT;
|
||
|
result->max_iterations = 100;
|
||
|
result->threshold = 1e-6;
|
||
|
result->omega = 1.5;
|
||
|
result->row_scaling = 1.0;
|
||
|
result->inner_iterations = 5;
|
||
|
result->solver_func = nlDefaultSolver;
|
||
|
result->progress_func = NULL;
|
||
|
result->verbose = NL_FALSE;
|
||
|
result->nb_systems = 1;
|
||
|
result->matrix_mode = NL_STIFFNESS_MATRIX;
|
||
|
nlMakeCurrent(result);
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
void nlDeleteContext(NLContext context_in) {
|
||
|
NLContextStruct* context = (NLContextStruct*)(context_in);
|
||
|
if(nlCurrentContext == context) {
|
||
|
nlCurrentContext = NULL;
|
||
|
}
|
||
|
|
||
|
nlDeleteMatrix(context->M);
|
||
|
context->M = NULL;
|
||
|
|
||
|
nlDeleteMatrix(context->P);
|
||
|
context->P = NULL;
|
||
|
|
||
|
nlDeleteMatrix(context->B);
|
||
|
context->B = NULL;
|
||
|
|
||
|
nlRowColumnDestroy(&context->af);
|
||
|
nlRowColumnDestroy(&context->al);
|
||
|
|
||
|
NL_DELETE_ARRAY(context->variable_value);
|
||
|
NL_DELETE_ARRAY(context->variable_buffer);
|
||
|
NL_DELETE_ARRAY(context->variable_is_locked);
|
||
|
NL_DELETE_ARRAY(context->variable_index);
|
||
|
|
||
|
NL_DELETE_ARRAY(context->x);
|
||
|
NL_DELETE_ARRAY(context->b);
|
||
|
NL_DELETE_ARRAY(context->right_hand_side);
|
||
|
|
||
|
NL_DELETE_ARRAY(context->eigen_value);
|
||
|
|
||
|
#ifdef NL_PARANOID
|
||
|
NL_CLEAR(NLContextStruct, context);
|
||
|
#endif
|
||
|
NL_DELETE(context);
|
||
|
}
|
||
|
|
||
|
void nlMakeCurrent(NLContext context) {
|
||
|
nlCurrentContext = (NLContextStruct*)(context);
|
||
|
}
|
||
|
|
||
|
NLContext nlGetCurrent() {
|
||
|
return nlCurrentContext;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Finite state automaton */
|
||
|
|
||
|
void nlCheckState(NLenum state) {
|
||
|
nl_assert(nlCurrentContext->state == state);
|
||
|
}
|
||
|
|
||
|
void nlTransition(NLenum from_state, NLenum to_state) {
|
||
|
nlCheckState(from_state);
|
||
|
nlCurrentContext->state = to_state;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Preconditioner setup and default solver */
|
||
|
|
||
|
static void nlSetupPreconditioner() {
|
||
|
/* Check compatibility between solver and preconditioner */
|
||
|
if(
|
||
|
nlCurrentContext->solver == NL_BICGSTAB &&
|
||
|
nlCurrentContext->preconditioner == NL_PRECOND_SSOR
|
||
|
) {
|
||
|
nlWarning(
|
||
|
"nlSolve",
|
||
|
"cannot use SSOR preconditioner with non-symmetric matrix, "
|
||
|
"switching to Jacobi"
|
||
|
);
|
||
|
nlCurrentContext->preconditioner = NL_PRECOND_JACOBI;
|
||
|
}
|
||
|
if(
|
||
|
nlCurrentContext->solver == NL_GMRES &&
|
||
|
nlCurrentContext->preconditioner != NL_PRECOND_NONE
|
||
|
) {
|
||
|
nlWarning("nlSolve", "Preconditioner not implemented yet for GMRES");
|
||
|
nlCurrentContext->preconditioner = NL_PRECOND_NONE;
|
||
|
}
|
||
|
if(
|
||
|
nlCurrentContext->solver == NL_SUPERLU_EXT &&
|
||
|
nlCurrentContext->preconditioner != NL_PRECOND_NONE
|
||
|
) {
|
||
|
nlWarning("nlSolve", "Preconditioner not implemented yet for SUPERLU");
|
||
|
nlCurrentContext->preconditioner = NL_PRECOND_NONE;
|
||
|
}
|
||
|
if(
|
||
|
nlCurrentContext->solver == NL_CHOLMOD_EXT &&
|
||
|
nlCurrentContext->preconditioner != NL_PRECOND_NONE
|
||
|
) {
|
||
|
nlWarning("nlSolve", "Preconditioner not implemented yet for CHOLMOD");
|
||
|
nlCurrentContext->preconditioner = NL_PRECOND_NONE;
|
||
|
}
|
||
|
if(
|
||
|
nlCurrentContext->solver == NL_PERM_SUPERLU_EXT &&
|
||
|
nlCurrentContext->preconditioner != NL_PRECOND_NONE
|
||
|
) {
|
||
|
nlWarning(
|
||
|
"nlSolve", "Preconditioner not implemented yet for PERMSUPERLU"
|
||
|
);
|
||
|
nlCurrentContext->preconditioner = NL_PRECOND_NONE;
|
||
|
}
|
||
|
if(
|
||
|
nlCurrentContext->solver == NL_SYMMETRIC_SUPERLU_EXT &&
|
||
|
nlCurrentContext->preconditioner != NL_PRECOND_NONE
|
||
|
) {
|
||
|
nlWarning(
|
||
|
"nlSolve", "Preconditioner not implemented yet for PERMSUPERLU"
|
||
|
);
|
||
|
nlCurrentContext->preconditioner = NL_PRECOND_NONE;
|
||
|
}
|
||
|
|
||
|
nlDeleteMatrix(nlCurrentContext->P);
|
||
|
nlCurrentContext->P = NULL;
|
||
|
|
||
|
switch(nlCurrentContext->preconditioner) {
|
||
|
case NL_PRECOND_NONE:
|
||
|
break;
|
||
|
case NL_PRECOND_JACOBI:
|
||
|
nlCurrentContext->P = nlNewJacobiPreconditioner(nlCurrentContext->M);
|
||
|
break;
|
||
|
case NL_PRECOND_SSOR:
|
||
|
nlCurrentContext->P = nlNewSSORPreconditioner(
|
||
|
nlCurrentContext->M,nlCurrentContext->omega
|
||
|
);
|
||
|
break;
|
||
|
case NL_PRECOND_USER:
|
||
|
break;
|
||
|
default:
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
|
||
|
if(nlCurrentContext->preconditioner != NL_PRECOND_SSOR) {
|
||
|
if(getenv("NL_LOW_MEM") == NULL) {
|
||
|
nlMatrixCompress(&nlCurrentContext->M);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static NLboolean nlSolveDirect() {
|
||
|
NLdouble* b = nlCurrentContext->b;
|
||
|
NLdouble* x = nlCurrentContext->x;
|
||
|
NLuint n = nlCurrentContext->n;
|
||
|
NLuint k;
|
||
|
|
||
|
NLMatrix F = nlMatrixFactorize(
|
||
|
nlCurrentContext->M, nlCurrentContext->solver
|
||
|
);
|
||
|
if(F == NULL) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
for(k=0; k<nlCurrentContext->nb_systems; ++k) {
|
||
|
nlMultMatrixVector(F, b, x);
|
||
|
b += n;
|
||
|
x += n;
|
||
|
}
|
||
|
nlDeleteMatrix(F);
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
static NLboolean nlSolveIterative() {
|
||
|
NLboolean use_CUDA = NL_FALSE;
|
||
|
NLdouble* b = nlCurrentContext->b;
|
||
|
NLdouble* x = nlCurrentContext->x;
|
||
|
NLuint n = nlCurrentContext->n;
|
||
|
NLuint k;
|
||
|
NLBlas_t blas = nlHostBlas();
|
||
|
NLMatrix M = nlCurrentContext->M;
|
||
|
NLMatrix P = nlCurrentContext->P;
|
||
|
|
||
|
/*
|
||
|
* For CUDA: it is implemented for
|
||
|
* all iterative solvers except GMRES
|
||
|
* Jacobi preconditioner
|
||
|
*/
|
||
|
if(nlExtensionIsInitialized_CUDA() &&
|
||
|
(nlCurrentContext->solver != NL_GMRES) &&
|
||
|
(nlCurrentContext->preconditioner == NL_PRECOND_NONE ||
|
||
|
nlCurrentContext->preconditioner == NL_PRECOND_JACOBI)
|
||
|
) {
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
nl_printf("Using CUDA\n");
|
||
|
}
|
||
|
use_CUDA = NL_TRUE;
|
||
|
blas = nlCUDABlas();
|
||
|
if(nlCurrentContext->preconditioner == NL_PRECOND_JACOBI) {
|
||
|
P = nlCUDAJacobiPreconditionerNewFromCRSMatrix(M);
|
||
|
}
|
||
|
M = nlCUDAMatrixNewFromCRSMatrix(M);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* We do not count CUDA transfers and CUDA matrix construction
|
||
|
* when estimating GFlops
|
||
|
*/
|
||
|
nlCurrentContext->start_time = nlCurrentTime();
|
||
|
nlBlasResetStats(blas);
|
||
|
|
||
|
for(k=0; k<nlCurrentContext->nb_systems; ++k) {
|
||
|
nlSolveSystemIterative(
|
||
|
blas,
|
||
|
M,
|
||
|
P,
|
||
|
b,
|
||
|
x,
|
||
|
nlCurrentContext->solver,
|
||
|
nlCurrentContext->threshold,
|
||
|
nlCurrentContext->max_iterations,
|
||
|
nlCurrentContext->inner_iterations
|
||
|
);
|
||
|
b += n;
|
||
|
x += n;
|
||
|
}
|
||
|
|
||
|
nlCurrentContext->flops += blas->flops;
|
||
|
|
||
|
if(use_CUDA) {
|
||
|
nlDeleteMatrix(M);
|
||
|
nlDeleteMatrix(P);
|
||
|
}
|
||
|
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
NLboolean nlDefaultSolver() {
|
||
|
NLboolean result = NL_TRUE;
|
||
|
nlSetupPreconditioner();
|
||
|
switch(nlCurrentContext->solver) {
|
||
|
case NL_CG:
|
||
|
case NL_BICGSTAB:
|
||
|
case NL_GMRES: {
|
||
|
result = nlSolveIterative();
|
||
|
} break;
|
||
|
|
||
|
case NL_SUPERLU_EXT:
|
||
|
case NL_PERM_SUPERLU_EXT:
|
||
|
case NL_SYMMETRIC_SUPERLU_EXT:
|
||
|
case NL_CHOLMOD_EXT: {
|
||
|
result = nlSolveDirect();
|
||
|
} break;
|
||
|
default:
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
/******* extracted from nl_blas.c *******/
|
||
|
|
||
|
|
||
|
/*
|
||
|
Many warnings about const double* converted to
|
||
|
double* when calling BLAS functions that do not
|
||
|
have the const qualifier in their prototypes.
|
||
|
*/
|
||
|
#ifdef __clang__
|
||
|
#pragma GCC diagnostic ignored "-Wcast-qual"
|
||
|
#endif
|
||
|
|
||
|
#ifndef NL_FORTRAN_WRAP
|
||
|
#define NL_FORTRAN_WRAP(x) x##_
|
||
|
#endif
|
||
|
|
||
|
#ifdef NL_USE_ATLAS
|
||
|
int NL_FORTRAN_WRAP(xerbla)(char *srname, int *info) {
|
||
|
nl_printf(stderr, "** On entry to %6s, parameter number %2d had an illegal value\n",
|
||
|
srname, *info
|
||
|
);
|
||
|
return 0;
|
||
|
}
|
||
|
#ifndef NL_USE_BLAS
|
||
|
#define NL_USE_BLAS
|
||
|
#endif
|
||
|
#endif
|
||
|
|
||
|
#ifdef NL_USE_SUPERLU
|
||
|
#ifndef NL_USE_BLAS
|
||
|
#define NL_USE_BLAS
|
||
|
/*
|
||
|
* The BLAS included in SuperLU does not have DTPSV,
|
||
|
* we use the DTPSV embedded in OpenNL.
|
||
|
*/
|
||
|
#define NEEDS_DTPSV
|
||
|
#endif
|
||
|
#endif
|
||
|
|
||
|
#ifndef NL_USE_BLAS
|
||
|
#define NEEDS_DTPSV
|
||
|
#endif
|
||
|
|
||
|
|
||
|
|
||
|
/* BLAS routines */
|
||
|
/* copy-pasted from CBLAS (i.e. generated from f2c) */
|
||
|
|
||
|
/*
|
||
|
* lsame
|
||
|
* xerbla
|
||
|
* daxpy
|
||
|
* ddot
|
||
|
* dscal
|
||
|
* dnrm2
|
||
|
* dcopy
|
||
|
* dgemv
|
||
|
* dtpsv
|
||
|
*/
|
||
|
|
||
|
|
||
|
|
||
|
typedef NLint integer ;
|
||
|
typedef NLdouble doublereal ;
|
||
|
typedef NLboolean logical ;
|
||
|
typedef NLint ftnlen ;
|
||
|
|
||
|
|
||
|
#ifndef max
|
||
|
#define max(x,y) ((x) > (y) ? (x) : (y))
|
||
|
#endif
|
||
|
|
||
|
#ifndef NL_USE_BLAS
|
||
|
|
||
|
static int NL_FORTRAN_WRAP(lsame)(const char *ca, const char *cb)
|
||
|
{
|
||
|
/* -- LAPACK auxiliary routine (version 2.0) --
|
||
|
Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
|
||
|
Courant Institute, Argonne National Lab, and Rice University
|
||
|
September 30, 1994
|
||
|
|
||
|
Purpose
|
||
|
=======
|
||
|
|
||
|
LSAME returns .TRUE. if CA is the same letter as CB regardless of case.
|
||
|
|
||
|
Arguments
|
||
|
=========
|
||
|
|
||
|
CA (input) CHARACTER*1
|
||
|
CB (input) CHARACTER*1
|
||
|
CA and CB specify the single characters to be compared.
|
||
|
|
||
|
=====================================================================
|
||
|
*/
|
||
|
|
||
|
/* System generated locals */
|
||
|
int ret_val;
|
||
|
|
||
|
/* Local variables */
|
||
|
int inta, intb, zcode;
|
||
|
|
||
|
ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
|
||
|
if (ret_val) {
|
||
|
return ret_val;
|
||
|
}
|
||
|
|
||
|
/* Now test for equivalence if both characters are alphabetic. */
|
||
|
|
||
|
zcode = 'Z';
|
||
|
|
||
|
/* Use 'Z' rather than 'A' so that ASCII can be detected on Prime
|
||
|
machines, on which ICHAR returns a value with bit 8 set.
|
||
|
ICHAR('A') on Prime machines returns 193 which is the same as
|
||
|
ICHAR('A') on an EBCDIC machine. */
|
||
|
|
||
|
inta = *(unsigned char *)ca;
|
||
|
intb = *(unsigned char *)cb;
|
||
|
|
||
|
if (zcode == 90 || zcode == 122) {
|
||
|
/* ASCII is assumed - ZCODE is the ASCII code of either lower or
|
||
|
upper case 'Z'. */
|
||
|
if (inta >= 97 && inta <= 122) inta += -32;
|
||
|
if (intb >= 97 && intb <= 122) intb += -32;
|
||
|
|
||
|
} else if (zcode == 233 || zcode == 169) {
|
||
|
/* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or
|
||
|
upper case 'Z'. */
|
||
|
if ((inta >= 129 && inta <= 137) ||
|
||
|
(inta >= 145 && inta <= 153) ||
|
||
|
(inta >= 162 && inta <= 169)
|
||
|
)
|
||
|
inta += 64;
|
||
|
if (
|
||
|
(intb >= 129 && intb <= 137) ||
|
||
|
(intb >= 145 && intb <= 153) ||
|
||
|
(intb >= 162 && intb <= 169)
|
||
|
)
|
||
|
intb += 64;
|
||
|
} else if (zcode == 218 || zcode == 250) {
|
||
|
/* ASCII is assumed, on Prime machines - ZCODE is the ASCII code
|
||
|
plus 128 of either lower or upper case 'Z'. */
|
||
|
if (inta >= 225 && inta <= 250) inta += -32;
|
||
|
if (intb >= 225 && intb <= 250) intb += -32;
|
||
|
}
|
||
|
ret_val = inta == intb;
|
||
|
return ret_val;
|
||
|
|
||
|
} /* lsame_ */
|
||
|
|
||
|
/* Subroutine */ static int NL_FORTRAN_WRAP(xerbla)(const char *srname, int *info)
|
||
|
{
|
||
|
/* -- LAPACK auxiliary routine (version 2.0) --
|
||
|
Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
|
||
|
Courant Institute, Argonne National Lab, and Rice University
|
||
|
September 30, 1994
|
||
|
|
||
|
|
||
|
Purpose
|
||
|
=======
|
||
|
|
||
|
XERBLA is an error handler for the LAPACK routines.
|
||
|
It is called by an LAPACK routine if an input parameter has an
|
||
|
invalid value. A message is printed and execution stops.
|
||
|
|
||
|
Installers may consider modifying the STOP statement in order to
|
||
|
call system-specific exception-handling facilities.
|
||
|
|
||
|
Arguments
|
||
|
=========
|
||
|
|
||
|
SRNAME (input) CHARACTER*6
|
||
|
The name of the routine which called XERBLA.
|
||
|
|
||
|
INFO (input) INT
|
||
|
The position of the invalid parameter in the parameter list
|
||
|
|
||
|
of the calling routine.
|
||
|
|
||
|
=====================================================================
|
||
|
*/
|
||
|
|
||
|
nl_fprintf(stderr, "** On entry to %6s, parameter number %2d had an illegal value\n",
|
||
|
srname, *info);
|
||
|
|
||
|
/* End of XERBLA */
|
||
|
|
||
|
return 0;
|
||
|
} /* xerbla_ */
|
||
|
|
||
|
|
||
|
/* Subroutine */ static int NL_FORTRAN_WRAP(daxpy)(integer *n, doublereal *da, doublereal *dx,
|
||
|
integer *incx, doublereal *dy, integer *incy)
|
||
|
{
|
||
|
|
||
|
|
||
|
/* System generated locals */
|
||
|
integer i__1;
|
||
|
|
||
|
/* Local variables */
|
||
|
static integer i, m, ix, iy, mp1;
|
||
|
|
||
|
|
||
|
/* constant times a vector plus a vector.
|
||
|
uses unrolled loops for increments equal to one.
|
||
|
jack dongarra, linpack, 3/11/78.
|
||
|
modified 12/3/93, array(1) declarations changed to array(*)
|
||
|
|
||
|
|
||
|
|
||
|
Parameter adjustments
|
||
|
Function Body */
|
||
|
#define DY(I) dy[(I)-1]
|
||
|
#define DX(I) dx[(I)-1]
|
||
|
|
||
|
|
||
|
if (*n <= 0) {
|
||
|
return 0;
|
||
|
}
|
||
|
if (*da == 0.) {
|
||
|
return 0;
|
||
|
}
|
||
|
if (*incx == 1 && *incy == 1) {
|
||
|
goto L20;
|
||
|
}
|
||
|
|
||
|
/* code for unequal increments or equal increments
|
||
|
not equal to 1 */
|
||
|
|
||
|
ix = 1;
|
||
|
iy = 1;
|
||
|
if (*incx < 0) {
|
||
|
ix = (-(*n) + 1) * *incx + 1;
|
||
|
}
|
||
|
if (*incy < 0) {
|
||
|
iy = (-(*n) + 1) * *incy + 1;
|
||
|
}
|
||
|
i__1 = *n;
|
||
|
for (i = 1; i <= *n; ++i) {
|
||
|
DY(iy) += *da * DX(ix);
|
||
|
ix += *incx;
|
||
|
iy += *incy;
|
||
|
/* L10: */
|
||
|
}
|
||
|
return 0;
|
||
|
|
||
|
/* code for both increments equal to 1
|
||
|
|
||
|
|
||
|
clean-up loop */
|
||
|
|
||
|
L20:
|
||
|
m = *n % 4;
|
||
|
if (m == 0) {
|
||
|
goto L40;
|
||
|
}
|
||
|
i__1 = m;
|
||
|
for (i = 1; i <= m; ++i) {
|
||
|
DY(i) += *da * DX(i);
|
||
|
/* L30: */
|
||
|
}
|
||
|
if (*n < 4) {
|
||
|
return 0;
|
||
|
}
|
||
|
L40:
|
||
|
mp1 = m + 1;
|
||
|
i__1 = *n;
|
||
|
for (i = mp1; i <= *n; i += 4) {
|
||
|
DY(i) += *da * DX(i);
|
||
|
DY(i + 1) += *da * DX(i + 1);
|
||
|
DY(i + 2) += *da * DX(i + 2);
|
||
|
DY(i + 3) += *da * DX(i + 3);
|
||
|
/* L50: */
|
||
|
}
|
||
|
nl_arg_used(i__1);
|
||
|
return 0;
|
||
|
} /* daxpy_ */
|
||
|
#undef DY
|
||
|
#undef DX
|
||
|
|
||
|
|
||
|
static doublereal NL_FORTRAN_WRAP(ddot)(integer *n, doublereal *dx, integer *incx, doublereal *dy,
|
||
|
integer *incy)
|
||
|
{
|
||
|
|
||
|
/* System generated locals */
|
||
|
integer i__1;
|
||
|
doublereal ret_val;
|
||
|
|
||
|
/* Local variables */
|
||
|
static integer i, m;
|
||
|
static doublereal dtemp;
|
||
|
static integer ix, iy, mp1;
|
||
|
|
||
|
|
||
|
/* forms the dot product of two vectors.
|
||
|
uses unrolled loops for increments equal to one.
|
||
|
jack dongarra, linpack, 3/11/78.
|
||
|
modified 12/3/93, array(1) declarations changed to array(*)
|
||
|
|
||
|
|
||
|
|
||
|
Parameter adjustments
|
||
|
Function Body */
|
||
|
#define DY(I) dy[(I)-1]
|
||
|
#define DX(I) dx[(I)-1]
|
||
|
|
||
|
ret_val = 0.;
|
||
|
dtemp = 0.;
|
||
|
if (*n <= 0) {
|
||
|
return ret_val;
|
||
|
}
|
||
|
if (*incx == 1 && *incy == 1) {
|
||
|
goto L20;
|
||
|
}
|
||
|
|
||
|
/* code for unequal increments or equal increments
|
||
|
not equal to 1 */
|
||
|
|
||
|
ix = 1;
|
||
|
iy = 1;
|
||
|
if (*incx < 0) {
|
||
|
ix = (-(*n) + 1) * *incx + 1;
|
||
|
}
|
||
|
if (*incy < 0) {
|
||
|
iy = (-(*n) + 1) * *incy + 1;
|
||
|
}
|
||
|
i__1 = *n;
|
||
|
for (i = 1; i <= *n; ++i) {
|
||
|
dtemp += DX(ix) * DY(iy);
|
||
|
ix += *incx;
|
||
|
iy += *incy;
|
||
|
/* L10: */
|
||
|
}
|
||
|
ret_val = dtemp;
|
||
|
return ret_val;
|
||
|
|
||
|
/* code for both increments equal to 1
|
||
|
|
||
|
|
||
|
clean-up loop */
|
||
|
|
||
|
L20:
|
||
|
m = *n % 5;
|
||
|
if (m == 0) {
|
||
|
goto L40;
|
||
|
}
|
||
|
i__1 = m;
|
||
|
for (i = 1; i <= m; ++i) {
|
||
|
dtemp += DX(i) * DY(i);
|
||
|
/* L30: */
|
||
|
}
|
||
|
if (*n < 5) {
|
||
|
goto L60;
|
||
|
}
|
||
|
L40:
|
||
|
mp1 = m + 1;
|
||
|
i__1 = *n;
|
||
|
for (i = mp1; i <= *n; i += 5) {
|
||
|
dtemp = dtemp + DX(i) * DY(i) + DX(i + 1) * DY(i + 1) + DX(i + 2) *
|
||
|
DY(i + 2) + DX(i + 3) * DY(i + 3) + DX(i + 4) * DY(i + 4);
|
||
|
/* L50: */
|
||
|
}
|
||
|
L60:
|
||
|
ret_val = dtemp;
|
||
|
nl_arg_used(i__1);
|
||
|
return ret_val;
|
||
|
} /* ddot_ */
|
||
|
#undef DY
|
||
|
#undef DX
|
||
|
|
||
|
/* Subroutine */ static int NL_FORTRAN_WRAP(dscal)(integer *n, doublereal *da, doublereal *dx,
|
||
|
integer *incx)
|
||
|
{
|
||
|
|
||
|
|
||
|
/* System generated locals */
|
||
|
integer i__1, i__2;
|
||
|
|
||
|
/* Local variables */
|
||
|
static integer i, m, nincx, mp1;
|
||
|
|
||
|
|
||
|
/* scales a vector by a constant.
|
||
|
uses unrolled loops for increment equal to one.
|
||
|
jack dongarra, linpack, 3/11/78.
|
||
|
modified 3/93 to return if incx .le. 0.
|
||
|
modified 12/3/93, array(1) declarations changed to array(*)
|
||
|
|
||
|
|
||
|
|
||
|
Parameter adjustments
|
||
|
Function Body */
|
||
|
#ifdef DX
|
||
|
#undef DX
|
||
|
#endif
|
||
|
#define DX(I) dx[(I)-1]
|
||
|
|
||
|
|
||
|
if (*n <= 0 || *incx <= 0) {
|
||
|
return 0;
|
||
|
}
|
||
|
if (*incx == 1) {
|
||
|
goto L20;
|
||
|
}
|
||
|
|
||
|
/* code for increment not equal to 1 */
|
||
|
|
||
|
nincx = *n * *incx;
|
||
|
i__1 = nincx;
|
||
|
i__2 = *incx;
|
||
|
for (i = 1; *incx < 0 ? i >= nincx : i <= nincx; i += *incx) {
|
||
|
DX(i) = *da * DX(i);
|
||
|
/* L10: */
|
||
|
}
|
||
|
return 0;
|
||
|
|
||
|
/* code for increment equal to 1
|
||
|
|
||
|
|
||
|
clean-up loop */
|
||
|
|
||
|
L20:
|
||
|
m = *n % 5;
|
||
|
if (m == 0) {
|
||
|
goto L40;
|
||
|
}
|
||
|
i__2 = m;
|
||
|
for (i = 1; i <= m; ++i) {
|
||
|
DX(i) = *da * DX(i);
|
||
|
/* L30: */
|
||
|
}
|
||
|
if (*n < 5) {
|
||
|
return 0;
|
||
|
}
|
||
|
L40:
|
||
|
mp1 = m + 1;
|
||
|
i__2 = *n;
|
||
|
for (i = mp1; i <= *n; i += 5) {
|
||
|
DX(i) = *da * DX(i);
|
||
|
DX(i + 1) = *da * DX(i + 1);
|
||
|
DX(i + 2) = *da * DX(i + 2);
|
||
|
DX(i + 3) = *da * DX(i + 3);
|
||
|
DX(i + 4) = *da * DX(i + 4);
|
||
|
/* L50: */
|
||
|
}
|
||
|
nl_arg_used(i__1);
|
||
|
nl_arg_used(i__2);
|
||
|
return 0;
|
||
|
} /* dscal_ */
|
||
|
#undef DX
|
||
|
|
||
|
static doublereal NL_FORTRAN_WRAP(dnrm2)(integer *n, doublereal *x, integer *incx)
|
||
|
{
|
||
|
|
||
|
|
||
|
/* System generated locals */
|
||
|
integer i__1, i__2;
|
||
|
doublereal ret_val, d__1;
|
||
|
|
||
|
/* Builtin functions */
|
||
|
/* BL: already declared in the included <math.h>,
|
||
|
we do not need it here. */
|
||
|
/*double sqrt(doublereal); */
|
||
|
|
||
|
/* Local variables */
|
||
|
static doublereal norm, scale, absxi;
|
||
|
static integer ix;
|
||
|
static doublereal ssq;
|
||
|
|
||
|
|
||
|
/* DNRM2 returns the euclidean norm of a vector via the function
|
||
|
name, so that
|
||
|
|
||
|
DNRM2 := sqrt( x'*x )
|
||
|
|
||
|
|
||
|
|
||
|
-- This version written on 25-October-1982.
|
||
|
Modified on 14-October-1993 to inline the call to DLASSQ.
|
||
|
Sven Hammarling, Nag Ltd.
|
||
|
|
||
|
|
||
|
|
||
|
Parameter adjustments
|
||
|
Function Body */
|
||
|
#ifdef X
|
||
|
#undef X
|
||
|
#endif
|
||
|
#define X(I) x[(I)-1]
|
||
|
|
||
|
|
||
|
if (*n < 1 || *incx < 1) {
|
||
|
norm = 0.;
|
||
|
} else if (*n == 1) {
|
||
|
norm = fabs(X(1));
|
||
|
} else {
|
||
|
scale = 0.;
|
||
|
ssq = 1.;
|
||
|
/* The following loop is equivalent to this call to the LAPACK
|
||
|
|
||
|
auxiliary routine:
|
||
|
CALL DLASSQ( N, X, INCX, SCALE, SSQ ) */
|
||
|
|
||
|
i__1 = (*n - 1) * *incx + 1;
|
||
|
i__2 = *incx;
|
||
|
for (ix = 1; *incx < 0 ? ix >= (*n-1)**incx+1 : ix <= (*n-1)**incx+1; ix += *incx) {
|
||
|
if (X(ix) != 0.) {
|
||
|
absxi = (d__1 = X(ix), fabs(d__1));
|
||
|
if (scale < absxi) {
|
||
|
/* Computing 2nd power */
|
||
|
d__1 = scale / absxi;
|
||
|
ssq = ssq * (d__1 * d__1) + 1.;
|
||
|
scale = absxi;
|
||
|
} else {
|
||
|
/* Computing 2nd power */
|
||
|
d__1 = absxi / scale;
|
||
|
ssq += d__1 * d__1;
|
||
|
}
|
||
|
}
|
||
|
/* L10: */
|
||
|
}
|
||
|
norm = scale * sqrt(ssq);
|
||
|
}
|
||
|
|
||
|
ret_val = norm;
|
||
|
|
||
|
nl_arg_used(i__1);
|
||
|
nl_arg_used(i__2);
|
||
|
|
||
|
return ret_val;
|
||
|
|
||
|
/* End of DNRM2. */
|
||
|
|
||
|
} /* dnrm2_ */
|
||
|
#undef X
|
||
|
|
||
|
/* Subroutine */ static int NL_FORTRAN_WRAP(dcopy)(integer *n, doublereal *dx, integer *incx,
|
||
|
doublereal *dy, integer *incy)
|
||
|
{
|
||
|
|
||
|
/* System generated locals */
|
||
|
integer i__1;
|
||
|
|
||
|
/* Local variables */
|
||
|
static integer i, m, ix, iy, mp1;
|
||
|
|
||
|
|
||
|
/* copies a vector, x, to a vector, y.
|
||
|
uses unrolled loops for increments equal to one.
|
||
|
jack dongarra, linpack, 3/11/78.
|
||
|
modified 12/3/93, array(1) declarations changed to array(*)
|
||
|
|
||
|
|
||
|
|
||
|
Parameter adjustments
|
||
|
Function Body */
|
||
|
#define DY(I) dy[(I)-1]
|
||
|
#define DX(I) dx[(I)-1]
|
||
|
|
||
|
|
||
|
if (*n <= 0) {
|
||
|
return 0;
|
||
|
}
|
||
|
if (*incx == 1 && *incy == 1) {
|
||
|
goto L20;
|
||
|
}
|
||
|
|
||
|
/* code for unequal increments or equal increments
|
||
|
not equal to 1 */
|
||
|
|
||
|
ix = 1;
|
||
|
iy = 1;
|
||
|
if (*incx < 0) {
|
||
|
ix = (-(*n) + 1) * *incx + 1;
|
||
|
}
|
||
|
if (*incy < 0) {
|
||
|
iy = (-(*n) + 1) * *incy + 1;
|
||
|
}
|
||
|
i__1 = *n;
|
||
|
for (i = 1; i <= *n; ++i) {
|
||
|
DY(iy) = DX(ix);
|
||
|
ix += *incx;
|
||
|
iy += *incy;
|
||
|
/* L10: */
|
||
|
}
|
||
|
return 0;
|
||
|
|
||
|
/* code for both increments equal to 1
|
||
|
|
||
|
|
||
|
clean-up loop */
|
||
|
|
||
|
L20:
|
||
|
m = *n % 7;
|
||
|
if (m == 0) {
|
||
|
goto L40;
|
||
|
}
|
||
|
i__1 = m;
|
||
|
for (i = 1; i <= m; ++i) {
|
||
|
DY(i) = DX(i);
|
||
|
/* L30: */
|
||
|
}
|
||
|
if (*n < 7) {
|
||
|
return 0;
|
||
|
}
|
||
|
L40:
|
||
|
mp1 = m + 1;
|
||
|
i__1 = *n;
|
||
|
for (i = mp1; i <= *n; i += 7) {
|
||
|
DY(i) = DX(i);
|
||
|
DY(i + 1) = DX(i + 1);
|
||
|
DY(i + 2) = DX(i + 2);
|
||
|
DY(i + 3) = DX(i + 3);
|
||
|
DY(i + 4) = DX(i + 4);
|
||
|
DY(i + 5) = DX(i + 5);
|
||
|
DY(i + 6) = DX(i + 6);
|
||
|
/* L50: */
|
||
|
}
|
||
|
nl_arg_used(i__1);
|
||
|
return 0;
|
||
|
} /* dcopy_ */
|
||
|
|
||
|
#undef DX
|
||
|
#undef DY
|
||
|
|
||
|
/* Subroutine */ static int NL_FORTRAN_WRAP(dgemv)(const char *trans, integer *m, integer *n, doublereal *
|
||
|
alpha, doublereal *a, integer *lda, doublereal *x, integer *incx,
|
||
|
doublereal *beta, doublereal *y, integer *incy)
|
||
|
{
|
||
|
|
||
|
|
||
|
/* System generated locals */
|
||
|
/* integer a_dim1, a_offset ; */
|
||
|
integer i__1, i__2;
|
||
|
|
||
|
/* Local variables */
|
||
|
static integer info;
|
||
|
static doublereal temp;
|
||
|
static integer lenx, leny, i, j;
|
||
|
/* extern logical lsame_(char *, char *); */
|
||
|
static integer ix, iy, jx, jy, kx, ky;
|
||
|
/* extern int xerbla_(char *, integer *); */
|
||
|
|
||
|
|
||
|
/* Purpose
|
||
|
=======
|
||
|
|
||
|
DGEMV performs one of the matrix-vector operations
|
||
|
|
||
|
y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y,
|
||
|
|
||
|
where alpha and beta are scalars, x and y are vectors and A is an
|
||
|
m by n matrix.
|
||
|
|
||
|
Parameters
|
||
|
==========
|
||
|
|
||
|
TRANS - CHARACTER*1.
|
||
|
On entry, TRANS specifies the operation to be performed as
|
||
|
follows:
|
||
|
|
||
|
TRANS = 'N' or 'n' y := alpha*A*x + beta*y.
|
||
|
|
||
|
TRANS = 'T' or 't' y := alpha*A'*x + beta*y.
|
||
|
|
||
|
TRANS = 'C' or 'c' y := alpha*A'*x + beta*y.
|
||
|
|
||
|
Unchanged on exit.
|
||
|
|
||
|
M - INTEGER.
|
||
|
On entry, M specifies the number of rows of the matrix A.
|
||
|
M must be at least zero.
|
||
|
Unchanged on exit.
|
||
|
|
||
|
N - INTEGER.
|
||
|
On entry, N specifies the number of columns of the matrix A.
|
||
|
|
||
|
N must be at least zero.
|
||
|
Unchanged on exit.
|
||
|
|
||
|
ALPHA - DOUBLE PRECISION.
|
||
|
On entry, ALPHA specifies the scalar alpha.
|
||
|
Unchanged on exit.
|
||
|
|
||
|
A - DOUBLE PRECISION array of DIMENSION ( LDA, n ).
|
||
|
Before entry, the leading m by n part of the array A must
|
||
|
contain the matrix of coefficients.
|
||
|
Unchanged on exit.
|
||
|
|
||
|
LDA - INTEGER.
|
||
|
On entry, LDA specifies the first dimension of A as declared
|
||
|
|
||
|
in the calling (sub) program. LDA must be at least
|
||
|
max( 1, m ).
|
||
|
Unchanged on exit.
|
||
|
|
||
|
X - DOUBLE PRECISION array of DIMENSION at least
|
||
|
( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
|
||
|
and at least
|
||
|
( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
|
||
|
Before entry, the incremented array X must contain the
|
||
|
vector x.
|
||
|
Unchanged on exit.
|
||
|
|
||
|
INCX - INTEGER.
|
||
|
On entry, INCX specifies the increment for the elements of
|
||
|
X. INCX must not be zero.
|
||
|
Unchanged on exit.
|
||
|
|
||
|
BETA - DOUBLE PRECISION.
|
||
|
On entry, BETA specifies the scalar beta. When BETA is
|
||
|
supplied as zero then Y need not be set on input.
|
||
|
Unchanged on exit.
|
||
|
|
||
|
Y - DOUBLE PRECISION array of DIMENSION at least
|
||
|
( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
|
||
|
and at least
|
||
|
( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
|
||
|
Before entry with BETA non-zero, the incremented array Y
|
||
|
must contain the vector y. On exit, Y is overwritten by the
|
||
|
|
||
|
updated vector y.
|
||
|
|
||
|
INCY - INTEGER.
|
||
|
On entry, INCY specifies the increment for the elements of
|
||
|
Y. INCY must not be zero.
|
||
|
Unchanged on exit.
|
||
|
|
||
|
|
||
|
Level 2 Blas routine.
|
||
|
|
||
|
-- Written on 22-October-1986.
|
||
|
Jack Dongarra, Argonne National Lab.
|
||
|
Jeremy Du Croz, Nag Central Office.
|
||
|
Sven Hammarling, Nag Central Office.
|
||
|
Richard Hanson, Sandia National Labs.
|
||
|
|
||
|
|
||
|
|
||
|
Test the input parameters.
|
||
|
|
||
|
|
||
|
Parameter adjustments
|
||
|
Function Body */
|
||
|
#define X(I) x[(I)-1]
|
||
|
#define Y(I) y[(I)-1]
|
||
|
|
||
|
#define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
|
||
|
|
||
|
info = 0;
|
||
|
if (! NL_FORTRAN_WRAP(lsame)(trans, "N") && ! NL_FORTRAN_WRAP(lsame)(trans, "T") && !
|
||
|
NL_FORTRAN_WRAP(lsame)(trans, "C")) {
|
||
|
info = 1;
|
||
|
} else if (*m < 0) {
|
||
|
info = 2;
|
||
|
} else if (*n < 0) {
|
||
|
info = 3;
|
||
|
} else if (*lda < max(1,*m)) {
|
||
|
info = 6;
|
||
|
} else if (*incx == 0) {
|
||
|
info = 8;
|
||
|
} else if (*incy == 0) {
|
||
|
info = 11;
|
||
|
}
|
||
|
if (info != 0) {
|
||
|
NL_FORTRAN_WRAP(xerbla)("DGEMV ", &info);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Quick return if possible. */
|
||
|
|
||
|
if (*m == 0 || *n == 0 || (*alpha == 0. && *beta == 1.)) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Set LENX and LENY, the lengths of the vectors x and y, and set
|
||
|
|
||
|
up the start points in X and Y. */
|
||
|
|
||
|
if (NL_FORTRAN_WRAP(lsame)(trans, "N")) {
|
||
|
lenx = *n;
|
||
|
leny = *m;
|
||
|
} else {
|
||
|
lenx = *m;
|
||
|
leny = *n;
|
||
|
}
|
||
|
if (*incx > 0) {
|
||
|
kx = 1;
|
||
|
} else {
|
||
|
kx = 1 - (lenx - 1) * *incx;
|
||
|
}
|
||
|
if (*incy > 0) {
|
||
|
ky = 1;
|
||
|
} else {
|
||
|
ky = 1 - (leny - 1) * *incy;
|
||
|
}
|
||
|
|
||
|
/* Start the operations. In this version the elements of A are
|
||
|
accessed sequentially with one pass through A.
|
||
|
|
||
|
First form y := beta*y. */
|
||
|
|
||
|
if (*beta != 1.) {
|
||
|
if (*incy == 1) {
|
||
|
if (*beta == 0.) {
|
||
|
i__1 = leny;
|
||
|
for (i = 1; i <= leny; ++i) {
|
||
|
Y(i) = 0.;
|
||
|
/* L10: */
|
||
|
}
|
||
|
} else {
|
||
|
i__1 = leny;
|
||
|
for (i = 1; i <= leny; ++i) {
|
||
|
Y(i) = *beta * Y(i);
|
||
|
/* L20: */
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
iy = ky;
|
||
|
if (*beta == 0.) {
|
||
|
i__1 = leny;
|
||
|
for (i = 1; i <= leny; ++i) {
|
||
|
Y(iy) = 0.;
|
||
|
iy += *incy;
|
||
|
/* L30: */
|
||
|
}
|
||
|
} else {
|
||
|
i__1 = leny;
|
||
|
for (i = 1; i <= leny; ++i) {
|
||
|
Y(iy) = *beta * Y(iy);
|
||
|
iy += *incy;
|
||
|
/* L40: */
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if (*alpha == 0.) {
|
||
|
return 0;
|
||
|
}
|
||
|
if (NL_FORTRAN_WRAP(lsame)(trans, "N")) {
|
||
|
|
||
|
/* Form y := alpha*A*x + y. */
|
||
|
|
||
|
jx = kx;
|
||
|
if (*incy == 1) {
|
||
|
i__1 = *n;
|
||
|
for (j = 1; j <= *n; ++j) {
|
||
|
if (X(jx) != 0.) {
|
||
|
temp = *alpha * X(jx);
|
||
|
i__2 = *m;
|
||
|
for (i = 1; i <= *m; ++i) {
|
||
|
Y(i) += temp * A(i,j);
|
||
|
/* L50: */
|
||
|
}
|
||
|
}
|
||
|
jx += *incx;
|
||
|
/* L60: */
|
||
|
}
|
||
|
} else {
|
||
|
i__1 = *n;
|
||
|
for (j = 1; j <= *n; ++j) {
|
||
|
if (X(jx) != 0.) {
|
||
|
temp = *alpha * X(jx);
|
||
|
iy = ky;
|
||
|
i__2 = *m;
|
||
|
for (i = 1; i <= *m; ++i) {
|
||
|
Y(iy) += temp * A(i,j);
|
||
|
iy += *incy;
|
||
|
/* L70: */
|
||
|
}
|
||
|
}
|
||
|
jx += *incx;
|
||
|
/* L80: */
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
|
||
|
/* Form y := alpha*A'*x + y. */
|
||
|
|
||
|
jy = ky;
|
||
|
if (*incx == 1) {
|
||
|
i__1 = *n;
|
||
|
for (j = 1; j <= *n; ++j) {
|
||
|
temp = 0.;
|
||
|
i__2 = *m;
|
||
|
for (i = 1; i <= *m; ++i) {
|
||
|
temp += A(i,j) * X(i);
|
||
|
/* L90: */
|
||
|
}
|
||
|
Y(jy) += *alpha * temp;
|
||
|
jy += *incy;
|
||
|
/* L100: */
|
||
|
}
|
||
|
} else {
|
||
|
i__1 = *n;
|
||
|
for (j = 1; j <= *n; ++j) {
|
||
|
temp = 0.;
|
||
|
ix = kx;
|
||
|
i__2 = *m;
|
||
|
for (i = 1; i <= *m; ++i) {
|
||
|
temp += A(i,j) * X(ix);
|
||
|
ix += *incx;
|
||
|
/* L110: */
|
||
|
}
|
||
|
Y(jy) += *alpha * temp;
|
||
|
jy += *incy;
|
||
|
/* L120: */
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
nl_arg_used(i__1);
|
||
|
nl_arg_used(i__2);
|
||
|
return 0;
|
||
|
|
||
|
/* End of DGEMV . */
|
||
|
|
||
|
} /* dgemv_ */
|
||
|
|
||
|
#undef X
|
||
|
#undef Y
|
||
|
#undef A
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
#else
|
||
|
|
||
|
extern void NL_FORTRAN_WRAP(daxpy)(
|
||
|
int *n, double *alpha, double *x,
|
||
|
int *incx, double *y, int *incy
|
||
|
) ;
|
||
|
|
||
|
|
||
|
extern double NL_FORTRAN_WRAP(dnrm2)( int *n, double *x, int *incx ) ;
|
||
|
|
||
|
extern int NL_FORTRAN_WRAP(dcopy)(int* n, double* dx, int* incx, double* dy, int* incy) ;
|
||
|
|
||
|
extern void NL_FORTRAN_WRAP(dscal)(int* n, double* alpha, double *x, int* incx) ;
|
||
|
|
||
|
#ifndef NEEDS_DTPSV
|
||
|
extern void NL_FORTRAN_WRAP(dtpsv)(
|
||
|
char *uplo, char *trans, char *diag,
|
||
|
int *n, double *AP, double *x, int *incx
|
||
|
) ;
|
||
|
#endif
|
||
|
|
||
|
extern void NL_FORTRAN_WRAP(dgemv)(
|
||
|
char *trans, int *m, int *n,
|
||
|
double *alpha, double *A, int *ldA,
|
||
|
double *x, int *incx,
|
||
|
double *beta, double *y, int *incy
|
||
|
) ;
|
||
|
|
||
|
#endif
|
||
|
|
||
|
#ifdef NEEDS_DTPSV
|
||
|
|
||
|
/* DECK DTPSV */
|
||
|
/* Subroutine */
|
||
|
static int NL_FORTRAN_WRAP(dtpsv)(
|
||
|
const char* uplo,
|
||
|
const char* trans,
|
||
|
const char* diag,
|
||
|
integer* n,
|
||
|
doublereal* ap,
|
||
|
doublereal* x,
|
||
|
integer* incx
|
||
|
) {
|
||
|
/* System generated locals */
|
||
|
integer i__1, i__2;
|
||
|
|
||
|
/* Local variables */
|
||
|
static integer info;
|
||
|
static doublereal temp;
|
||
|
static integer i__, j, k;
|
||
|
/* extern logical lsame_(); */
|
||
|
static integer kk, ix, jx, kx;
|
||
|
/* extern int xerbla_(); */
|
||
|
static logical nounit;
|
||
|
|
||
|
/* ***BEGIN PROLOGUE DTPSV */
|
||
|
/* ***PURPOSE Solve one of the systems of equations. */
|
||
|
/* ***LIBRARY SLATEC (BLAS) */
|
||
|
/* ***CATEGORY D1B4 */
|
||
|
/* ***TYPE DOUBLE PRECISION (STPSV-S, DTPSV-D, CTPSV-C) */
|
||
|
/* ***KEYWORDS LEVEL 2 BLAS, LINEAR ALGEBRA */
|
||
|
/* ***AUTHOR Dongarra, J. J., (ANL) */
|
||
|
/* Du Croz, J., (NAG) */
|
||
|
/* Hammarling, S., (NAG) */
|
||
|
/* Hanson, R. J., (SNLA) */
|
||
|
/* ***DESCRIPTION */
|
||
|
|
||
|
/* DTPSV solves one of the systems of equations */
|
||
|
|
||
|
/* A*x = b, or A'*x = b, */
|
||
|
|
||
|
/* where b and x are n element vectors and A is an n by n unit, or */
|
||
|
/* non-unit, upper or lower triangular matrix, supplied in packed form. */
|
||
|
|
||
|
/* No test for singularity or near-singularity is included in this */
|
||
|
/* routine. Such tests must be performed before calling this routine. */
|
||
|
|
||
|
/* Parameters */
|
||
|
/* ========== */
|
||
|
|
||
|
/* UPLO - CHARACTER*1. */
|
||
|
/* On entry, UPLO specifies whether the matrix is an upper or */
|
||
|
/* lower triangular matrix as follows: */
|
||
|
|
||
|
/* UPLO = 'U' or 'u' A is an upper triangular matrix. */
|
||
|
|
||
|
/* UPLO = 'L' or 'l' A is a lower triangular matrix. */
|
||
|
|
||
|
/* Unchanged on exit. */
|
||
|
|
||
|
/* TRANS - CHARACTER*1. */
|
||
|
/* On entry, TRANS specifies the equations to be solved as */
|
||
|
/* follows: */
|
||
|
|
||
|
/* TRANS = 'N' or 'n' A*x = b. */
|
||
|
|
||
|
/* TRANS = 'T' or 't' A'*x = b. */
|
||
|
|
||
|
/* TRANS = 'C' or 'c' A'*x = b. */
|
||
|
|
||
|
/* Unchanged on exit. */
|
||
|
|
||
|
/* DIAG - CHARACTER*1. */
|
||
|
/* On entry, DIAG specifies whether or not A is unit */
|
||
|
/* triangular as follows: */
|
||
|
|
||
|
/* DIAG = 'U' or 'u' A is assumed to be unit triangular. */
|
||
|
|
||
|
/* DIAG = 'N' or 'n' A is not assumed to be unit */
|
||
|
/* triangular. */
|
||
|
|
||
|
/* Unchanged on exit. */
|
||
|
|
||
|
/* N - INTEGER. */
|
||
|
/* On entry, N specifies the order of the matrix A. */
|
||
|
/* N must be at least zero. */
|
||
|
/* Unchanged on exit. */
|
||
|
|
||
|
/* AP - DOUBLE PRECISION array of DIMENSION at least */
|
||
|
/* ( ( n*( n + 1))/2). */
|
||
|
/* Before entry with UPLO = 'U' or 'u', the array AP must */
|
||
|
/* contain the upper triangular matrix packed sequentially, */
|
||
|
/* column by column, so that AP( 1 ) contains a( 1, 1 ), */
|
||
|
/* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) */
|
||
|
/* respectively, and so on. */
|
||
|
/* Before entry with UPLO = 'L' or 'l', the array AP must */
|
||
|
/* contain the lower triangular matrix packed sequentially, */
|
||
|
/* column by column, so that AP( 1 ) contains a( 1, 1 ), */
|
||
|
/* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) */
|
||
|
/* respectively, and so on. */
|
||
|
/* Note that when DIAG = 'U' or 'u', the diagonal elements of */
|
||
|
/* A are not referenced, but are assumed to be unity. */
|
||
|
/* Unchanged on exit. */
|
||
|
|
||
|
/* X - DOUBLE PRECISION array of dimension at least */
|
||
|
/* ( 1 + ( n - 1 )*abs( INCX ) ). */
|
||
|
/* Before entry, the incremented array X must contain the n */
|
||
|
/* element right-hand side vector b. On exit, X is overwritten */
|
||
|
/* with the solution vector x. */
|
||
|
|
||
|
/* INCX - INTEGER. */
|
||
|
/* On entry, INCX specifies the increment for the elements of */
|
||
|
/* X. INCX must not be zero. */
|
||
|
/* Unchanged on exit. */
|
||
|
|
||
|
/* ***REFERENCES Dongarra, J. J., Du Croz, J., Hammarling, S., and */
|
||
|
/* Hanson, R. J. An extended set of Fortran basic linear */
|
||
|
/* algebra subprograms. ACM TOMS, Vol. 14, No. 1, */
|
||
|
/* pp. 1-17, March 1988. */
|
||
|
/* ***ROUTINES CALLED LSAME, XERBLA */
|
||
|
/* ***REVISION HISTORY (YYMMDD) */
|
||
|
/* 861022 DATE WRITTEN */
|
||
|
/* 910605 Modified to meet SLATEC prologue standards. Only comment */
|
||
|
/* lines were modified. (BKS) */
|
||
|
/* ***END PROLOGUE DTPSV */
|
||
|
/* .. Scalar Arguments .. */
|
||
|
/* .. Array Arguments .. */
|
||
|
/* .. Parameters .. */
|
||
|
/* .. Local Scalars .. */
|
||
|
/* .. External Functions .. */
|
||
|
/* .. External Subroutines .. */
|
||
|
/* ***FIRST EXECUTABLE STATEMENT DTPSV */
|
||
|
|
||
|
/* Test the input parameters. */
|
||
|
|
||
|
/* Parameter adjustments */
|
||
|
--x;
|
||
|
--ap;
|
||
|
|
||
|
/* Function Body */
|
||
|
info = 0;
|
||
|
if (!NL_FORTRAN_WRAP(lsame)(uplo, "U") &&
|
||
|
!NL_FORTRAN_WRAP(lsame)(uplo, "L")
|
||
|
) {
|
||
|
info = 1;
|
||
|
} else if (
|
||
|
!NL_FORTRAN_WRAP(lsame)(trans, "N") &&
|
||
|
!NL_FORTRAN_WRAP(lsame)(trans, "T") &&
|
||
|
!NL_FORTRAN_WRAP(lsame)(trans, "C")
|
||
|
) {
|
||
|
info = 2;
|
||
|
} else if (
|
||
|
!NL_FORTRAN_WRAP(lsame)(diag, "U") &&
|
||
|
!NL_FORTRAN_WRAP(lsame)(diag, "N")
|
||
|
) {
|
||
|
info = 3;
|
||
|
} else if (*n < 0) {
|
||
|
info = 4;
|
||
|
} else if (*incx == 0) {
|
||
|
info = 7;
|
||
|
}
|
||
|
if (info != 0) {
|
||
|
NL_FORTRAN_WRAP(xerbla)("DTPSV ", &info);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
/* Quick return if possible. */
|
||
|
|
||
|
if (*n == 0) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
nounit = (logical)(NL_FORTRAN_WRAP(lsame)(diag, "N"));
|
||
|
|
||
|
/* Set up the start point in X if the increment is not unity. This */
|
||
|
/* will be ( N - 1 )*INCX too small for descending loops. */
|
||
|
|
||
|
if (*incx <= 0) {
|
||
|
kx = 1 - (*n - 1) * *incx;
|
||
|
} else if (*incx != 1) {
|
||
|
kx = 1;
|
||
|
}
|
||
|
|
||
|
/* Start the operations. In this version the elements of AP are */
|
||
|
/* accessed sequentially with one pass through AP. */
|
||
|
|
||
|
if (NL_FORTRAN_WRAP(lsame)(trans, "N")) {
|
||
|
|
||
|
/* Form x := inv( A )*x. */
|
||
|
|
||
|
if (NL_FORTRAN_WRAP(lsame)(uplo, "U")) {
|
||
|
kk = *n * (*n + 1) / 2;
|
||
|
if (*incx == 1) {
|
||
|
for (j = *n; j >= 1; --j) {
|
||
|
if (x[j] != 0.) {
|
||
|
if (nounit) {
|
||
|
x[j] /= ap[kk];
|
||
|
}
|
||
|
temp = x[j];
|
||
|
k = kk - 1;
|
||
|
for (i__ = j - 1; i__ >= 1; --i__) {
|
||
|
x[i__] -= temp * ap[k];
|
||
|
--k;
|
||
|
/* L10: */
|
||
|
}
|
||
|
}
|
||
|
kk -= j;
|
||
|
/* L20: */
|
||
|
}
|
||
|
} else {
|
||
|
jx = kx + (*n - 1) * *incx;
|
||
|
for (j = *n; j >= 1; --j) {
|
||
|
if (x[jx] != 0.) {
|
||
|
if (nounit) {
|
||
|
x[jx] /= ap[kk];
|
||
|
}
|
||
|
temp = x[jx];
|
||
|
ix = jx;
|
||
|
i__1 = kk - j + 1;
|
||
|
for (k = kk - 1; k >= i__1; --k) {
|
||
|
ix -= *incx;
|
||
|
x[ix] -= temp * ap[k];
|
||
|
/* L30: */
|
||
|
}
|
||
|
}
|
||
|
jx -= *incx;
|
||
|
kk -= j;
|
||
|
/* L40: */
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
kk = 1;
|
||
|
if (*incx == 1) {
|
||
|
i__1 = *n;
|
||
|
for (j = 1; j <= i__1; ++j) {
|
||
|
if (x[j] != 0.) {
|
||
|
if (nounit) {
|
||
|
x[j] /= ap[kk];
|
||
|
}
|
||
|
temp = x[j];
|
||
|
k = kk + 1;
|
||
|
i__2 = *n;
|
||
|
for (i__ = j + 1; i__ <= i__2; ++i__) {
|
||
|
x[i__] -= temp * ap[k];
|
||
|
++k;
|
||
|
/* L50: */
|
||
|
}
|
||
|
}
|
||
|
kk += *n - j + 1;
|
||
|
/* L60: */
|
||
|
}
|
||
|
} else {
|
||
|
jx = kx;
|
||
|
i__1 = *n;
|
||
|
for (j = 1; j <= i__1; ++j) {
|
||
|
if (x[jx] != 0.) {
|
||
|
if (nounit) {
|
||
|
x[jx] /= ap[kk];
|
||
|
}
|
||
|
temp = x[jx];
|
||
|
ix = jx;
|
||
|
i__2 = kk + *n - j;
|
||
|
for (k = kk + 1; k <= i__2; ++k) {
|
||
|
ix += *incx;
|
||
|
x[ix] -= temp * ap[k];
|
||
|
/* L70: */
|
||
|
}
|
||
|
}
|
||
|
jx += *incx;
|
||
|
kk += *n - j + 1;
|
||
|
/* L80: */
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
|
||
|
/* Form x := inv( A' )*x. */
|
||
|
|
||
|
if (NL_FORTRAN_WRAP(lsame)(uplo, "U")) {
|
||
|
kk = 1;
|
||
|
if (*incx == 1) {
|
||
|
i__1 = *n;
|
||
|
for (j = 1; j <= i__1; ++j) {
|
||
|
temp = x[j];
|
||
|
k = kk;
|
||
|
i__2 = j - 1;
|
||
|
for (i__ = 1; i__ <= i__2; ++i__) {
|
||
|
temp -= ap[k] * x[i__];
|
||
|
++k;
|
||
|
/* L90: */
|
||
|
}
|
||
|
if (nounit) {
|
||
|
temp /= ap[kk + j - 1];
|
||
|
}
|
||
|
x[j] = temp;
|
||
|
kk += j;
|
||
|
/* L100: */
|
||
|
}
|
||
|
} else {
|
||
|
jx = kx;
|
||
|
i__1 = *n;
|
||
|
for (j = 1; j <= i__1; ++j) {
|
||
|
temp = x[jx];
|
||
|
ix = kx;
|
||
|
i__2 = kk + j - 2;
|
||
|
for (k = kk; k <= i__2; ++k) {
|
||
|
temp -= ap[k] * x[ix];
|
||
|
ix += *incx;
|
||
|
/* L110: */
|
||
|
}
|
||
|
if (nounit) {
|
||
|
temp /= ap[kk + j - 1];
|
||
|
}
|
||
|
x[jx] = temp;
|
||
|
jx += *incx;
|
||
|
kk += j;
|
||
|
/* L120: */
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
kk = *n * (*n + 1) / 2;
|
||
|
if (*incx == 1) {
|
||
|
for (j = *n; j >= 1; --j) {
|
||
|
temp = x[j];
|
||
|
k = kk;
|
||
|
i__1 = j + 1;
|
||
|
for (i__ = *n; i__ >= i__1; --i__) {
|
||
|
temp -= ap[k] * x[i__];
|
||
|
--k;
|
||
|
/* L130: */
|
||
|
}
|
||
|
if (nounit) {
|
||
|
temp /= ap[kk - *n + j];
|
||
|
}
|
||
|
x[j] = temp;
|
||
|
kk -= *n - j + 1;
|
||
|
/* L140: */
|
||
|
}
|
||
|
} else {
|
||
|
kx += (*n - 1) * *incx;
|
||
|
jx = kx;
|
||
|
for (j = *n; j >= 1; --j) {
|
||
|
temp = x[jx];
|
||
|
ix = kx;
|
||
|
i__1 = kk - (*n - (j + 1));
|
||
|
for (k = kk; k >= i__1; --k) {
|
||
|
temp -= ap[k] * x[ix];
|
||
|
ix -= *incx;
|
||
|
/* L150: */
|
||
|
}
|
||
|
if (nounit) {
|
||
|
temp /= ap[kk - *n + j];
|
||
|
}
|
||
|
x[jx] = temp;
|
||
|
jx -= *incx;
|
||
|
kk -= *n - j + 1;
|
||
|
/* L160: */
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return 0;
|
||
|
|
||
|
/* End of DTPSV . */
|
||
|
|
||
|
} /* dtpsv_ */
|
||
|
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/* End of BLAS routines */
|
||
|
|
||
|
|
||
|
|
||
|
/* Abstract BLAS interface */
|
||
|
|
||
|
|
||
|
void nlBlasResetStats(NLBlas_t blas) {
|
||
|
blas->start_time = nlCurrentTime();
|
||
|
blas->flops = 0;
|
||
|
blas->used_ram[0] = 0;
|
||
|
blas->used_ram[1] = 0;
|
||
|
blas->max_used_ram[0] = 0;
|
||
|
blas->max_used_ram[1] = 0;
|
||
|
blas->sq_rnorm = 0.0;
|
||
|
blas->sq_bnorm = 0.0;
|
||
|
}
|
||
|
|
||
|
double nlBlasGFlops(NLBlas_t blas) {
|
||
|
double now = nlCurrentTime();
|
||
|
double elapsed_time = now - blas->start_time;
|
||
|
return (NLdouble)(blas->flops) / (elapsed_time * 1e9);
|
||
|
}
|
||
|
|
||
|
NLulong nlBlasUsedRam(NLBlas_t blas, NLmemoryType type) {
|
||
|
return blas->used_ram[type];
|
||
|
}
|
||
|
|
||
|
NLulong nlBlasMaxUsedRam(NLBlas_t blas, NLmemoryType type) {
|
||
|
return blas->max_used_ram[type];
|
||
|
}
|
||
|
|
||
|
NLboolean nlBlasHasUnifiedMemory(NLBlas_t blas) {
|
||
|
return blas->has_unified_memory;
|
||
|
}
|
||
|
|
||
|
static void* host_blas_malloc(
|
||
|
NLBlas_t blas, NLmemoryType type, size_t size
|
||
|
) {
|
||
|
nl_arg_used(type);
|
||
|
blas->used_ram[type] += (NLulong)size;
|
||
|
blas->max_used_ram[type] = MAX(
|
||
|
blas->max_used_ram[type],blas->used_ram[type]
|
||
|
);
|
||
|
return malloc(size);
|
||
|
}
|
||
|
|
||
|
static void host_blas_free(
|
||
|
NLBlas_t blas, NLmemoryType type, size_t size, void* ptr
|
||
|
) {
|
||
|
nl_arg_used(type);
|
||
|
blas->used_ram[type] -= (NLulong)size;
|
||
|
free(ptr);
|
||
|
}
|
||
|
|
||
|
static void host_blas_memcpy(
|
||
|
NLBlas_t blas,
|
||
|
void* to, NLmemoryType to_type,
|
||
|
void* from, NLmemoryType from_type,
|
||
|
size_t size
|
||
|
) {
|
||
|
nl_arg_used(blas);
|
||
|
nl_arg_used(to_type);
|
||
|
nl_arg_used(from_type);
|
||
|
memcpy(to,from,size);
|
||
|
}
|
||
|
|
||
|
static void host_blas_dcopy(
|
||
|
NLBlas_t blas, int n, const double *x, int incx, double *y, int incy
|
||
|
) {
|
||
|
nl_arg_used(blas);
|
||
|
NL_FORTRAN_WRAP(dcopy)(&n,(double*)x,&incx,y,&incy);
|
||
|
}
|
||
|
|
||
|
static double host_blas_ddot(
|
||
|
NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy
|
||
|
) {
|
||
|
blas->flops += (NLulong)(2*n);
|
||
|
return NL_FORTRAN_WRAP(ddot)(&n,(double*)x,&incx,(double*)y,&incy);
|
||
|
}
|
||
|
|
||
|
static double host_blas_dnrm2(
|
||
|
NLBlas_t blas, int n, const double *x, int incx
|
||
|
) {
|
||
|
blas->flops += (NLulong)(2*n);
|
||
|
return NL_FORTRAN_WRAP(dnrm2)(&n,(double*)x,&incx);
|
||
|
}
|
||
|
|
||
|
static void host_blas_daxpy(
|
||
|
NLBlas_t blas, int n, double a, const double *x, int incx, double *y, int incy
|
||
|
) {
|
||
|
blas->flops += (NLulong)(2*n);
|
||
|
NL_FORTRAN_WRAP(daxpy)(&n,&a,(double*)x,&incx,y,&incy);
|
||
|
}
|
||
|
|
||
|
static void host_blas_dscal(
|
||
|
NLBlas_t blas, int n, double a, double *x, int incx
|
||
|
) {
|
||
|
blas->flops += (NLulong)n;
|
||
|
NL_FORTRAN_WRAP(dscal)(&n,&a,x,&incx);
|
||
|
}
|
||
|
|
||
|
static void host_blas_dgemv(
|
||
|
NLBlas_t blas, MatrixTranspose trans, int m, int n, double alpha,
|
||
|
const double *A, int ldA, const double *x, int incx,
|
||
|
double beta, double *y, int incy
|
||
|
) {
|
||
|
static const char *T[3] = { "N", "T", 0 };
|
||
|
nl_arg_used(blas);
|
||
|
NL_FORTRAN_WRAP(dgemv)(
|
||
|
T[(int)trans],&m,&n,&alpha,(double*)A,&ldA,
|
||
|
(double*)x,&incx,&beta,y,&incy
|
||
|
);
|
||
|
/* TODO: update flops */
|
||
|
}
|
||
|
|
||
|
static void host_blas_dtpsv(
|
||
|
NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans,
|
||
|
MatrixUnitTriangular diag, int n, const double *AP,
|
||
|
double *x, int incx
|
||
|
) {
|
||
|
static const char *UL[2] = { "U", "L" };
|
||
|
static const char *T[3] = { "N", "T", 0 };
|
||
|
static const char *D[2] = { "U", "N" };
|
||
|
nl_arg_used(blas);
|
||
|
NL_FORTRAN_WRAP(dtpsv)(
|
||
|
UL[(int)uplo],T[(int)trans],D[(int)diag],&n,(double*)AP,x,&incx
|
||
|
);
|
||
|
/* TODO: update flops */
|
||
|
}
|
||
|
|
||
|
NLBlas_t nlHostBlas() {
|
||
|
static NLboolean initialized = NL_FALSE;
|
||
|
static struct NLBlas blas;
|
||
|
if(!initialized) {
|
||
|
memset(&blas, 0, sizeof(blas));
|
||
|
blas.has_unified_memory = NL_TRUE;
|
||
|
blas.Malloc = host_blas_malloc;
|
||
|
blas.Free = host_blas_free;
|
||
|
blas.Memcpy = host_blas_memcpy;
|
||
|
blas.Dcopy = host_blas_dcopy;
|
||
|
blas.Ddot = host_blas_ddot;
|
||
|
blas.Dnrm2 = host_blas_dnrm2;
|
||
|
blas.Daxpy = host_blas_daxpy;
|
||
|
blas.Dscal = host_blas_dscal;
|
||
|
blas.Dgemv = host_blas_dgemv;
|
||
|
blas.Dtpsv = host_blas_dtpsv;
|
||
|
nlBlasResetStats(&blas);
|
||
|
initialized = NL_TRUE;
|
||
|
}
|
||
|
return &blas;
|
||
|
}
|
||
|
|
||
|
|
||
|
/******* extracted from nl_iterative_solvers.c *******/
|
||
|
|
||
|
|
||
|
|
||
|
/* Solvers */
|
||
|
|
||
|
/*
|
||
|
* The implementation of the solvers is inspired by
|
||
|
* the lsolver library, by Christian Badura, available from:
|
||
|
* http://www.mathematik.uni-freiburg.de
|
||
|
* /IAM/Research/projectskr/lin_solver/
|
||
|
*
|
||
|
* About the Conjugate Gradient, details can be found in:
|
||
|
* Ashby, Manteuffel, Saylor
|
||
|
* A taxononmy for conjugate gradient methods
|
||
|
* SIAM J Numer Anal 27, 1542-1568 (1990)
|
||
|
*
|
||
|
* This version is completely abstract, the same code can be used for
|
||
|
* CPU/GPU, dense matrix / sparse matrix etc...
|
||
|
* Abstraction is realized through:
|
||
|
* - Abstract blas interface (NLBlas_t), that can implement BLAS
|
||
|
* operations on the CPU or on the GPU.
|
||
|
* - Abstract matrix interface (NLMatrix), that can implement different
|
||
|
* versions of matrix x vector product (CPU/GPU, sparse/dense ...)
|
||
|
*/
|
||
|
|
||
|
|
||
|
|
||
|
static NLuint nlSolveSystem_CG(
|
||
|
NLBlas_t blas,
|
||
|
NLMatrix M, NLdouble* b, NLdouble* x,
|
||
|
double eps, NLuint max_iter
|
||
|
) {
|
||
|
NLint N = (NLint)M->m;
|
||
|
|
||
|
NLdouble *g = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *p = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLuint its=0;
|
||
|
NLdouble t, tau, sig, rho, gam;
|
||
|
NLdouble b_square=blas->Ddot(blas,N,b,1,b,1);
|
||
|
NLdouble err=eps*eps*b_square;
|
||
|
NLdouble curr_err;
|
||
|
|
||
|
nlMultMatrixVector(M,x,g);
|
||
|
blas->Daxpy(blas,N,-1.,b,1,g,1);
|
||
|
blas->Dscal(blas,N,-1.,g,1);
|
||
|
blas->Dcopy(blas,N,g,1,r,1);
|
||
|
curr_err = blas->Ddot(blas,N,g,1,g,1);
|
||
|
while ( curr_err >err && its < max_iter) {
|
||
|
if(nlCurrentContext != NULL) {
|
||
|
if(nlCurrentContext->progress_func != NULL) {
|
||
|
nlCurrentContext->progress_func(its, max_iter, curr_err, err);
|
||
|
}
|
||
|
if(nlCurrentContext->verbose && !(its % 100)) {
|
||
|
nl_printf ( "%d : %.10e -- %.10e\n", its, curr_err, err );
|
||
|
}
|
||
|
}
|
||
|
nlMultMatrixVector(M,r,p);
|
||
|
rho=blas->Ddot(blas,N,p,1,p,1);
|
||
|
sig=blas->Ddot(blas,N,r,1,p,1);
|
||
|
tau=blas->Ddot(blas,N,g,1,r,1);
|
||
|
t=tau/sig;
|
||
|
blas->Daxpy(blas,N,t,r,1,x,1);
|
||
|
blas->Daxpy(blas,N,-t,p,1,g,1);
|
||
|
gam=(t*t*rho-tau)/tau;
|
||
|
blas->Dscal(blas,N,gam,r,1);
|
||
|
blas->Daxpy(blas,N,1.,g,1,r,1);
|
||
|
++its;
|
||
|
curr_err = blas->Ddot(blas,N,g,1,g,1);
|
||
|
}
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, g);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, p);
|
||
|
blas->sq_bnorm = b_square;
|
||
|
blas->sq_rnorm = curr_err;
|
||
|
return its;
|
||
|
}
|
||
|
|
||
|
static NLuint nlSolveSystem_PRE_CG(
|
||
|
NLBlas_t blas,
|
||
|
NLMatrix M, NLMatrix P, NLdouble* b, NLdouble* x,
|
||
|
double eps, NLuint max_iter
|
||
|
) {
|
||
|
NLint N = (NLint)M->n;
|
||
|
NLdouble* r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble* d = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble* h = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *Ad = h;
|
||
|
NLuint its=0;
|
||
|
NLdouble rh, alpha, beta;
|
||
|
NLdouble b_square = blas->Ddot(blas,N,b,1,b,1);
|
||
|
NLdouble err=eps*eps*b_square;
|
||
|
NLdouble curr_err;
|
||
|
|
||
|
nlMultMatrixVector(M,x,r);
|
||
|
blas->Daxpy(blas,N,-1.,b,1,r,1);
|
||
|
nlMultMatrixVector(P,r,d);
|
||
|
blas->Dcopy(blas,N,d,1,h,1);
|
||
|
rh=blas->Ddot(blas,N,r,1,h,1);
|
||
|
curr_err = blas->Ddot(blas,N,r,1,r,1);
|
||
|
|
||
|
while ( curr_err >err && its < max_iter) {
|
||
|
if(nlCurrentContext != NULL) {
|
||
|
if(nlCurrentContext->progress_func != NULL) {
|
||
|
nlCurrentContext->progress_func(its, max_iter, curr_err, err);
|
||
|
}
|
||
|
if( nlCurrentContext->verbose && !(its % 100)) {
|
||
|
nl_printf ( "%d : %.10e -- %.10e\n", its, curr_err, err );
|
||
|
}
|
||
|
}
|
||
|
nlMultMatrixVector(M,d,Ad);
|
||
|
alpha=rh/blas->Ddot(blas,N,d,1,Ad,1);
|
||
|
blas->Daxpy(blas,N,-alpha,d,1,x,1);
|
||
|
blas->Daxpy(blas,N,-alpha,Ad,1,r,1);
|
||
|
nlMultMatrixVector(P,r,h);
|
||
|
beta=1./rh;
|
||
|
rh=blas->Ddot(blas,N,r,1,h,1);
|
||
|
beta*=rh;
|
||
|
blas->Dscal(blas,N,beta,d,1);
|
||
|
blas->Daxpy(blas,N,1.,h,1,d,1);
|
||
|
++its;
|
||
|
curr_err = blas->Ddot(blas,N,r,1,r,1);
|
||
|
}
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, d);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, h);
|
||
|
blas->sq_bnorm = b_square;
|
||
|
blas->sq_rnorm = curr_err;
|
||
|
return its;
|
||
|
}
|
||
|
|
||
|
static NLuint nlSolveSystem_BICGSTAB(
|
||
|
NLBlas_t blas,
|
||
|
NLMatrix M, NLdouble* b, NLdouble* x,
|
||
|
double eps, NLuint max_iter
|
||
|
) {
|
||
|
NLint N = (NLint)M->n;
|
||
|
NLdouble *rT = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *d = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *h = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *u = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *Ad = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *t = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *s = h;
|
||
|
NLdouble rTh, rTAd, rTr, alpha, beta, omega, st, tt;
|
||
|
NLuint its=0;
|
||
|
NLdouble b_square = blas->Ddot(blas,N,b,1,b,1);
|
||
|
NLdouble err=eps*eps*b_square;
|
||
|
NLdouble *r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
nlMultMatrixVector(M,x,r);
|
||
|
blas->Daxpy(blas,N,-1.,b,1,r,1);
|
||
|
blas->Dcopy(blas,N,r,1,d,1);
|
||
|
blas->Dcopy(blas,N,d,1,h,1);
|
||
|
blas->Dcopy(blas,N,h,1,rT,1);
|
||
|
nl_assert( blas->Ddot(blas,N,rT,1,rT,1)>1e-40 );
|
||
|
rTh=blas->Ddot(blas,N,rT,1,h,1);
|
||
|
rTr=blas->Ddot(blas,N,r,1,r,1);
|
||
|
|
||
|
while ( rTr>err && its < max_iter) {
|
||
|
if(nlCurrentContext != NULL) {
|
||
|
if(nlCurrentContext->progress_func != NULL) {
|
||
|
nlCurrentContext->progress_func(its, max_iter, rTr, err);
|
||
|
}
|
||
|
if( (nlCurrentContext->verbose) && !(its % 100)) {
|
||
|
nl_printf ( "%d : %.10e -- %.10e\n", its, rTr, err );
|
||
|
}
|
||
|
}
|
||
|
nlMultMatrixVector(M,d,Ad);
|
||
|
rTAd=blas->Ddot(blas,N,rT,1,Ad,1);
|
||
|
nl_assert( fabs(rTAd)>1e-40 );
|
||
|
alpha=rTh/rTAd;
|
||
|
blas->Daxpy(blas,N,-alpha,Ad,1,r,1);
|
||
|
blas->Dcopy(blas,N,h,1,s,1);
|
||
|
blas->Daxpy(blas,N,-alpha,Ad,1,s,1);
|
||
|
nlMultMatrixVector(M,s,t);
|
||
|
blas->Daxpy(blas,N,1.,t,1,u,1);
|
||
|
blas->Dscal(blas,N,alpha,u,1);
|
||
|
st=blas->Ddot(blas,N,s,1,t,1);
|
||
|
tt=blas->Ddot(blas,N,t,1,t,1);
|
||
|
if ( fabs(st)<1e-40 || fabs(tt)<1e-40 ) {
|
||
|
omega = 0.;
|
||
|
} else {
|
||
|
omega = st/tt;
|
||
|
}
|
||
|
blas->Daxpy(blas,N,-omega,t,1,r,1);
|
||
|
blas->Daxpy(blas,N,-alpha,d,1,x,1);
|
||
|
blas->Daxpy(blas,N,-omega,s,1,x,1);
|
||
|
blas->Dcopy(blas,N,s,1,h,1);
|
||
|
blas->Daxpy(blas,N,-omega,t,1,h,1);
|
||
|
beta=(alpha/omega)/rTh;
|
||
|
rTh=blas->Ddot(blas,N,rT,1,h,1);
|
||
|
beta*=rTh;
|
||
|
blas->Dscal(blas,N,beta,d,1);
|
||
|
blas->Daxpy(blas,N,1.,h,1,d,1);
|
||
|
blas->Daxpy(blas,N,-beta*omega,Ad,1,d,1);
|
||
|
rTr=blas->Ddot(blas,N,r,1,r,1);
|
||
|
++its;
|
||
|
}
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, rT);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, d);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, h);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, u);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, Ad);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, t);
|
||
|
blas->sq_bnorm = b_square;
|
||
|
blas->sq_rnorm = rTr;
|
||
|
return its;
|
||
|
}
|
||
|
|
||
|
static NLuint nlSolveSystem_PRE_BICGSTAB(
|
||
|
NLBlas_t blas,
|
||
|
NLMatrix M, NLMatrix P, NLdouble* b, NLdouble* x,
|
||
|
double eps, NLuint max_iter
|
||
|
) {
|
||
|
NLint N = (NLint)M->n;
|
||
|
NLdouble *rT = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *d = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *h = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *u = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *Sd = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *t = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *aux = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
NLdouble *s = h;
|
||
|
NLdouble rTh, rTSd, rTr, alpha, beta, omega, st, tt;
|
||
|
NLuint its=0;
|
||
|
NLdouble b_square = blas->Ddot(blas,N,b,1,b,1);
|
||
|
NLdouble err = eps*eps*b_square;
|
||
|
NLdouble *r = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, N);
|
||
|
|
||
|
nlMultMatrixVector(M,x,r);
|
||
|
blas->Daxpy(blas,N,-1.,b,1,r,1);
|
||
|
nlMultMatrixVector(P,r,d);
|
||
|
blas->Dcopy(blas,N,d,1,h,1);
|
||
|
blas->Dcopy(blas,N,h,1,rT,1);
|
||
|
nl_assert( blas->Ddot(blas,N,rT,1,rT,1)>1e-40 );
|
||
|
rTh=blas->Ddot(blas,N,rT,1,h,1);
|
||
|
rTr=blas->Ddot(blas,N,r,1,r,1);
|
||
|
|
||
|
while ( rTr>err && its < max_iter) {
|
||
|
if(nlCurrentContext != NULL) {
|
||
|
if(nlCurrentContext->progress_func != NULL) {
|
||
|
nlCurrentContext->progress_func(its, max_iter, rTr, err);
|
||
|
}
|
||
|
if( (nlCurrentContext->verbose) && !(its % 100)) {
|
||
|
nl_printf ( "%d : %.10e -- %.10e\n", its, rTr, err );
|
||
|
}
|
||
|
}
|
||
|
nlMultMatrixVector(M,d,aux);
|
||
|
nlMultMatrixVector(P,aux,Sd);
|
||
|
rTSd=blas->Ddot(blas,N,rT,1,Sd,1);
|
||
|
nl_assert( fabs(rTSd)>1e-40 );
|
||
|
alpha=rTh/rTSd;
|
||
|
blas->Daxpy(blas,N,-alpha,aux,1,r,1);
|
||
|
blas->Dcopy(blas,N,h,1,s,1);
|
||
|
blas->Daxpy(blas,N,-alpha,Sd,1,s,1);
|
||
|
nlMultMatrixVector(M,s,aux);
|
||
|
nlMultMatrixVector(P,aux,t);
|
||
|
blas->Daxpy(blas,N,1.,t,1,u,1);
|
||
|
blas->Dscal(blas,N,alpha,u,1);
|
||
|
st=blas->Ddot(blas,N,s,1,t,1);
|
||
|
tt=blas->Ddot(blas,N,t,1,t,1);
|
||
|
if ( fabs(st)<1e-40 || fabs(tt)<1e-40 ) {
|
||
|
omega = 0.;
|
||
|
} else {
|
||
|
omega = st/tt;
|
||
|
}
|
||
|
blas->Daxpy(blas,N,-omega,aux,1,r,1);
|
||
|
blas->Daxpy(blas,N,-alpha,d,1,x,1);
|
||
|
blas->Daxpy(blas,N,-omega,s,1,x,1);
|
||
|
blas->Dcopy(blas,N,s,1,h,1);
|
||
|
blas->Daxpy(blas,N,-omega,t,1,h,1);
|
||
|
beta=(alpha/omega)/rTh;
|
||
|
rTh=blas->Ddot(blas,N,rT,1,h,1);
|
||
|
beta*=rTh;
|
||
|
blas->Dscal(blas,N,beta,d,1);
|
||
|
blas->Daxpy(blas,N,1.,h,1,d,1);
|
||
|
blas->Daxpy(blas,N,-beta*omega,Sd,1,d,1);
|
||
|
rTr=blas->Ddot(blas,N,r,1,r,1);
|
||
|
++its;
|
||
|
}
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, r);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, rT);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, d);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, h);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, u);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, Sd);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, t);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, N, aux);
|
||
|
blas->sq_bnorm = b_square;
|
||
|
blas->sq_rnorm = rTr;
|
||
|
return its;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Note: this one cannot be executed on device (GPU)
|
||
|
* because it directly manipulates the vectors.
|
||
|
*/
|
||
|
static NLuint nlSolveSystem_GMRES(
|
||
|
NLBlas_t blas,
|
||
|
NLMatrix M, NLdouble* b, NLdouble* x,
|
||
|
double eps, NLuint max_iter, NLuint inner_iter
|
||
|
) {
|
||
|
NLint n = (NLint)M->n;
|
||
|
NLint m = (NLint)inner_iter;
|
||
|
typedef NLdouble *NLdoubleP;
|
||
|
NLdouble *V = NL_NEW_ARRAY(NLdouble, n*(m+1) );
|
||
|
NLdouble *U = NL_NEW_ARRAY(NLdouble, m*(m+1)/2 );
|
||
|
NLdouble *r = NL_NEW_ARRAY(NLdouble, n );
|
||
|
NLdouble *y = NL_NEW_ARRAY(NLdouble, m+1 );
|
||
|
NLdouble *c = NL_NEW_ARRAY(NLdouble, m );
|
||
|
NLdouble *s = NL_NEW_ARRAY(NLdouble, m );
|
||
|
NLdouble **v = NL_NEW_ARRAY(NLdoubleP, m+1 );
|
||
|
NLint i, j, io, uij, u0j;
|
||
|
NLint its = -1;
|
||
|
NLdouble beta, h, rd, dd, nrm2b;
|
||
|
|
||
|
/*
|
||
|
* The way it is written, this routine will not
|
||
|
* work on the GPU since it directly modifies the
|
||
|
* vectors.
|
||
|
*/
|
||
|
nl_assert(nlBlasHasUnifiedMemory(blas));
|
||
|
|
||
|
for ( i=0; i<=m; ++i ){
|
||
|
v[i]=V+i*n;
|
||
|
}
|
||
|
|
||
|
nrm2b=blas->Dnrm2(blas,n,b,1);
|
||
|
io=0;
|
||
|
|
||
|
do { /* outer loop */
|
||
|
++io;
|
||
|
nlMultMatrixVector(M,x,r);
|
||
|
blas->Daxpy(blas,n,-1.,b,1,r,1);
|
||
|
beta=blas->Dnrm2(blas,n,r,1);
|
||
|
blas->Dcopy(blas,n,r,1,v[0],1);
|
||
|
blas->Dscal(blas,n,1./beta,v[0],1);
|
||
|
|
||
|
y[0]=beta;
|
||
|
j=0;
|
||
|
uij=0;
|
||
|
do { /* inner loop: j=0,...,m-1 */
|
||
|
u0j=uij;
|
||
|
nlMultMatrixVector(M,v[j],v[j+1]);
|
||
|
blas->Dgemv(
|
||
|
blas,Transpose,n,j+1,1.,V,n,v[j+1],1,0.,U+u0j,1
|
||
|
);
|
||
|
blas->Dgemv(
|
||
|
blas,NoTranspose,n,j+1,-1.,V,n,U+u0j,1,1.,v[j+1],1
|
||
|
);
|
||
|
h=blas->Dnrm2(blas,n,v[j+1],1);
|
||
|
blas->Dscal(blas,n,1./h,v[j+1],1);
|
||
|
for (i=0; i<j; ++i ) { /* rotiere neue Spalte */
|
||
|
double tmp = c[i]*U[uij]-s[i]*U[uij+1];
|
||
|
U[uij+1] = s[i]*U[uij]+c[i]*U[uij+1];
|
||
|
U[uij] = tmp;
|
||
|
++uij;
|
||
|
}
|
||
|
{ /* berechne neue Rotation */
|
||
|
rd = U[uij];
|
||
|
dd = sqrt(rd*rd+h*h);
|
||
|
c[j] = rd/dd;
|
||
|
s[j] = -h/dd;
|
||
|
U[uij] = dd;
|
||
|
++uij;
|
||
|
}
|
||
|
{ /* rotiere rechte Seite y (vorher: y[j+1]=0) */
|
||
|
y[j+1] = s[j]*y[j];
|
||
|
y[j] = c[j]*y[j];
|
||
|
}
|
||
|
++j;
|
||
|
} while (
|
||
|
j<m && fabs(y[j])>=eps*nrm2b
|
||
|
);
|
||
|
{ /* minimiere bzgl Y */
|
||
|
blas->Dtpsv(
|
||
|
blas,
|
||
|
UpperTriangle,
|
||
|
NoTranspose,
|
||
|
NotUnitTriangular,
|
||
|
j,U,y,1
|
||
|
);
|
||
|
/* correct X */
|
||
|
blas->Dgemv(blas,NoTranspose,n,j,-1.,V,n,y,1,1.,x,1);
|
||
|
}
|
||
|
} while ( fabs(y[j])>=eps*nrm2b && (m*(io-1)+j) < (NLint)max_iter);
|
||
|
|
||
|
/* Count the inner iterations */
|
||
|
its = m*(io-1)+j;
|
||
|
blas->sq_bnorm = nrm2b*nrm2b;
|
||
|
blas->sq_rnorm = y[j]*y[j];
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, V);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, U);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, r);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, y);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, c);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, s);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, n, v);
|
||
|
return (NLuint)its;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Main driver routine */
|
||
|
|
||
|
NLuint nlSolveSystemIterative(
|
||
|
NLBlas_t blas,
|
||
|
NLMatrix M, NLMatrix P, NLdouble* b_in, NLdouble* x_in,
|
||
|
NLenum solver,
|
||
|
double eps, NLuint max_iter, NLuint inner_iter
|
||
|
) {
|
||
|
NLuint N = M->n;
|
||
|
NLuint result=0;
|
||
|
NLdouble rnorm=0.0;
|
||
|
NLdouble bnorm=0.0;
|
||
|
double* b = b_in;
|
||
|
double* x = x_in;
|
||
|
nl_assert(M->m == M->n);
|
||
|
|
||
|
if(!nlBlasHasUnifiedMemory(blas)) {
|
||
|
b = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n);
|
||
|
blas->Memcpy(
|
||
|
blas,
|
||
|
b, NL_DEVICE_MEMORY,
|
||
|
b_in, NL_HOST_MEMORY, (size_t)N*sizeof(double)
|
||
|
);
|
||
|
x = NL_NEW_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n);
|
||
|
blas->Memcpy(
|
||
|
blas,
|
||
|
x, NL_DEVICE_MEMORY,
|
||
|
x_in, NL_HOST_MEMORY, (size_t)N*sizeof(double)
|
||
|
);
|
||
|
}
|
||
|
|
||
|
switch(solver) {
|
||
|
case NL_CG:
|
||
|
if(P == NULL) {
|
||
|
result = nlSolveSystem_CG(blas,M,b,x,eps,max_iter);
|
||
|
} else {
|
||
|
result = nlSolveSystem_PRE_CG(blas,M,P,b,x,eps,max_iter);
|
||
|
}
|
||
|
break;
|
||
|
case NL_BICGSTAB:
|
||
|
if(P == NULL) {
|
||
|
result = nlSolveSystem_BICGSTAB(blas,M,b,x,eps,max_iter);
|
||
|
} else {
|
||
|
result = nlSolveSystem_PRE_BICGSTAB(blas,M,P,b,x,eps,max_iter);
|
||
|
}
|
||
|
break;
|
||
|
case NL_GMRES:
|
||
|
result = nlSolveSystem_GMRES(blas,M,b,x,eps,max_iter,inner_iter);
|
||
|
break;
|
||
|
default:
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Get residual norm and rhs norm from BLAS context */
|
||
|
if(nlCurrentContext != NULL) {
|
||
|
bnorm = sqrt(blas->sq_bnorm);
|
||
|
rnorm = sqrt(blas->sq_rnorm);
|
||
|
if(bnorm == 0.0) {
|
||
|
nlCurrentContext->error = rnorm;
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
nl_printf("in OpenNL : ||Ax-b|| = %e\n",nlCurrentContext->error);
|
||
|
}
|
||
|
} else {
|
||
|
nlCurrentContext->error = rnorm/bnorm;
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
nl_printf("in OpenNL : ||Ax-b||/||b|| = %e\n",
|
||
|
nlCurrentContext->error
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
nlCurrentContext->used_iterations = result;
|
||
|
|
||
|
if(!nlBlasHasUnifiedMemory(blas)) {
|
||
|
blas->Memcpy(
|
||
|
blas,
|
||
|
x_in, NL_HOST_MEMORY, x, NL_DEVICE_MEMORY, (size_t)N*sizeof(double)
|
||
|
);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n, x);
|
||
|
NL_DELETE_VECTOR(blas, NL_DEVICE_MEMORY, (int)M->n, b);
|
||
|
}
|
||
|
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/******* extracted from nl_preconditioners.c *******/
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
NLenum type;
|
||
|
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
|
||
|
NLdouble* diag_inv;
|
||
|
|
||
|
} NLJacobiPreconditioner;
|
||
|
|
||
|
|
||
|
static void nlJacobiPreconditionerDestroy(NLJacobiPreconditioner* M) {
|
||
|
NL_DELETE_ARRAY(M->diag_inv);
|
||
|
}
|
||
|
|
||
|
static void nlJacobiPreconditionerMult(
|
||
|
NLJacobiPreconditioner* M, const double* x, double* y
|
||
|
) {
|
||
|
NLuint i;
|
||
|
for(i=0; i<M->n; ++i) {
|
||
|
y[i] = x[i] * M->diag_inv[i];
|
||
|
}
|
||
|
nlHostBlas()->flops += (NLulong)(M->n);
|
||
|
}
|
||
|
|
||
|
NLMatrix nlNewJacobiPreconditioner(NLMatrix M_in) {
|
||
|
NLSparseMatrix* M = NULL;
|
||
|
NLJacobiPreconditioner* result = NULL;
|
||
|
NLuint i;
|
||
|
nl_assert(M_in->type == NL_MATRIX_SPARSE_DYNAMIC);
|
||
|
nl_assert(M_in->m == M_in->n);
|
||
|
M = (NLSparseMatrix*)M_in;
|
||
|
result = NL_NEW(NLJacobiPreconditioner);
|
||
|
result->m = M->m;
|
||
|
result->n = M->n;
|
||
|
result->type = NL_MATRIX_OTHER;
|
||
|
result->destroy_func = (NLDestroyMatrixFunc)nlJacobiPreconditionerDestroy;
|
||
|
result->mult_func = (NLMultMatrixVectorFunc)nlJacobiPreconditionerMult;
|
||
|
result->diag_inv = NL_NEW_ARRAY(double, M->n);
|
||
|
for(i=0; i<M->n; ++i) {
|
||
|
result->diag_inv[i] = (M->diag[i] == 0.0) ? 1.0 : 1.0/M->diag[i];
|
||
|
}
|
||
|
return (NLMatrix)result;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
NLenum type;
|
||
|
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
|
||
|
NLSparseMatrix* M;
|
||
|
|
||
|
double omega;
|
||
|
|
||
|
NLdouble* work;
|
||
|
|
||
|
} NLSSORPreconditioner;
|
||
|
|
||
|
|
||
|
static void nlSSORPreconditionerDestroy(NLSSORPreconditioner* M) {
|
||
|
NL_DELETE_ARRAY(M->work);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
static void nlSparseMatrixMultLowerInverse(
|
||
|
NLSparseMatrix* A, const NLdouble* x, NLdouble* y, double omega
|
||
|
) {
|
||
|
NLuint n = A->n;
|
||
|
NLdouble* diag = A->diag;
|
||
|
NLuint i;
|
||
|
NLuint ij;
|
||
|
NLCoeff* c = NULL;
|
||
|
NLdouble S;
|
||
|
|
||
|
nl_assert(A->storage & NL_MATRIX_STORE_SYMMETRIC);
|
||
|
nl_assert(A->storage & NL_MATRIX_STORE_ROWS);
|
||
|
|
||
|
for(i=0; i<n; i++) {
|
||
|
NLRowColumn* Ri = &(A->row[i]);
|
||
|
S = 0;
|
||
|
for(ij=0; ij < Ri->size; ij++) {
|
||
|
c = &(Ri->coeff[ij]);
|
||
|
nl_parano_assert(c->index <= i);
|
||
|
if(c->index != i) {
|
||
|
S += c->value * y[c->index];
|
||
|
}
|
||
|
}
|
||
|
nlHostBlas()->flops += (NLulong)(2*Ri->size);
|
||
|
y[i] = (x[i] - S) * omega / diag[i];
|
||
|
}
|
||
|
nlHostBlas()->flops += (NLulong)(n*3);
|
||
|
}
|
||
|
static void nlSparseMatrixMultUpperInverse(
|
||
|
NLSparseMatrix* A, const NLdouble* x, NLdouble* y, NLdouble omega
|
||
|
) {
|
||
|
NLuint n = A->n;
|
||
|
NLdouble* diag = A->diag;
|
||
|
NLint i;
|
||
|
NLuint ij;
|
||
|
NLCoeff* c = NULL;
|
||
|
NLdouble S;
|
||
|
|
||
|
nl_assert(A->storage & NL_MATRIX_STORE_SYMMETRIC);
|
||
|
nl_assert(A->storage & NL_MATRIX_STORE_COLUMNS);
|
||
|
|
||
|
for(i=(NLint)(n-1); i>=0; i--) {
|
||
|
NLRowColumn* Ci = &(A->column[i]);
|
||
|
S = 0;
|
||
|
for(ij=0; ij < Ci->size; ij++) {
|
||
|
c = &(Ci->coeff[ij]);
|
||
|
nl_parano_assert(c->index >= i);
|
||
|
if((NLint)(c->index) != i) {
|
||
|
S += c->value * y[c->index];
|
||
|
}
|
||
|
}
|
||
|
nlHostBlas()->flops += (NLulong)(2*Ci->size);
|
||
|
y[i] = (x[i] - S) * omega / diag[i];
|
||
|
}
|
||
|
nlHostBlas()->flops += (NLulong)(n*3);
|
||
|
}
|
||
|
|
||
|
|
||
|
static void nlSSORPreconditionerMult(
|
||
|
NLSSORPreconditioner* P, const double* x, double* y
|
||
|
) {
|
||
|
NLdouble* diag = P->M->diag;
|
||
|
NLuint i;
|
||
|
nlSparseMatrixMultLowerInverse(
|
||
|
P->M, x, P->work, P->omega
|
||
|
);
|
||
|
for(i=0; i<P->n; i++) {
|
||
|
P->work[i] *= (diag[i] / P->omega);
|
||
|
}
|
||
|
nlHostBlas()->flops += (NLulong)(P->n);
|
||
|
nlSparseMatrixMultUpperInverse(
|
||
|
P->M, P->work, y, P->omega
|
||
|
);
|
||
|
nlHostBlas()->Dscal(nlHostBlas(),(NLint)P->n, 2.0 - P->omega, y, 1);
|
||
|
}
|
||
|
|
||
|
NLMatrix nlNewSSORPreconditioner(NLMatrix M_in, double omega) {
|
||
|
NLSparseMatrix* M = NULL;
|
||
|
NLSSORPreconditioner* result = NULL;
|
||
|
nl_assert(M_in->type == NL_MATRIX_SPARSE_DYNAMIC);
|
||
|
nl_assert(M_in->m == M_in->n);
|
||
|
M = (NLSparseMatrix*)M_in;
|
||
|
result = NL_NEW(NLSSORPreconditioner);
|
||
|
result->m = M->m;
|
||
|
result->n = M->n;
|
||
|
result->type = NL_MATRIX_OTHER;
|
||
|
result->destroy_func = (NLDestroyMatrixFunc)nlSSORPreconditionerDestroy;
|
||
|
result->mult_func = (NLMultMatrixVectorFunc)nlSSORPreconditionerMult;
|
||
|
result->M = M;
|
||
|
result->work = NL_NEW_ARRAY(NLdouble, result->n);
|
||
|
result->omega = omega;
|
||
|
return (NLMatrix)result;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/******* extracted from nl_superlu.c *******/
|
||
|
|
||
|
|
||
|
|
||
|
#ifdef NL_OS_UNIX
|
||
|
# ifdef NL_OS_APPLE
|
||
|
# define SUPERLU_LIB_NAME "libsuperlu_5.dylib"
|
||
|
# else
|
||
|
# define SUPERLU_LIB_NAME "libsuperlu.so"
|
||
|
# endif
|
||
|
#else
|
||
|
# define SUPERLU_LIB_NAME "libsuperlu.xxx"
|
||
|
#endif
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
typedef enum {
|
||
|
SLU_NC, /* column-wise, no supernode */
|
||
|
SLU_NCP, /* column-wise, column-permuted, no supernode
|
||
|
(The consecutive columns of nonzeros, after permutation,
|
||
|
may not be stored contiguously.) */
|
||
|
SLU_NR, /* row-wize, no supernode */
|
||
|
SLU_SC, /* column-wise, supernode */
|
||
|
SLU_SCP, /* supernode, column-wise, permuted */
|
||
|
SLU_SR, /* row-wise, supernode */
|
||
|
SLU_DN, /* Fortran style column-wise storage for dense matrix */
|
||
|
SLU_NR_loc /* distributed compressed row format */
|
||
|
} Stype_t;
|
||
|
|
||
|
typedef enum {
|
||
|
SLU_S, /* single */
|
||
|
SLU_D, /* double */
|
||
|
SLU_C, /* single complex */
|
||
|
SLU_Z /* double complex */
|
||
|
} Dtype_t;
|
||
|
|
||
|
|
||
|
typedef enum {
|
||
|
SLU_GE, /* general */
|
||
|
SLU_TRLU, /* lower triangular, unit diagonal */
|
||
|
SLU_TRUU, /* upper triangular, unit diagonal */
|
||
|
SLU_TRL, /* lower triangular */
|
||
|
SLU_TRU, /* upper triangular */
|
||
|
SLU_SYL, /* symmetric, store lower half */
|
||
|
SLU_SYU, /* symmetric, store upper half */
|
||
|
SLU_HEL, /* Hermitian, store lower half */
|
||
|
SLU_HEU /* Hermitian, store upper half */
|
||
|
} Mtype_t;
|
||
|
|
||
|
typedef int int_t;
|
||
|
|
||
|
typedef struct {
|
||
|
int_t nnz; /* number of nonzeros in the matrix */
|
||
|
void *nzval; /* pointer to array of nonzero values, packed by raw */
|
||
|
int_t *colind; /* pointer to array of columns indices of the nonzeros */
|
||
|
int_t *rowptr; /* pointer to array of beginning of rows in nzval[]
|
||
|
and colind[] */
|
||
|
/* Note:
|
||
|
Zero-based indexing is used;
|
||
|
rowptr[] has nrow+1 entries, the last one pointing
|
||
|
beyond the last row, so that rowptr[nrow] = nnz. */
|
||
|
} NRformat;
|
||
|
|
||
|
typedef struct {
|
||
|
Stype_t Stype; /* Storage type: interprets the storage structure
|
||
|
pointed to by *Store. */
|
||
|
Dtype_t Dtype; /* Data type. */
|
||
|
Mtype_t Mtype; /* Matrix type: describes the mathematical property of
|
||
|
the matrix. */
|
||
|
int_t nrow; /* number of rows */
|
||
|
int_t ncol; /* number of columns */
|
||
|
void *Store; /* pointer to the actual storage of the matrix */
|
||
|
} SuperMatrix;
|
||
|
|
||
|
/* Stype == SLU_DN */
|
||
|
typedef struct {
|
||
|
int_t lda; /* leading dimension */
|
||
|
void *nzval; /* array of size lda*ncol to represent a dense matrix */
|
||
|
} DNformat;
|
||
|
|
||
|
|
||
|
typedef enum {NO, YES} yes_no_t;
|
||
|
typedef enum {DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED} fact_t;
|
||
|
typedef enum {NOROWPERM, LargeDiag, MY_PERMR} rowperm_t;
|
||
|
typedef enum {NATURAL, MMD_ATA, MMD_AT_PLUS_A, COLAMD,
|
||
|
METIS_AT_PLUS_A, PARMETIS, ZOLTAN, MY_PERMC} colperm_t;
|
||
|
typedef enum {NOTRANS, TRANS, CONJ} trans_t;
|
||
|
typedef enum {NOEQUIL, ROW, COL, BOTH} DiagScale_t;
|
||
|
typedef enum {NOREFINE, SLU_SINGLE=1, SLU_DOUBLE, SLU_EXTRA} IterRefine_t;
|
||
|
typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType;
|
||
|
typedef enum {HEAD, TAIL} stack_end_t;
|
||
|
typedef enum {SYSTEM, USER} LU_space_t;
|
||
|
typedef enum {ONE_NORM, TWO_NORM, INF_NORM} norm_t;
|
||
|
typedef enum {SILU, SMILU_1, SMILU_2, SMILU_3} milu_t;
|
||
|
|
||
|
typedef struct {
|
||
|
fact_t Fact;
|
||
|
yes_no_t Equil;
|
||
|
colperm_t ColPerm;
|
||
|
trans_t Trans;
|
||
|
IterRefine_t IterRefine;
|
||
|
double DiagPivotThresh;
|
||
|
yes_no_t SymmetricMode;
|
||
|
yes_no_t PivotGrowth;
|
||
|
yes_no_t ConditionNumber;
|
||
|
rowperm_t RowPerm;
|
||
|
int ILU_DropRule;
|
||
|
double ILU_DropTol; /* threshold for dropping */
|
||
|
double ILU_FillFactor; /* gamma in the secondary dropping */
|
||
|
norm_t ILU_Norm; /* infinity-norm, 1-norm, or 2-norm */
|
||
|
double ILU_FillTol; /* threshold for zero pivot perturbation */
|
||
|
milu_t ILU_MILU;
|
||
|
double ILU_MILU_Dim; /* Dimension of PDE (if available) */
|
||
|
yes_no_t ParSymbFact;
|
||
|
yes_no_t ReplaceTinyPivot; /* used in SuperLU_DIST */
|
||
|
yes_no_t SolveInitialized;
|
||
|
yes_no_t RefineInitialized;
|
||
|
yes_no_t PrintStat;
|
||
|
int nnzL, nnzU; /* used to store nnzs for now */
|
||
|
int num_lookaheads; /* num of levels in look-ahead */
|
||
|
yes_no_t lookahead_etree; /* use etree computed from the
|
||
|
serial symbolic factorization */
|
||
|
yes_no_t SymPattern; /* symmetric factorization */
|
||
|
} superlu_options_t;
|
||
|
|
||
|
typedef void* superlu_options_ptr;
|
||
|
|
||
|
typedef float flops_t;
|
||
|
typedef unsigned char Logical;
|
||
|
|
||
|
typedef struct {
|
||
|
int *panel_histo; /* histogram of panel size distribution */
|
||
|
double *utime; /* running time at various phases */
|
||
|
flops_t *ops; /* operation count at various phases */
|
||
|
int TinyPivots; /* number of tiny pivots */
|
||
|
int RefineSteps; /* number of iterative refinement steps */
|
||
|
int expansions; /* number of memory expansions (SuperLU4) */
|
||
|
} SuperLUStat_t;
|
||
|
|
||
|
/*! \brief Headers for 4 types of dynamatically managed memory */
|
||
|
typedef struct e_node {
|
||
|
int size; /* length of the memory that has been used */
|
||
|
void *mem; /* pointer to the new malloc'd store */
|
||
|
} ExpHeader;
|
||
|
|
||
|
typedef struct {
|
||
|
int size;
|
||
|
int used;
|
||
|
int top1; /* grow upward, relative to &array[0] */
|
||
|
int top2; /* grow downward */
|
||
|
void *array;
|
||
|
} LU_stack_t;
|
||
|
|
||
|
typedef struct {
|
||
|
int *xsup; /* supernode and column mapping */
|
||
|
int *supno;
|
||
|
int *lsub; /* compressed L subscripts */
|
||
|
int *xlsub;
|
||
|
void *lusup; /* L supernodes */
|
||
|
int *xlusup;
|
||
|
void *ucol; /* U columns */
|
||
|
int *usub;
|
||
|
int *xusub;
|
||
|
int nzlmax; /* current max size of lsub */
|
||
|
int nzumax; /* " " " ucol */
|
||
|
int nzlumax; /* " " " lusup */
|
||
|
int n; /* number of columns in the matrix */
|
||
|
LU_space_t MemModel; /* 0 - system malloc'd; 1 - user provided */
|
||
|
int num_expansions;
|
||
|
ExpHeader *expanders; /* Array of pointers to 4 types of memory */
|
||
|
LU_stack_t stack; /* use user supplied memory */
|
||
|
} GlobalLU_t;
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
typedef void (*FUNPTR_set_default_options)(superlu_options_ptr options);
|
||
|
typedef void (*FUNPTR_ilu_set_default_options)(superlu_options_ptr options);
|
||
|
typedef void (*FUNPTR_StatInit)(SuperLUStat_t *);
|
||
|
typedef void (*FUNPTR_StatFree)(SuperLUStat_t *);
|
||
|
|
||
|
typedef void (*FUNPTR_dCreate_CompCol_Matrix)(
|
||
|
SuperMatrix *, int, int, int, const double *,
|
||
|
const int *, const int *, Stype_t, Dtype_t, Mtype_t);
|
||
|
|
||
|
typedef void (*FUNPTR_dCreate_Dense_Matrix)(
|
||
|
SuperMatrix *, int, int, const double *, int,
|
||
|
Stype_t, Dtype_t, Mtype_t);
|
||
|
|
||
|
typedef void (*FUNPTR_Destroy_SuperNode_Matrix)(SuperMatrix *);
|
||
|
typedef void (*FUNPTR_Destroy_CompCol_Matrix)(SuperMatrix *);
|
||
|
typedef void (*FUNPTR_Destroy_CompCol_Permuted)(SuperMatrix *);
|
||
|
typedef void (*FUNPTR_Destroy_SuperMatrix_Store)(SuperMatrix *);
|
||
|
|
||
|
typedef void (*FUNPTR_dgssv)(
|
||
|
superlu_options_ptr, SuperMatrix *, int *, int *, SuperMatrix *,
|
||
|
SuperMatrix *, SuperMatrix *, SuperLUStat_t *, int *
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_dgstrs)(
|
||
|
trans_t, SuperMatrix *, SuperMatrix *, int *, int *,
|
||
|
SuperMatrix *, SuperLUStat_t*, int *
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_get_perm_c)(int, SuperMatrix *, int *);
|
||
|
typedef void (*FUNPTR_sp_preorder)(
|
||
|
superlu_options_t *, SuperMatrix*, int*, int*, SuperMatrix*
|
||
|
);
|
||
|
typedef int (*FUNPTR_sp_ienv)(int);
|
||
|
typedef int (*FUNPTR_input_error)(const char *, int *);
|
||
|
|
||
|
typedef void (*FUNPTR_dgstrf) (superlu_options_t *options, SuperMatrix *A,
|
||
|
int relax, int panel_size, int *etree, void *work, int lwork,
|
||
|
int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U,
|
||
|
GlobalLU_t *Glu, /* persistent to facilitate multiple factorizations */
|
||
|
SuperLUStat_t *stat, int *info
|
||
|
);
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
FUNPTR_set_default_options set_default_options;
|
||
|
FUNPTR_ilu_set_default_options ilu_set_default_options;
|
||
|
FUNPTR_StatInit StatInit;
|
||
|
FUNPTR_StatFree StatFree;
|
||
|
FUNPTR_dCreate_CompCol_Matrix dCreate_CompCol_Matrix;
|
||
|
FUNPTR_dCreate_Dense_Matrix dCreate_Dense_Matrix;
|
||
|
FUNPTR_Destroy_SuperNode_Matrix Destroy_SuperNode_Matrix;
|
||
|
FUNPTR_Destroy_CompCol_Matrix Destroy_CompCol_Matrix;
|
||
|
FUNPTR_Destroy_CompCol_Permuted Destroy_CompCol_Permuted;
|
||
|
FUNPTR_Destroy_SuperMatrix_Store Destroy_SuperMatrix_Store;
|
||
|
FUNPTR_dgssv dgssv;
|
||
|
FUNPTR_dgstrs dgstrs;
|
||
|
FUNPTR_get_perm_c get_perm_c;
|
||
|
FUNPTR_sp_preorder sp_preorder;
|
||
|
FUNPTR_sp_ienv sp_ienv;
|
||
|
FUNPTR_dgstrf dgstrf;
|
||
|
FUNPTR_input_error input_error;
|
||
|
|
||
|
NLdll DLL_handle;
|
||
|
} SuperLUContext;
|
||
|
|
||
|
static SuperLUContext* SuperLU() {
|
||
|
static SuperLUContext context;
|
||
|
static NLboolean init = NL_FALSE;
|
||
|
if(!init) {
|
||
|
init = NL_TRUE;
|
||
|
memset(&context, 0, sizeof(context));
|
||
|
}
|
||
|
return &context;
|
||
|
}
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_SUPERLU() {
|
||
|
return
|
||
|
SuperLU()->DLL_handle != NULL &&
|
||
|
SuperLU()->set_default_options != NULL &&
|
||
|
SuperLU()->ilu_set_default_options != NULL &&
|
||
|
SuperLU()->StatInit != NULL &&
|
||
|
SuperLU()->StatFree != NULL &&
|
||
|
SuperLU()->dCreate_CompCol_Matrix != NULL &&
|
||
|
SuperLU()->dCreate_Dense_Matrix != NULL &&
|
||
|
SuperLU()->Destroy_SuperNode_Matrix != NULL &&
|
||
|
SuperLU()->Destroy_CompCol_Matrix != NULL &&
|
||
|
SuperLU()->Destroy_CompCol_Permuted != NULL &&
|
||
|
SuperLU()->Destroy_SuperMatrix_Store != NULL &&
|
||
|
SuperLU()->dgssv != NULL &&
|
||
|
SuperLU()->dgstrs != NULL &&
|
||
|
SuperLU()->get_perm_c != NULL &&
|
||
|
SuperLU()->sp_preorder != NULL &&
|
||
|
SuperLU()->sp_ienv != NULL &&
|
||
|
SuperLU()->dgstrf != NULL &&
|
||
|
SuperLU()->input_error != NULL;
|
||
|
}
|
||
|
|
||
|
static void nlTerminateExtension_SUPERLU(void) {
|
||
|
if(SuperLU()->DLL_handle != NULL) {
|
||
|
nlCloseDLL(SuperLU()->DLL_handle);
|
||
|
SuperLU()->DLL_handle = NULL;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
#define find_superlu_func(name) \
|
||
|
if( \
|
||
|
( \
|
||
|
SuperLU()->name = \
|
||
|
(FUNPTR_##name)nlFindFunction(SuperLU()->DLL_handle,#name) \
|
||
|
) == NULL \
|
||
|
) { \
|
||
|
nlError("nlInitExtension_SUPERLU","function not found"); \
|
||
|
nlError("nlInitExtension_SUPERLU",#name); \
|
||
|
return NL_FALSE; \
|
||
|
}
|
||
|
|
||
|
|
||
|
NLboolean nlInitExtension_SUPERLU(void) {
|
||
|
NLenum flags = NL_LINK_NOW | NL_LINK_USE_FALLBACK;
|
||
|
if(nlCurrentContext == NULL || !nlCurrentContext->verbose) {
|
||
|
flags |= NL_LINK_QUIET;
|
||
|
}
|
||
|
|
||
|
if(SuperLU()->DLL_handle != NULL) {
|
||
|
return nlExtensionIsInitialized_SUPERLU();
|
||
|
}
|
||
|
|
||
|
SuperLU()->DLL_handle = nlOpenDLL(SUPERLU_LIB_NAME, flags);
|
||
|
if(SuperLU()->DLL_handle == NULL) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
find_superlu_func(set_default_options);
|
||
|
find_superlu_func(ilu_set_default_options);
|
||
|
find_superlu_func(StatInit);
|
||
|
find_superlu_func(StatFree);
|
||
|
find_superlu_func(dCreate_CompCol_Matrix);
|
||
|
find_superlu_func(dCreate_Dense_Matrix);
|
||
|
find_superlu_func(Destroy_SuperNode_Matrix);
|
||
|
find_superlu_func(Destroy_CompCol_Matrix);
|
||
|
find_superlu_func(Destroy_CompCol_Permuted);
|
||
|
find_superlu_func(Destroy_SuperMatrix_Store);
|
||
|
find_superlu_func(dgssv);
|
||
|
find_superlu_func(dgstrs);
|
||
|
find_superlu_func(get_perm_c);
|
||
|
find_superlu_func(sp_preorder);
|
||
|
find_superlu_func(sp_ienv);
|
||
|
find_superlu_func(dgstrf);
|
||
|
find_superlu_func(input_error);
|
||
|
|
||
|
atexit(nlTerminateExtension_SUPERLU);
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
NLenum type;
|
||
|
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
|
||
|
SuperMatrix L;
|
||
|
|
||
|
SuperMatrix U;
|
||
|
|
||
|
int* perm_r;
|
||
|
|
||
|
int* perm_c;
|
||
|
|
||
|
trans_t trans;
|
||
|
|
||
|
} NLSuperLUFactorizedMatrix;
|
||
|
|
||
|
|
||
|
static void nlSuperLUFactorizedMatrixDestroy(NLSuperLUFactorizedMatrix* M) {
|
||
|
SuperLU()->Destroy_SuperNode_Matrix(&M->L);
|
||
|
SuperLU()->Destroy_CompCol_Matrix(&M->U);
|
||
|
NL_DELETE_ARRAY(M->perm_r);
|
||
|
NL_DELETE_ARRAY(M->perm_c);
|
||
|
}
|
||
|
|
||
|
static void nlSuperLUFactorizedMatrixMult(
|
||
|
NLSuperLUFactorizedMatrix* M, const double* x, double* y
|
||
|
) {
|
||
|
SuperMatrix B;
|
||
|
SuperLUStat_t stat;
|
||
|
int info = 0;
|
||
|
NLuint i;
|
||
|
|
||
|
/* Create vector */
|
||
|
SuperLU()->dCreate_Dense_Matrix(
|
||
|
&B, (int)(M->n), 1, y, (int)(M->n),
|
||
|
SLU_DN, /* Fortran-type column-wise storage */
|
||
|
SLU_D, /* doubles */
|
||
|
SLU_GE /* general */
|
||
|
);
|
||
|
|
||
|
/* copy rhs onto y (superLU matrix-vector product expects it here */
|
||
|
for(i = 0; i < M->n; i++){
|
||
|
y[i] = x[i];
|
||
|
}
|
||
|
|
||
|
/* Call SuperLU triangular solve */
|
||
|
SuperLU()->StatInit(&stat) ;
|
||
|
|
||
|
SuperLU()->dgstrs(
|
||
|
M->trans, &M->L, &M->U, M->perm_c, M->perm_r, &B, &stat, &info
|
||
|
);
|
||
|
|
||
|
SuperLU()->StatFree(&stat) ;
|
||
|
|
||
|
/* Only the "store" structure needs to be
|
||
|
* deallocated (the array has been allocated
|
||
|
* by client code).
|
||
|
*/
|
||
|
SuperLU()->Destroy_SuperMatrix_Store(&B) ;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Copied from SUPERLU/dgssv.c, removed call to linear solve.
|
||
|
*/
|
||
|
static void dgssv_factorize_only(
|
||
|
superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r,
|
||
|
SuperMatrix *L, SuperMatrix *U,
|
||
|
SuperLUStat_t *stat, int *info, trans_t *trans
|
||
|
) {
|
||
|
SuperMatrix *AA = NULL;
|
||
|
/* A in SLU_NC format used by the factorization routine.*/
|
||
|
SuperMatrix AC; /* Matrix postmultiplied by Pc */
|
||
|
int lwork = 0, *etree, i;
|
||
|
GlobalLU_t Glu; /* Not needed on return. */
|
||
|
|
||
|
/* Set default values for some parameters */
|
||
|
int panel_size; /* panel size */
|
||
|
int relax; /* no of columns in a relaxed snodes */
|
||
|
int permc_spec;
|
||
|
|
||
|
nl_assert(A->Stype == SLU_NR || A->Stype == SLU_NC);
|
||
|
|
||
|
*trans = NOTRANS;
|
||
|
|
||
|
if ( options->Fact != DOFACT ) *info = -1;
|
||
|
else if ( A->nrow != A->ncol || A->nrow < 0 ||
|
||
|
(A->Stype != SLU_NC && A->Stype != SLU_NR) ||
|
||
|
A->Dtype != SLU_D || A->Mtype != SLU_GE )
|
||
|
*info = -2;
|
||
|
if ( *info != 0 ) {
|
||
|
i = -(*info);
|
||
|
SuperLU()->input_error("SUPERLU/OpenNL dgssv_factorize_only", &i);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/* Convert A to SLU_NC format when necessary. */
|
||
|
if ( A->Stype == SLU_NR ) {
|
||
|
NRformat *Astore = (NRformat*)A->Store;
|
||
|
AA = NL_NEW(SuperMatrix);
|
||
|
SuperLU()->dCreate_CompCol_Matrix(
|
||
|
AA, A->ncol, A->nrow, Astore->nnz,
|
||
|
(double*)Astore->nzval, Astore->colind, Astore->rowptr,
|
||
|
SLU_NC, A->Dtype, A->Mtype
|
||
|
);
|
||
|
*trans = TRANS;
|
||
|
} else {
|
||
|
if ( A->Stype == SLU_NC ) AA = A;
|
||
|
}
|
||
|
|
||
|
nl_assert(AA != NULL);
|
||
|
|
||
|
/*
|
||
|
* Get column permutation vector perm_c[], according to permc_spec:
|
||
|
* permc_spec = NATURAL: natural ordering
|
||
|
* permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
|
||
|
* permc_spec = MMD_ATA: minimum degree on structure of A'*A
|
||
|
* permc_spec = COLAMD: approximate minimum degree column ordering
|
||
|
* permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
|
||
|
*/
|
||
|
permc_spec = options->ColPerm;
|
||
|
if ( permc_spec != MY_PERMC && options->Fact == DOFACT )
|
||
|
SuperLU()->get_perm_c(permc_spec, AA, perm_c);
|
||
|
|
||
|
etree = NL_NEW_ARRAY(int,A->ncol);
|
||
|
SuperLU()->sp_preorder(options, AA, perm_c, etree, &AC);
|
||
|
panel_size = SuperLU()->sp_ienv(1);
|
||
|
relax = SuperLU()->sp_ienv(2);
|
||
|
SuperLU()->dgstrf(options, &AC, relax, panel_size, etree,
|
||
|
NULL, lwork, perm_c, perm_r, L, U, &Glu, stat, info);
|
||
|
|
||
|
NL_DELETE_ARRAY(etree);
|
||
|
SuperLU()->Destroy_CompCol_Permuted(&AC);
|
||
|
if ( A->Stype == SLU_NR ) {
|
||
|
SuperLU()->Destroy_SuperMatrix_Store(AA);
|
||
|
NL_DELETE(AA);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
NLMatrix nlMatrixFactorize_SUPERLU(
|
||
|
NLMatrix M, NLenum solver
|
||
|
) {
|
||
|
NLSuperLUFactorizedMatrix* LU = NULL;
|
||
|
NLCRSMatrix* CRS = NULL;
|
||
|
SuperMatrix superM;
|
||
|
NLuint n = M->n;
|
||
|
superlu_options_t options;
|
||
|
SuperLUStat_t stat;
|
||
|
NLint info = 0; /* status code */
|
||
|
|
||
|
nl_assert(M->m == M->n);
|
||
|
|
||
|
if(M->type == NL_MATRIX_CRS) {
|
||
|
CRS = (NLCRSMatrix*)M;
|
||
|
} else if(M->type == NL_MATRIX_SPARSE_DYNAMIC) {
|
||
|
CRS = (NLCRSMatrix*)nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)M);
|
||
|
}
|
||
|
|
||
|
nl_assert(!(CRS->symmetric_storage));
|
||
|
|
||
|
LU = NL_NEW(NLSuperLUFactorizedMatrix);
|
||
|
LU->m = M->m;
|
||
|
LU->n = M->n;
|
||
|
LU->type = NL_MATRIX_OTHER;
|
||
|
LU->destroy_func = (NLDestroyMatrixFunc)(nlSuperLUFactorizedMatrixDestroy);
|
||
|
LU->mult_func = (NLMultMatrixVectorFunc)(nlSuperLUFactorizedMatrixMult);
|
||
|
LU->perm_c = NL_NEW_ARRAY(int, n);
|
||
|
LU->perm_r = NL_NEW_ARRAY(int, n);
|
||
|
|
||
|
SuperLU()->dCreate_CompCol_Matrix(
|
||
|
&superM, (int)n, (int)n, (int)nlCRSMatrixNNZ(CRS),
|
||
|
CRS->val, (int*)CRS->colind, (int*)CRS->rowptr,
|
||
|
SLU_NR, /* Row_wise, no supernode */
|
||
|
SLU_D, /* doubles */
|
||
|
CRS->symmetric_storage ? SLU_SYL : SLU_GE
|
||
|
);
|
||
|
|
||
|
SuperLU()->set_default_options(&options);
|
||
|
switch(solver) {
|
||
|
case NL_SUPERLU_EXT: {
|
||
|
options.ColPerm = NATURAL;
|
||
|
} break;
|
||
|
case NL_PERM_SUPERLU_EXT: {
|
||
|
options.ColPerm = COLAMD;
|
||
|
} break;
|
||
|
case NL_SYMMETRIC_SUPERLU_EXT: {
|
||
|
options.ColPerm = MMD_AT_PLUS_A;
|
||
|
options.SymmetricMode = YES;
|
||
|
} break;
|
||
|
default:
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
|
||
|
SuperLU()->StatInit(&stat);
|
||
|
|
||
|
dgssv_factorize_only(
|
||
|
&options, &superM, LU->perm_c, LU->perm_r,
|
||
|
&LU->L, &LU->U, &stat, &info, &LU->trans
|
||
|
);
|
||
|
|
||
|
SuperLU()->StatFree(&stat);
|
||
|
|
||
|
/*
|
||
|
* Only the "store" structure needs to be deallocated
|
||
|
* (the arrays have been allocated by us, they are in CRS).
|
||
|
*/
|
||
|
SuperLU()->Destroy_SuperMatrix_Store(&superM);
|
||
|
|
||
|
if((NLMatrix)CRS != M) {
|
||
|
nlDeleteMatrix((NLMatrix)CRS);
|
||
|
}
|
||
|
|
||
|
if(info != 0) {
|
||
|
NL_DELETE(LU);
|
||
|
LU = NULL;
|
||
|
}
|
||
|
return (NLMatrix)LU;
|
||
|
}
|
||
|
|
||
|
/******* extracted from nl_cholmod.c *******/
|
||
|
|
||
|
|
||
|
|
||
|
#ifdef NL_OS_UNIX
|
||
|
# ifdef NL_OS_APPLE
|
||
|
# define CHOLMOD_LIB_NAME "libcholmod.dylib"
|
||
|
# else
|
||
|
# define CHOLMOD_LIB_NAME "libcholmod.so"
|
||
|
# endif
|
||
|
#else
|
||
|
# define CHOLMOD_LIB_NAME "libcholmod.xxx"
|
||
|
#endif
|
||
|
|
||
|
|
||
|
/* Excerpt from cholmod_core.h */
|
||
|
|
||
|
|
||
|
/* A dense matrix in column-oriented form. It has no itype since it contains
|
||
|
* no integers. Entry in row i and column j is located in x [i+j*d].
|
||
|
*/
|
||
|
typedef struct cholmod_dense_struct {
|
||
|
size_t nrow ; /* the matrix is nrow-by-ncol */
|
||
|
size_t ncol ;
|
||
|
size_t nzmax ; /* maximum number of entries in the matrix */
|
||
|
size_t d ; /* leading dimension (d >= nrow must hold) */
|
||
|
void *x ; /* size nzmax or 2*nzmax, if present */
|
||
|
void *z ; /* size nzmax, if present */
|
||
|
int xtype ; /* pattern, real, complex, or zomplex */
|
||
|
int dtype ; /* x and z double or float */
|
||
|
} cholmod_dense ;
|
||
|
|
||
|
/* A sparse matrix stored in compressed-column form. */
|
||
|
|
||
|
typedef struct cholmod_sparse_struct
|
||
|
{
|
||
|
size_t nrow ; /* the matrix is nrow-by-ncol */
|
||
|
size_t ncol ;
|
||
|
size_t nzmax ; /* maximum number of entries in the matrix */
|
||
|
|
||
|
/* pointers to int or SuiteSparse_long: */
|
||
|
void *p ; /* p [0..ncol], the column pointers */
|
||
|
void *i ; /* i [0..nzmax-1], the row indices */
|
||
|
|
||
|
/* for unpacked matrices only: */
|
||
|
void *nz ; /* nz [0..ncol-1], the # of nonzeros in each col. In
|
||
|
* packed form, the nonzero pattern of column j is in
|
||
|
* A->i [A->p [j] ... A->p [j+1]-1]. In unpacked form, column j is in
|
||
|
* A->i [A->p [j] ... A->p [j]+A->nz[j]-1] instead. In both cases, the
|
||
|
* numerical values (if present) are in the corresponding locations in
|
||
|
* the array x (or z if A->xtype is CHOLMOD_ZOMPLEX). */
|
||
|
|
||
|
/* pointers to double or float: */
|
||
|
void *x ; /* size nzmax or 2*nzmax, if present */
|
||
|
void *z ; /* size nzmax, if present */
|
||
|
|
||
|
int stype ; /* Describes what parts of the matrix are considered:
|
||
|
*
|
||
|
* 0: matrix is "unsymmetric": use both upper and lower triangular parts
|
||
|
* (the matrix may actually be symmetric in pattern and value, but
|
||
|
* both parts are explicitly stored and used). May be square or
|
||
|
* rectangular.
|
||
|
* >0: matrix is square and symmetric, use upper triangular part.
|
||
|
* Entries in the lower triangular part are ignored.
|
||
|
* <0: matrix is square and symmetric, use lower triangular part.
|
||
|
* Entries in the upper triangular part are ignored.
|
||
|
*
|
||
|
* Note that stype>0 and stype<0 are different for cholmod_sparse and
|
||
|
* cholmod_triplet. See the cholmod_triplet data structure for more
|
||
|
* details.
|
||
|
*/
|
||
|
|
||
|
int itype ; /* CHOLMOD_INT: p, i, and nz are int.
|
||
|
* CHOLMOD_INTLONG: p is SuiteSparse_long,
|
||
|
* i and nz are int.
|
||
|
* CHOLMOD_LONG: p, i, and nz are SuiteSparse_long */
|
||
|
|
||
|
int xtype ; /* pattern, real, complex, or zomplex */
|
||
|
int dtype ; /* x and z are double or float */
|
||
|
int sorted ; /* TRUE if columns are sorted, FALSE otherwise */
|
||
|
int packed ; /* TRUE if packed (nz ignored), FALSE if unpacked
|
||
|
* (nz is required) */
|
||
|
|
||
|
} cholmod_sparse ;
|
||
|
|
||
|
|
||
|
|
||
|
typedef void* cholmod_common_ptr;
|
||
|
typedef cholmod_dense* cholmod_dense_ptr;
|
||
|
typedef cholmod_sparse* cholmod_sparse_ptr;
|
||
|
typedef void* cholmod_factor_ptr;
|
||
|
|
||
|
|
||
|
typedef enum cholmod_xtype_enum {
|
||
|
CHOLMOD_PATTERN =0,
|
||
|
CHOLMOD_REAL =1,
|
||
|
CHOLMOD_COMPLEX =2,
|
||
|
CHOLMOD_ZOMPLEX =3
|
||
|
} cholmod_xtype;
|
||
|
|
||
|
|
||
|
typedef enum cholmod_solve_type_enum {
|
||
|
CHOLMOD_A =0,
|
||
|
CHOLMOD_LDLt =1,
|
||
|
CHOLMOD_LD =2,
|
||
|
CHOLMOD_DLt =3,
|
||
|
CHOLMOD_L =4,
|
||
|
CHOLMOD_Lt =5,
|
||
|
CHOLMOD_D =6,
|
||
|
CHOLMOD_P =7,
|
||
|
CHOLMOD_Pt =8
|
||
|
} cholmod_solve_type;
|
||
|
|
||
|
typedef int cholmod_stype;
|
||
|
|
||
|
typedef void (*FUNPTR_cholmod_start)(cholmod_common_ptr);
|
||
|
|
||
|
typedef cholmod_sparse_ptr (*FUNPTR_cholmod_allocate_sparse)(
|
||
|
size_t m, size_t n, size_t nnz, int sorted,
|
||
|
int packed, int stype, int xtype, cholmod_common_ptr
|
||
|
);
|
||
|
|
||
|
typedef cholmod_dense_ptr (*FUNPTR_cholmod_allocate_dense)(
|
||
|
size_t m, size_t n, size_t d, int xtype, cholmod_common_ptr
|
||
|
);
|
||
|
|
||
|
typedef cholmod_factor_ptr (*FUNPTR_cholmod_analyze)(
|
||
|
cholmod_sparse_ptr A, cholmod_common_ptr
|
||
|
);
|
||
|
|
||
|
typedef int (*FUNPTR_cholmod_factorize)(
|
||
|
cholmod_sparse_ptr A, cholmod_factor_ptr L, cholmod_common_ptr
|
||
|
);
|
||
|
|
||
|
typedef cholmod_dense_ptr (*FUNPTR_cholmod_solve)(
|
||
|
int solve_type, cholmod_factor_ptr, cholmod_dense_ptr, cholmod_common_ptr
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_cholmod_free_factor)(
|
||
|
cholmod_factor_ptr*, cholmod_common_ptr
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_cholmod_free_dense)(
|
||
|
cholmod_dense_ptr*, cholmod_common_ptr
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_cholmod_free_sparse)(
|
||
|
cholmod_sparse_ptr*, cholmod_common_ptr
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_cholmod_finish)(cholmod_common_ptr);
|
||
|
|
||
|
typedef struct {
|
||
|
char cholmod_common[16384];
|
||
|
|
||
|
FUNPTR_cholmod_start cholmod_start;
|
||
|
FUNPTR_cholmod_allocate_sparse cholmod_allocate_sparse;
|
||
|
FUNPTR_cholmod_allocate_dense cholmod_allocate_dense;
|
||
|
FUNPTR_cholmod_analyze cholmod_analyze;
|
||
|
FUNPTR_cholmod_factorize cholmod_factorize;
|
||
|
FUNPTR_cholmod_solve cholmod_solve;
|
||
|
FUNPTR_cholmod_free_factor cholmod_free_factor;
|
||
|
FUNPTR_cholmod_free_sparse cholmod_free_sparse;
|
||
|
FUNPTR_cholmod_free_dense cholmod_free_dense;
|
||
|
FUNPTR_cholmod_finish cholmod_finish;
|
||
|
|
||
|
NLdll DLL_handle;
|
||
|
} CHOLMODContext;
|
||
|
|
||
|
static CHOLMODContext* CHOLMOD() {
|
||
|
static CHOLMODContext context;
|
||
|
static NLboolean init = NL_FALSE;
|
||
|
if(!init) {
|
||
|
init = NL_TRUE;
|
||
|
memset(&context, 0, sizeof(context));
|
||
|
}
|
||
|
return &context;
|
||
|
}
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_CHOLMOD() {
|
||
|
return
|
||
|
CHOLMOD()->DLL_handle != NULL &&
|
||
|
CHOLMOD()->cholmod_start != NULL &&
|
||
|
CHOLMOD()->cholmod_allocate_sparse != NULL &&
|
||
|
CHOLMOD()->cholmod_allocate_dense != NULL &&
|
||
|
CHOLMOD()->cholmod_analyze != NULL &&
|
||
|
CHOLMOD()->cholmod_factorize != NULL &&
|
||
|
CHOLMOD()->cholmod_solve != NULL &&
|
||
|
CHOLMOD()->cholmod_free_factor != NULL &&
|
||
|
CHOLMOD()->cholmod_free_sparse != NULL &&
|
||
|
CHOLMOD()->cholmod_free_dense != NULL &&
|
||
|
CHOLMOD()->cholmod_finish != NULL ;
|
||
|
}
|
||
|
|
||
|
#define find_cholmod_func(name) \
|
||
|
if( \
|
||
|
( \
|
||
|
CHOLMOD()->name = \
|
||
|
(FUNPTR_##name)nlFindFunction(CHOLMOD()->DLL_handle,#name) \
|
||
|
) == NULL \
|
||
|
) { \
|
||
|
nlError("nlInitExtension_CHOLMOD","function not found"); \
|
||
|
return NL_FALSE; \
|
||
|
}
|
||
|
|
||
|
|
||
|
static void nlTerminateExtension_CHOLMOD(void) {
|
||
|
if(CHOLMOD()->DLL_handle != NULL) {
|
||
|
CHOLMOD()->cholmod_finish(&CHOLMOD()->cholmod_common);
|
||
|
nlCloseDLL(CHOLMOD()->DLL_handle);
|
||
|
CHOLMOD()->DLL_handle = NULL;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
NLboolean nlInitExtension_CHOLMOD(void) {
|
||
|
NLenum flags = NL_LINK_NOW | NL_LINK_USE_FALLBACK;
|
||
|
if(nlCurrentContext == NULL || !nlCurrentContext->verbose) {
|
||
|
flags |= NL_LINK_QUIET;
|
||
|
}
|
||
|
|
||
|
if(CHOLMOD()->DLL_handle != NULL) {
|
||
|
return nlExtensionIsInitialized_CHOLMOD();
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* MKL has a built-in CHOLMOD that conflicts with
|
||
|
* the CHOLMOD used by OpenNL (to be fixed). For now
|
||
|
* we simply output a warning message and deactivate the
|
||
|
* CHOLMOD extension if the MKL extension was initialized
|
||
|
* before.
|
||
|
*/
|
||
|
if(NLMultMatrixVector_MKL != NULL) {
|
||
|
nl_fprintf(
|
||
|
stderr,
|
||
|
"CHOLMOD extension incompatible with MKL (deactivating)"
|
||
|
);
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
|
||
|
CHOLMOD()->DLL_handle = nlOpenDLL(CHOLMOD_LIB_NAME,flags);
|
||
|
if(CHOLMOD()->DLL_handle == NULL) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
find_cholmod_func(cholmod_start);
|
||
|
find_cholmod_func(cholmod_allocate_sparse);
|
||
|
find_cholmod_func(cholmod_allocate_dense);
|
||
|
find_cholmod_func(cholmod_analyze);
|
||
|
find_cholmod_func(cholmod_factorize);
|
||
|
find_cholmod_func(cholmod_solve);
|
||
|
find_cholmod_func(cholmod_free_factor);
|
||
|
find_cholmod_func(cholmod_free_sparse);
|
||
|
find_cholmod_func(cholmod_free_dense);
|
||
|
find_cholmod_func(cholmod_finish);
|
||
|
|
||
|
CHOLMOD()->cholmod_start(&CHOLMOD()->cholmod_common);
|
||
|
|
||
|
atexit(nlTerminateExtension_CHOLMOD);
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
|
||
|
NLuint n;
|
||
|
|
||
|
NLenum type;
|
||
|
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
|
||
|
cholmod_factor_ptr L;
|
||
|
|
||
|
} NLCholmodFactorizedMatrix;
|
||
|
|
||
|
static void nlCholmodFactorizedMatrixDestroy(NLCholmodFactorizedMatrix* M) {
|
||
|
CHOLMOD()->cholmod_free_factor(&M->L, &CHOLMOD()->cholmod_common);
|
||
|
}
|
||
|
|
||
|
static void nlCholmodFactorizedMatrixMult(
|
||
|
NLCholmodFactorizedMatrix* M, const double* x, double* y
|
||
|
) {
|
||
|
/*
|
||
|
* TODO: see whether CHOLDMOD can use user-allocated vectors
|
||
|
* (and avoid copy)
|
||
|
*/
|
||
|
cholmod_dense_ptr X=CHOLMOD()->cholmod_allocate_dense(
|
||
|
M->n, 1, M->n, CHOLMOD_REAL, &CHOLMOD()->cholmod_common
|
||
|
);
|
||
|
cholmod_dense_ptr Y=NULL;
|
||
|
|
||
|
memcpy(X->x, x, M->n*sizeof(double));
|
||
|
Y = CHOLMOD()->cholmod_solve(
|
||
|
CHOLMOD_A, M->L, X, &CHOLMOD()->cholmod_common
|
||
|
);
|
||
|
memcpy(y, Y->x, M->n*sizeof(double));
|
||
|
|
||
|
CHOLMOD()->cholmod_free_dense(&X, &CHOLMOD()->cholmod_common);
|
||
|
CHOLMOD()->cholmod_free_dense(&Y, &CHOLMOD()->cholmod_common);
|
||
|
}
|
||
|
|
||
|
NLMatrix nlMatrixFactorize_CHOLMOD(
|
||
|
NLMatrix M, NLenum solver
|
||
|
) {
|
||
|
NLCholmodFactorizedMatrix* LLt = NULL;
|
||
|
NLCRSMatrix* CRS = NULL;
|
||
|
cholmod_sparse_ptr cM= NULL;
|
||
|
NLuint nnz, cur, i, j, jj;
|
||
|
int* rowptr = NULL;
|
||
|
int* colind = NULL;
|
||
|
double* val = NULL;
|
||
|
NLuint n = M->n;
|
||
|
|
||
|
nl_assert(solver == NL_CHOLMOD_EXT);
|
||
|
nl_assert(M->m == M->n);
|
||
|
|
||
|
if(M->type == NL_MATRIX_CRS) {
|
||
|
CRS = (NLCRSMatrix*)M;
|
||
|
} else if(M->type == NL_MATRIX_SPARSE_DYNAMIC) {
|
||
|
/*
|
||
|
* Note: since we convert once again into symmetric storage,
|
||
|
* we could also directly read the NLSparseMatrix there instead
|
||
|
* of copying once more...
|
||
|
*/
|
||
|
CRS = (NLCRSMatrix*)nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)M);
|
||
|
}
|
||
|
|
||
|
LLt = NL_NEW(NLCholmodFactorizedMatrix);
|
||
|
LLt->m = M->m;
|
||
|
LLt->n = M->n;
|
||
|
LLt->type = NL_MATRIX_OTHER;
|
||
|
LLt->destroy_func = (NLDestroyMatrixFunc)(nlCholmodFactorizedMatrixDestroy);
|
||
|
LLt->mult_func = (NLMultMatrixVectorFunc)(nlCholmodFactorizedMatrixMult);
|
||
|
|
||
|
/*
|
||
|
* Compute required nnz, if matrix is not already with symmetric storage,
|
||
|
* ignore entries in the upper triangular part.
|
||
|
*/
|
||
|
|
||
|
nnz=0;
|
||
|
for(i=0; i<n; ++i) {
|
||
|
for(jj=CRS->rowptr[i]; jj<CRS->rowptr[i+1]; ++jj) {
|
||
|
j=CRS->colind[jj];
|
||
|
if(j <= i) {
|
||
|
++nnz;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Copy CRS matrix into CHOLDMOD matrix (and ignore upper trianglar part)
|
||
|
*/
|
||
|
|
||
|
cM = CHOLMOD()->cholmod_allocate_sparse(
|
||
|
n, n, nnz, /* Dimensions and number of non-zeros */
|
||
|
NL_FALSE, /* Sorted = false */
|
||
|
NL_TRUE, /* Packed = true */
|
||
|
1, /* stype (-1 = lower triangular, 1 = upper triangular) */
|
||
|
CHOLMOD_REAL, /* Entries are real numbers */
|
||
|
&CHOLMOD()->cholmod_common
|
||
|
);
|
||
|
|
||
|
rowptr = (int*)cM->p;
|
||
|
colind = (int*)cM->i;
|
||
|
val = (double*)cM->x;
|
||
|
cur = 0;
|
||
|
for(i=0; i<n; ++i) {
|
||
|
rowptr[i] = (int)cur;
|
||
|
for(jj=CRS->rowptr[i]; jj<CRS->rowptr[i+1]; ++jj) {
|
||
|
j = CRS->colind[jj];
|
||
|
if(j <= i) {
|
||
|
val[cur] = CRS->val[jj];
|
||
|
colind[cur] = (int)j;
|
||
|
++cur;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
rowptr[n] = (int)cur;
|
||
|
nl_assert(cur==nnz);
|
||
|
|
||
|
LLt->L = CHOLMOD()->cholmod_analyze(cM, &CHOLMOD()->cholmod_common);
|
||
|
if(!CHOLMOD()->cholmod_factorize(cM, LLt->L, &CHOLMOD()->cholmod_common)) {
|
||
|
CHOLMOD()->cholmod_free_factor(&LLt->L, &CHOLMOD()->cholmod_common);
|
||
|
NL_DELETE(LLt);
|
||
|
}
|
||
|
|
||
|
CHOLMOD()->cholmod_free_sparse(&cM, &CHOLMOD()->cholmod_common);
|
||
|
|
||
|
if((NLMatrix)CRS != M) {
|
||
|
nlDeleteMatrix((NLMatrix)CRS);
|
||
|
}
|
||
|
|
||
|
return (NLMatrix)(LLt);
|
||
|
}
|
||
|
|
||
|
|
||
|
/******* extracted from nl_arpack.c *******/
|
||
|
|
||
|
|
||
|
|
||
|
#ifdef NL_OS_UNIX
|
||
|
# ifdef NL_OS_APPLE
|
||
|
# define ARPACK_LIB_NAME "libarpack.dylib"
|
||
|
# else
|
||
|
# define ARPACK_LIB_NAME "libarpack.so"
|
||
|
# endif
|
||
|
#else
|
||
|
# define ARPACK_LIB_NAME "libarpack.dll"
|
||
|
#endif
|
||
|
|
||
|
|
||
|
typedef int ARint;
|
||
|
typedef int ARlogical;
|
||
|
|
||
|
|
||
|
/* double precision symmetric routines */
|
||
|
|
||
|
typedef void (*FUNPTR_dsaupd)(
|
||
|
ARint *ido, char *bmat, ARint *n, char *which,
|
||
|
ARint *nev, double *tol, double *resid,
|
||
|
ARint *ncv, double *V, ARint *ldv,
|
||
|
ARint *iparam, ARint *ipntr, double *workd,
|
||
|
double *workl, ARint *lworkl, ARint *info
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_dseupd)(
|
||
|
ARlogical *rvec, char *HowMny, ARlogical *select,
|
||
|
double *d, double *Z, ARint *ldz,
|
||
|
double *sigma, char *bmat, ARint *n,
|
||
|
char *which, ARint *nev, double *tol,
|
||
|
double *resid, ARint *ncv, double *V,
|
||
|
ARint *ldv, ARint *iparam, ARint *ipntr,
|
||
|
double *workd, double *workl,
|
||
|
ARint *lworkl, ARint *info
|
||
|
);
|
||
|
|
||
|
/* double precision nonsymmetric routines */
|
||
|
|
||
|
typedef void (*FUNPTR_dnaupd)(
|
||
|
ARint *ido, char *bmat, ARint *n, char *which,
|
||
|
ARint *nev, double *tol, double *resid,
|
||
|
ARint *ncv, double *V, ARint *ldv,
|
||
|
ARint *iparam, ARint *ipntr, double *workd,
|
||
|
double *workl, ARint *lworkl, ARint *info
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_dneupd)(
|
||
|
ARlogical *rvec, char *HowMny, ARlogical *select,
|
||
|
double *dr, double *di, double *Z,
|
||
|
ARint *ldz, double *sigmar,
|
||
|
double *sigmai, double *workev,
|
||
|
char *bmat, ARint *n, char *which,
|
||
|
ARint *nev, double *tol, double *resid,
|
||
|
ARint *ncv, double *V, ARint *ldv,
|
||
|
ARint *iparam, ARint *ipntr,
|
||
|
double *workd, double *workl,
|
||
|
ARint *lworkl, ARint *info
|
||
|
);
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
FUNPTR_dsaupd dsaupd;
|
||
|
FUNPTR_dseupd dseupd;
|
||
|
FUNPTR_dnaupd dnaupd;
|
||
|
FUNPTR_dneupd dneupd;
|
||
|
NLdll DLL_handle;
|
||
|
} ARPACKContext;
|
||
|
|
||
|
|
||
|
static ARPACKContext* ARPACK() {
|
||
|
static ARPACKContext context;
|
||
|
static NLboolean init = NL_FALSE;
|
||
|
if(!init) {
|
||
|
init = NL_TRUE;
|
||
|
memset(&context, 0, sizeof(context));
|
||
|
}
|
||
|
return &context;
|
||
|
}
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_ARPACK() {
|
||
|
return
|
||
|
ARPACK()->DLL_handle != NULL &&
|
||
|
ARPACK()->dsaupd != NULL &&
|
||
|
ARPACK()->dseupd != NULL &&
|
||
|
ARPACK()->dnaupd != NULL &&
|
||
|
ARPACK()->dneupd != NULL;
|
||
|
}
|
||
|
|
||
|
static void nlTerminateExtension_ARPACK(void) {
|
||
|
if(ARPACK()->DLL_handle != NULL) {
|
||
|
nlCloseDLL(ARPACK()->DLL_handle);
|
||
|
ARPACK()->DLL_handle = NULL;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
static char* u(const char* str) {
|
||
|
static char buff[1000];
|
||
|
sprintf(buff, "%s_", str);
|
||
|
return buff;
|
||
|
}
|
||
|
|
||
|
#define find_arpack_func(name) \
|
||
|
if( \
|
||
|
( \
|
||
|
ARPACK()->name = \
|
||
|
(FUNPTR_##name)nlFindFunction(ARPACK()->DLL_handle,u(#name)) \
|
||
|
) == NULL \
|
||
|
) { \
|
||
|
nlError("nlInitExtension_ARPACK","function not found"); \
|
||
|
nlError("nlInitExtension_ARPACK",u(#name)); \
|
||
|
return NL_FALSE; \
|
||
|
}
|
||
|
|
||
|
NLboolean nlInitExtension_ARPACK(void) {
|
||
|
NLenum flags = NL_LINK_NOW | NL_LINK_USE_FALLBACK;
|
||
|
if(nlCurrentContext == NULL || !nlCurrentContext->verbose) {
|
||
|
flags |= NL_LINK_QUIET;
|
||
|
}
|
||
|
|
||
|
if(ARPACK()->DLL_handle != NULL) {
|
||
|
return nlExtensionIsInitialized_ARPACK();
|
||
|
}
|
||
|
|
||
|
ARPACK()->DLL_handle = nlOpenDLL(ARPACK_LIB_NAME, flags);
|
||
|
if(ARPACK()->DLL_handle == NULL) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
find_arpack_func(dsaupd);
|
||
|
find_arpack_func(dseupd);
|
||
|
find_arpack_func(dnaupd);
|
||
|
find_arpack_func(dneupd);
|
||
|
|
||
|
atexit(nlTerminateExtension_ARPACK);
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
static NLMatrix create_OP(NLboolean symmetric) {
|
||
|
NLuint n = nlCurrentContext->M->n;
|
||
|
NLuint i;
|
||
|
NLMatrix result = NULL;
|
||
|
|
||
|
|
||
|
if(nlCurrentContext->eigen_shift != 0.0) {
|
||
|
/*
|
||
|
* A = M
|
||
|
*/
|
||
|
NLSparseMatrix* A = NL_NEW(NLSparseMatrix);
|
||
|
nlSparseMatrixConstruct(A, n, n, NL_MATRIX_STORE_ROWS);
|
||
|
nlSparseMatrixAddMatrix(A, 1.0, nlCurrentContext->M);
|
||
|
if(nlCurrentContext->B == NULL) {
|
||
|
/*
|
||
|
* A = A - shift * Id
|
||
|
*/
|
||
|
for(i=0; i<n; ++i) {
|
||
|
nlSparseMatrixAdd(A, i, i, -nlCurrentContext->eigen_shift);
|
||
|
}
|
||
|
} else {
|
||
|
/*
|
||
|
* A = A - shift * B
|
||
|
*/
|
||
|
nlSparseMatrixAddMatrix(
|
||
|
A, -nlCurrentContext->eigen_shift, nlCurrentContext->B
|
||
|
);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* OP = A^{-1}
|
||
|
*/
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
nl_printf("Factorizing matrix...\n");
|
||
|
}
|
||
|
result = nlMatrixFactorize(
|
||
|
(NLMatrix)A,
|
||
|
symmetric ? NL_SYMMETRIC_SUPERLU_EXT : NL_PERM_SUPERLU_EXT
|
||
|
);
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
nl_printf("Matrix factorized\n");
|
||
|
}
|
||
|
nlDeleteMatrix((NLMatrix)A);
|
||
|
} else {
|
||
|
/*
|
||
|
* OP = M^{-1}
|
||
|
*/
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
nl_printf("Factorizing matrix...\n");
|
||
|
}
|
||
|
result = nlMatrixFactorize(
|
||
|
nlCurrentContext->M,
|
||
|
symmetric ? NL_SYMMETRIC_SUPERLU_EXT : NL_PERM_SUPERLU_EXT
|
||
|
);
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
nl_printf("Matrix factorized\n");
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(nlCurrentContext->B != NULL) {
|
||
|
/*
|
||
|
* OP = OP * B
|
||
|
*/
|
||
|
result = nlMatrixNewFromProduct(
|
||
|
result, NL_TRUE, /* mem. ownership transferred */
|
||
|
nlCurrentContext->B, NL_FALSE /* mem. ownership kept by context */
|
||
|
);
|
||
|
}
|
||
|
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
static int eigencompare(const void* pi, const void* pj) {
|
||
|
NLuint i = *(const NLuint*)pi;
|
||
|
NLuint j = *(const NLuint*)pj;
|
||
|
double vali = fabs(nlCurrentContext->temp_eigen_value[i]);
|
||
|
double valj = fabs(nlCurrentContext->temp_eigen_value[j]);
|
||
|
if(vali == valj) {
|
||
|
return 0;
|
||
|
}
|
||
|
return vali < valj ? -1 : 1;
|
||
|
}
|
||
|
|
||
|
void nlEigenSolve_ARPACK(void) {
|
||
|
NLboolean symmetric =
|
||
|
nlCurrentContext->symmetric && (nlCurrentContext->B == NULL);
|
||
|
int n = (int)nlCurrentContext->M->n; /* Dimension of the matrix */
|
||
|
int nev = /* Number of eigenvectors requested */
|
||
|
(int)nlCurrentContext->nb_systems;
|
||
|
NLMatrix OP = create_OP(symmetric);
|
||
|
int ncv = (int)(nev * 2.5); /* Length of Arnoldi factorization */
|
||
|
/* Rule of thumb in ARPACK documentation: ncv > 2 * nev */
|
||
|
int* iparam = NULL;
|
||
|
int* ipntr = NULL;
|
||
|
NLdouble* resid = NULL;
|
||
|
NLdouble* workev = NULL;
|
||
|
NLdouble* workd = NULL;
|
||
|
NLdouble* workl = NULL;
|
||
|
NLdouble* v = NULL;
|
||
|
NLdouble* d = NULL;
|
||
|
ARlogical* select = NULL;
|
||
|
ARlogical rvec = 1;
|
||
|
double sigmar = 0.0;
|
||
|
double sigmai = 0.0;
|
||
|
int ierr;
|
||
|
int i,k,kk;
|
||
|
int ldv = (int)n;
|
||
|
char* bmat = (char*)"I"; /*Standard problem */
|
||
|
char* which = (char*)"LM"; /*Largest eigenvalues, but we invert->smallest */
|
||
|
char* howmny = (char*)"A"; /*which eigens should be computed: all */
|
||
|
double tol = nlCurrentContext->threshold;
|
||
|
int ido = 0; /* reverse communication variable (which operation ?) */
|
||
|
int info = 1; /* start with initial value of resid */
|
||
|
int lworkl; /* size of work array */
|
||
|
NLboolean converged = NL_FALSE;
|
||
|
NLdouble value;
|
||
|
int index;
|
||
|
int* sorted; /* indirection array for sorting eigenpairs */
|
||
|
|
||
|
if(ncv > n) {
|
||
|
ncv = n;
|
||
|
}
|
||
|
|
||
|
if(nev > n) {
|
||
|
nev = n;
|
||
|
}
|
||
|
|
||
|
if(nev + 2 > ncv) {
|
||
|
nev = ncv - 2;
|
||
|
}
|
||
|
|
||
|
|
||
|
if(symmetric) {
|
||
|
lworkl = ncv * (ncv + 8) ;
|
||
|
} else {
|
||
|
lworkl = 3*ncv*ncv + 6*ncv ;
|
||
|
}
|
||
|
iparam = NL_NEW_ARRAY(int, 11);
|
||
|
ipntr = NL_NEW_ARRAY(int, 14);
|
||
|
|
||
|
iparam[1-1] = 1; /* ARPACK chooses the shifts */
|
||
|
iparam[3-1] = (int)nlCurrentContext->max_iterations;
|
||
|
iparam[7-1] = 1; /* Normal mode (we do not use
|
||
|
shift-invert (3) since we do our own shift-invert */
|
||
|
|
||
|
workev = NL_NEW_ARRAY(NLdouble, 3*ncv);
|
||
|
workd = NL_NEW_ARRAY(NLdouble, 3*n);
|
||
|
|
||
|
resid = NL_NEW_ARRAY(NLdouble, n);
|
||
|
for(i=0; i<n; ++i) {
|
||
|
resid[i] = 1.0; /* (double)i / (double)n; */
|
||
|
}
|
||
|
v = NL_NEW_ARRAY(NLdouble, ldv*ncv);
|
||
|
if(symmetric) {
|
||
|
d = NL_NEW_ARRAY(NLdouble, 2*ncv);
|
||
|
} else {
|
||
|
d = NL_NEW_ARRAY(NLdouble, 3*ncv);
|
||
|
}
|
||
|
workl = NL_NEW_ARRAY(NLdouble, lworkl);
|
||
|
|
||
|
|
||
|
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
if(symmetric) {
|
||
|
nl_printf("calling dsaupd()\n");
|
||
|
} else {
|
||
|
nl_printf("calling dnaupd()\n");
|
||
|
}
|
||
|
}
|
||
|
while(!converged) {
|
||
|
/*
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
fprintf(stderr, ".");
|
||
|
fflush(stderr);
|
||
|
}
|
||
|
*/
|
||
|
if(symmetric) {
|
||
|
ARPACK()->dsaupd(
|
||
|
&ido, bmat, &n, which, &nev, &tol, resid, &ncv,
|
||
|
v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info
|
||
|
);
|
||
|
} else {
|
||
|
ARPACK()->dnaupd(
|
||
|
&ido, bmat, &n, which, &nev, &tol, resid, &ncv,
|
||
|
v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info
|
||
|
);
|
||
|
}
|
||
|
if(ido == 1) {
|
||
|
nlMultMatrixVector(
|
||
|
OP,
|
||
|
workd+ipntr[1-1]-1, /*The "-1"'s are for FORTRAN-to-C conversion */
|
||
|
workd+ipntr[2-1]-1 /*to keep the same indices as in ARPACK doc */
|
||
|
);
|
||
|
} else {
|
||
|
converged = NL_TRUE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
if(info < 0) {
|
||
|
if(symmetric) {
|
||
|
nl_fprintf(stderr, "\nError with dsaupd(): %d\n", info);
|
||
|
} else {
|
||
|
nl_fprintf(stderr, "\nError with dnaupd(): %d\n", info);
|
||
|
}
|
||
|
} else {
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
fprintf(stderr, "\nconverged\n");
|
||
|
}
|
||
|
|
||
|
select = NL_NEW_ARRAY(ARlogical, ncv);
|
||
|
for(i=0; i<ncv; ++i) {
|
||
|
select[i] = 1;
|
||
|
}
|
||
|
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
if(symmetric) {
|
||
|
nl_printf("calling dseupd()\n");
|
||
|
} else {
|
||
|
nl_printf("calling dneupd()\n");
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if(symmetric) {
|
||
|
ARPACK()->dseupd(
|
||
|
&rvec, howmny, select, d, v,
|
||
|
&ldv, &sigmar, bmat, &n, which, &nev,
|
||
|
&tol, resid, &ncv, v, &ldv,
|
||
|
iparam, ipntr, workd,
|
||
|
workl, &lworkl, &ierr
|
||
|
);
|
||
|
} else {
|
||
|
ARPACK()->dneupd(
|
||
|
&rvec, howmny, select, d, d+ncv,
|
||
|
v, &ldv,
|
||
|
&sigmar, &sigmai, workev, bmat, &n,
|
||
|
which, &nev, &tol,
|
||
|
resid, &ncv, v, &ldv, iparam,
|
||
|
ipntr, workd, workl, &lworkl, &ierr
|
||
|
) ;
|
||
|
}
|
||
|
|
||
|
|
||
|
if(nlCurrentContext->verbose) {
|
||
|
if(ierr != 0) {
|
||
|
if(symmetric) {
|
||
|
nl_fprintf(stderr, "Error with dseupd(): %d\n", ierr);
|
||
|
} else {
|
||
|
nl_fprintf(stderr, "Error with dneupd(): %d\n", ierr);
|
||
|
}
|
||
|
} else {
|
||
|
if(symmetric) {
|
||
|
nl_printf("dseupd() OK, nconv= %d\n", iparam[3-1]);
|
||
|
} else {
|
||
|
nl_printf("dneupd() OK, nconv= %d\n", iparam[3-1]);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
NL_DELETE_ARRAY(select);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
for(i=0; i<nev; ++i) {
|
||
|
d[i] = (fabs(d[i]) < 1e-30) ? 1e30 : 1.0 / d[i] ;
|
||
|
d[i] += nlCurrentContext->eigen_shift ;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/* Make it visible to the eigen_compare function */
|
||
|
nlCurrentContext->temp_eigen_value = d;
|
||
|
sorted = NL_NEW_ARRAY(int, nev);
|
||
|
for(i=0; i<nev; ++i) {
|
||
|
sorted[i] = i;
|
||
|
}
|
||
|
qsort(sorted, (size_t)nev, sizeof(NLuint), eigencompare);
|
||
|
nlCurrentContext->temp_eigen_value = NULL;
|
||
|
|
||
|
|
||
|
|
||
|
for(k=0; k<nev; ++k) {
|
||
|
kk = sorted[k];
|
||
|
nlCurrentContext->eigen_value[k] = d[kk];
|
||
|
for(i=0; i<(int)nlCurrentContext->nb_variables; ++i) {
|
||
|
if(!nlCurrentContext->variable_is_locked[i]) {
|
||
|
index = (int)nlCurrentContext->variable_index[i];
|
||
|
nl_assert(index < n);
|
||
|
value = v[kk*n+index];
|
||
|
NL_BUFFER_ITEM(
|
||
|
nlCurrentContext->variable_buffer[k],(NLuint)i
|
||
|
) = value;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
NL_DELETE_ARRAY(sorted);
|
||
|
NL_DELETE_ARRAY(workl);
|
||
|
NL_DELETE_ARRAY(d);
|
||
|
NL_DELETE_ARRAY(v);
|
||
|
NL_DELETE_ARRAY(resid);
|
||
|
NL_DELETE_ARRAY(workd);
|
||
|
NL_DELETE_ARRAY(workev);
|
||
|
nlDeleteMatrix(OP);
|
||
|
NL_DELETE_ARRAY(iparam);
|
||
|
NL_DELETE_ARRAY(ipntr);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/******* extracted from nl_mkl.c *******/
|
||
|
|
||
|
|
||
|
|
||
|
typedef unsigned int MKL_INT;
|
||
|
|
||
|
typedef void (*FUNPTR_mkl_cspblas_dcsrgemv)(
|
||
|
const char *transa, const MKL_INT *m, const double *a,
|
||
|
const MKL_INT *ia, const MKL_INT *ja, const double *x, double *y
|
||
|
);
|
||
|
|
||
|
typedef void (*FUNPTR_mkl_cspblas_dcsrsymv)(
|
||
|
const char *transa, const MKL_INT *m, const double *a,
|
||
|
const MKL_INT *ia, const MKL_INT *ja, const double *x, double *y
|
||
|
);
|
||
|
|
||
|
typedef struct {
|
||
|
NLdll DLL_mkl_intel_lp64;
|
||
|
NLdll DLL_mkl_intel_thread;
|
||
|
NLdll DLL_mkl_core;
|
||
|
NLdll DLL_iomp5;
|
||
|
|
||
|
FUNPTR_mkl_cspblas_dcsrgemv mkl_cspblas_dcsrgemv;
|
||
|
FUNPTR_mkl_cspblas_dcsrsymv mkl_cspblas_dcsrsymv;
|
||
|
} MKLContext;
|
||
|
|
||
|
static MKLContext* MKL() {
|
||
|
static MKLContext context;
|
||
|
static NLboolean init = NL_FALSE;
|
||
|
if(!init) {
|
||
|
init = NL_TRUE;
|
||
|
memset(&context, 0, sizeof(context));
|
||
|
}
|
||
|
return &context;
|
||
|
}
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_MKL() {
|
||
|
if(
|
||
|
MKL()->DLL_iomp5 == NULL ||
|
||
|
MKL()->DLL_mkl_core == NULL ||
|
||
|
MKL()->DLL_mkl_intel_thread == NULL ||
|
||
|
MKL()->DLL_mkl_intel_lp64 == NULL ||
|
||
|
MKL()->mkl_cspblas_dcsrgemv == NULL ||
|
||
|
MKL()->mkl_cspblas_dcsrsymv == NULL
|
||
|
) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
#define find_mkl_func(name) \
|
||
|
if( \
|
||
|
( \
|
||
|
MKL()->name = \
|
||
|
(FUNPTR_##name)nlFindFunction( \
|
||
|
MKL()->DLL_mkl_intel_lp64,#name \
|
||
|
) \
|
||
|
) == NULL \
|
||
|
) { \
|
||
|
nlError("nlInitExtension_MKL","function not found"); \
|
||
|
return NL_FALSE; \
|
||
|
}
|
||
|
|
||
|
static void nlTerminateExtension_MKL(void) {
|
||
|
if(!nlExtensionIsInitialized_MKL()) {
|
||
|
return;
|
||
|
}
|
||
|
nlCloseDLL(MKL()->DLL_mkl_intel_lp64);
|
||
|
nlCloseDLL(MKL()->DLL_mkl_intel_thread);
|
||
|
nlCloseDLL(MKL()->DLL_mkl_core);
|
||
|
nlCloseDLL(MKL()->DLL_iomp5);
|
||
|
}
|
||
|
|
||
|
NLMultMatrixVectorFunc NLMultMatrixVector_MKL = NULL;
|
||
|
|
||
|
static void NLMultMatrixVector_MKL_impl(NLMatrix M_in, const double* x, double* y) {
|
||
|
NLCRSMatrix* M = (NLCRSMatrix*)(M_in);
|
||
|
nl_debug_assert(M_in->type == NL_MATRIX_CRS);
|
||
|
if(M->symmetric_storage) {
|
||
|
MKL()->mkl_cspblas_dcsrsymv(
|
||
|
"N", /* No transpose */
|
||
|
&M->m,
|
||
|
M->val,
|
||
|
M->rowptr,
|
||
|
M->colind,
|
||
|
x,
|
||
|
y
|
||
|
);
|
||
|
} else {
|
||
|
MKL()->mkl_cspblas_dcsrgemv(
|
||
|
"N", /* No transpose */
|
||
|
&M->m,
|
||
|
M->val,
|
||
|
M->rowptr,
|
||
|
M->colind,
|
||
|
x,
|
||
|
y
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
#define INTEL_PREFIX "/opt/intel/"
|
||
|
#define LIB_DIR "lib/intel64/"
|
||
|
#define MKL_PREFIX INTEL_PREFIX "mkl/" LIB_DIR
|
||
|
|
||
|
NLboolean nlInitExtension_MKL(void) {
|
||
|
NLenum flags = NL_LINK_LAZY | NL_LINK_GLOBAL;
|
||
|
if(nlCurrentContext == NULL || !nlCurrentContext->verbose) {
|
||
|
flags |= NL_LINK_QUIET;
|
||
|
}
|
||
|
|
||
|
if(MKL()->DLL_mkl_intel_lp64 != NULL) {
|
||
|
return nlExtensionIsInitialized_MKL();
|
||
|
}
|
||
|
|
||
|
MKL()->DLL_iomp5 = nlOpenDLL(
|
||
|
INTEL_PREFIX LIB_DIR "libiomp5.so",
|
||
|
flags
|
||
|
);
|
||
|
MKL()->DLL_mkl_core = nlOpenDLL(
|
||
|
MKL_PREFIX "libmkl_core.so",
|
||
|
flags
|
||
|
);
|
||
|
MKL()->DLL_mkl_intel_thread = nlOpenDLL(
|
||
|
MKL_PREFIX "libmkl_intel_thread.so",
|
||
|
flags
|
||
|
);
|
||
|
MKL()->DLL_mkl_intel_lp64 = nlOpenDLL(
|
||
|
MKL_PREFIX "libmkl_intel_lp64.so",
|
||
|
flags
|
||
|
);
|
||
|
|
||
|
if(
|
||
|
MKL()->DLL_iomp5 == NULL ||
|
||
|
MKL()->DLL_mkl_core == NULL ||
|
||
|
MKL()->DLL_mkl_intel_thread == NULL ||
|
||
|
MKL()->DLL_mkl_intel_lp64 == NULL
|
||
|
) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
find_mkl_func(mkl_cspblas_dcsrgemv);
|
||
|
find_mkl_func(mkl_cspblas_dcsrsymv);
|
||
|
|
||
|
if(nlExtensionIsInitialized_MKL()) {
|
||
|
NLMultMatrixVector_MKL = NLMultMatrixVector_MKL_impl;
|
||
|
}
|
||
|
|
||
|
atexit(nlTerminateExtension_MKL);
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/******* extracted from nl_cuda.c *******/
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/* CUDA structures and functions */
|
||
|
/* Repeated here so that one can compile OpenNL without */
|
||
|
/* requiring CUDA to be installed in the system. */
|
||
|
|
||
|
|
||
|
struct cudaDeviceProp {
|
||
|
char name[256];
|
||
|
size_t totalGlobalMem;
|
||
|
size_t sharedMemPerBlock;
|
||
|
int regsPerBlock;
|
||
|
int warpSize;
|
||
|
size_t memPitch;
|
||
|
int maxThreadsPerBlock;
|
||
|
int maxThreadsDim[3];
|
||
|
int maxGridSize[3];
|
||
|
int clockRate;
|
||
|
size_t totalConstMem;
|
||
|
int major;
|
||
|
int minor;
|
||
|
size_t textureAlignment;
|
||
|
size_t texturePitchAlignment;
|
||
|
int deviceOverlap;
|
||
|
int multiProcessorCount;
|
||
|
int kernelExecTimeoutEnabled;
|
||
|
int integrated;
|
||
|
int canMapHostMemory;
|
||
|
int computeMode;
|
||
|
int maxTexture1D;
|
||
|
int maxTexture1DMipmap;
|
||
|
int maxTexture1DLinear;
|
||
|
int maxTexture2D[2];
|
||
|
int maxTexture2DMipmap[2];
|
||
|
int maxTexture2DLinear[3];
|
||
|
int maxTexture2DGather[2];
|
||
|
int maxTexture3D[3];
|
||
|
int maxTexture3DAlt[3];
|
||
|
int maxTextureCubemap;
|
||
|
int maxTexture1DLayered[2];
|
||
|
int maxTexture2DLayered[3];
|
||
|
int maxTextureCubemapLayered[2];
|
||
|
int maxSurface1D;
|
||
|
int maxSurface2D[2];
|
||
|
int maxSurface3D[3];
|
||
|
int maxSurface1DLayered[2];
|
||
|
int maxSurface2DLayered[3];
|
||
|
int maxSurfaceCubemap;
|
||
|
int maxSurfaceCubemapLayered[2];
|
||
|
size_t surfaceAlignment;
|
||
|
int concurrentKernels;
|
||
|
int ECCEnabled;
|
||
|
int pciBusID;
|
||
|
int pciDeviceID;
|
||
|
int pciDomainID;
|
||
|
int tccDriver;
|
||
|
int asyncEngineCount;
|
||
|
int unifiedAddressing;
|
||
|
int memoryClockRate;
|
||
|
int memoryBusWidth;
|
||
|
int l2CacheSize;
|
||
|
int maxThreadsPerMultiProcessor;
|
||
|
int streamPrioritiesSupported;
|
||
|
int globalL1CacheSupported;
|
||
|
int localL1CacheSupported;
|
||
|
size_t sharedMemPerMultiprocessor;
|
||
|
int regsPerMultiprocessor;
|
||
|
int managedMemSupported;
|
||
|
int isMultiGpuBoard;
|
||
|
int multiGpuBoardGroupID;
|
||
|
int singleToDoublePrecisionPerfRatio;
|
||
|
int pageableMemoryAccess;
|
||
|
int concurrentManagedAccess;
|
||
|
char padding[1024]; /* More room for future evolutions */
|
||
|
};
|
||
|
|
||
|
enum cudaComputeMode {
|
||
|
cudaComputeModeDefault = 0,
|
||
|
cudaComputeModeExclusive = 1,
|
||
|
cudaComputeModeProhibited = 2,
|
||
|
cudaComputeModeExclusiveProcess = 3
|
||
|
};
|
||
|
|
||
|
enum cudaMemcpyKind {
|
||
|
cudaMemcpyHostToHost = 0,
|
||
|
cudaMemcpyHostToDevice = 1,
|
||
|
cudaMemcpyDeviceToHost = 2,
|
||
|
cudaMemcpyDeviceToDevice = 3,
|
||
|
cudaMemcpyDefault = 4
|
||
|
};
|
||
|
|
||
|
typedef int cudaError_t;
|
||
|
|
||
|
typedef cudaError_t (*FUNPTR_cudaGetDeviceCount)(int* device_count);
|
||
|
typedef cudaError_t (*FUNPTR_cudaGetDeviceProperties)(
|
||
|
struct cudaDeviceProp *props, int device
|
||
|
);
|
||
|
typedef cudaError_t (*FUNPTR_cudaDeviceReset)(void);
|
||
|
typedef cudaError_t (*FUNPTR_cudaMalloc)(void **devPtr, size_t size);
|
||
|
typedef cudaError_t (*FUNPTR_cudaFree)(void* devPtr);
|
||
|
typedef cudaError_t (*FUNPTR_cudaMemcpy)(
|
||
|
void *dst, const void *src, size_t count, enum cudaMemcpyKind kind
|
||
|
);
|
||
|
|
||
|
#define find_cuda_func(name) \
|
||
|
if( \
|
||
|
( \
|
||
|
CUDA()->name = \
|
||
|
(FUNPTR_##name)nlFindFunction( \
|
||
|
CUDA()->DLL_cudart,#name \
|
||
|
) \
|
||
|
) == NULL \
|
||
|
) { \
|
||
|
nlError("nlInitExtension_CUDA: function not found", #name); \
|
||
|
return NL_FALSE; \
|
||
|
}
|
||
|
|
||
|
|
||
|
/* CUBLAS structures and functions */
|
||
|
|
||
|
|
||
|
struct cublasContext;
|
||
|
typedef struct cublasContext *cublasHandle_t;
|
||
|
typedef int cublasStatus_t;
|
||
|
|
||
|
typedef enum {
|
||
|
CUBLAS_SIDE_LEFT =0,
|
||
|
CUBLAS_SIDE_RIGHT=1
|
||
|
} cublasSideMode_t;
|
||
|
|
||
|
typedef enum {
|
||
|
CUBLAS_FILL_MODE_LOWER=0,
|
||
|
CUBLAS_FILL_MODE_UPPER=1
|
||
|
} cublasFillMode_t;
|
||
|
|
||
|
typedef enum {
|
||
|
CUBLAS_OP_N=0,
|
||
|
CUBLAS_OP_T=1,
|
||
|
CUBLAS_OP_C=2
|
||
|
} cublasOperation_t;
|
||
|
|
||
|
typedef enum {
|
||
|
CUBLAS_DIAG_NON_UNIT=0,
|
||
|
CUBLAS_DIAG_UNIT=1
|
||
|
} cublasDiagType_t;
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasCreate)(cublasHandle_t* handle);
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasDestroy)(cublasHandle_t handle);
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasGetVersion)(
|
||
|
cublasHandle_t handle, int* version
|
||
|
);
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasDdot)(
|
||
|
cublasHandle_t handle, int n,
|
||
|
const double *x, int incx,
|
||
|
const double *y, int incy,
|
||
|
double *result
|
||
|
);
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasDcopy)(
|
||
|
cublasHandle_t handle, int n,
|
||
|
const double *x, int incx,
|
||
|
const double *y, int incy
|
||
|
);
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasDaxpy)(
|
||
|
cublasHandle_t handle, int n,
|
||
|
const double* alpha,
|
||
|
const double *x, int incx,
|
||
|
const double *y, int incy
|
||
|
);
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasDscal)(
|
||
|
cublasHandle_t handle, int n,
|
||
|
const double* alpha,
|
||
|
const double *x, int incx
|
||
|
);
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasDnrm2)(
|
||
|
cublasHandle_t handle, int n,
|
||
|
const double *x, int incx,
|
||
|
double* result
|
||
|
);
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasDdgmm)(
|
||
|
cublasHandle_t handle, cublasSideMode_t mode,
|
||
|
int m, int n,
|
||
|
const double* A, int lda,
|
||
|
const double* x, int incx,
|
||
|
double* C, int ldc
|
||
|
);
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasDgemv)(
|
||
|
cublasHandle_t handle,
|
||
|
cublasOperation_t trans,
|
||
|
int m,
|
||
|
int n,
|
||
|
const double *alpha,
|
||
|
const double *A,
|
||
|
int lda,
|
||
|
const double *x,
|
||
|
int incx,
|
||
|
const double *beta,
|
||
|
double *y,
|
||
|
int incy
|
||
|
);
|
||
|
|
||
|
typedef cublasStatus_t (*FUNPTR_cublasDtpsv)(
|
||
|
cublasHandle_t handle, cublasFillMode_t uplo,
|
||
|
cublasOperation_t trans, cublasDiagType_t diag,
|
||
|
int n, const double *AP,
|
||
|
double* x, int incx
|
||
|
);
|
||
|
|
||
|
|
||
|
#define find_cublas_func(name) \
|
||
|
if( \
|
||
|
( \
|
||
|
CUDA()->name = \
|
||
|
(FUNPTR_##name)nlFindFunction( \
|
||
|
CUDA()->DLL_cublas,#name "_v2" \
|
||
|
) \
|
||
|
) == NULL \
|
||
|
) { \
|
||
|
nlError("nlInitExtension_CUDA: function not found", #name); \
|
||
|
return NL_FALSE; \
|
||
|
}
|
||
|
|
||
|
#define find_cublas_func_v1(name) \
|
||
|
if( \
|
||
|
( \
|
||
|
CUDA()->name = \
|
||
|
(FUNPTR_##name)nlFindFunction( \
|
||
|
CUDA()->DLL_cublas,#name \
|
||
|
) \
|
||
|
) == NULL \
|
||
|
) { \
|
||
|
nlError("nlInitExtension_CUDA: function not found", #name); \
|
||
|
return NL_FALSE; \
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/* CUSPARSE structures and functions */
|
||
|
|
||
|
|
||
|
struct cusparseContext;
|
||
|
typedef struct cusparseContext *cusparseHandle_t;
|
||
|
typedef int cusparseStatus_t;
|
||
|
struct cusparseMatDescr;
|
||
|
typedef struct cusparseMatDescr *cusparseMatDescr_t;
|
||
|
|
||
|
typedef enum {
|
||
|
CUSPARSE_MATRIX_TYPE_GENERAL = 0,
|
||
|
CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1,
|
||
|
CUSPARSE_MATRIX_TYPE_HERMITIAN = 2,
|
||
|
CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3
|
||
|
} cusparseMatrixType_t;
|
||
|
|
||
|
typedef enum {
|
||
|
CUSPARSE_INDEX_BASE_ZERO = 0,
|
||
|
CUSPARSE_INDEX_BASE_ONE = 1
|
||
|
} cusparseIndexBase_t;
|
||
|
|
||
|
typedef enum {
|
||
|
CUSPARSE_OPERATION_NON_TRANSPOSE = 0,
|
||
|
CUSPARSE_OPERATION_TRANSPOSE = 1,
|
||
|
CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
|
||
|
} cusparseOperation_t;
|
||
|
|
||
|
struct cusparseHybMat;
|
||
|
typedef struct cusparseHybMat *cusparseHybMat_t;
|
||
|
|
||
|
typedef enum {
|
||
|
CUSPARSE_HYB_PARTITION_AUTO = 0,
|
||
|
CUSPARSE_HYB_PARTITION_USER = 1,
|
||
|
CUSPARSE_HYB_PARTITION_MAX = 2
|
||
|
} cusparseHybPartition_t;
|
||
|
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseCreate)(cusparseHandle_t* handle);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseDestroy)(cusparseHandle_t handle);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseGetVersion)(
|
||
|
cusparseHandle_t handle, int* version
|
||
|
);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseCreateMatDescr)(
|
||
|
cusparseMatDescr_t* descr
|
||
|
);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseDestroyMatDescr)(
|
||
|
cusparseMatDescr_t descr
|
||
|
);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseSetMatType)(
|
||
|
cusparseMatDescr_t descr, cusparseMatrixType_t mtype
|
||
|
);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseSetMatIndexBase)(
|
||
|
cusparseMatDescr_t descr, cusparseIndexBase_t ibase
|
||
|
);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseDcsrmv)(
|
||
|
cusparseHandle_t handle, cusparseOperation_t transA,
|
||
|
int m, int n, int nnz,
|
||
|
const double *alpha, const cusparseMatDescr_t descrA,
|
||
|
const double *csrSortedValA, const int *csrSortedRowPtrA,
|
||
|
const int *csrSortedColIndA, const double *x,
|
||
|
const double *beta, double *y
|
||
|
);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseCreateHybMat)(
|
||
|
cusparseHybMat_t *hybA
|
||
|
);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseDestroyHybMat)(
|
||
|
cusparseHybMat_t hybA
|
||
|
);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseDcsr2hyb)(
|
||
|
cusparseHandle_t handle,
|
||
|
int m,
|
||
|
int n,
|
||
|
const cusparseMatDescr_t descrA,
|
||
|
const double *csrSortedValA,
|
||
|
const int *csrSortedRowPtrA,
|
||
|
const int *csrSortedColIndA,
|
||
|
cusparseHybMat_t hybA,
|
||
|
int userEllWidth,
|
||
|
cusparseHybPartition_t partitionType
|
||
|
);
|
||
|
typedef cusparseStatus_t (*FUNPTR_cusparseDhybmv)(
|
||
|
cusparseHandle_t handle,
|
||
|
cusparseOperation_t transA,
|
||
|
const double *alpha,
|
||
|
const cusparseMatDescr_t descrA,
|
||
|
const cusparseHybMat_t hybA,
|
||
|
const double *x,
|
||
|
const double *beta,
|
||
|
double *y
|
||
|
);
|
||
|
|
||
|
#define find_cusparse_func(name) \
|
||
|
if( \
|
||
|
( \
|
||
|
CUDA()->name = \
|
||
|
(FUNPTR_##name)nlFindFunction( \
|
||
|
CUDA()->DLL_cusparse,#name \
|
||
|
) \
|
||
|
) == NULL \
|
||
|
) { \
|
||
|
nlError("nlInitExtension_CUDA : function not found", #name); \
|
||
|
return NL_FALSE; \
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLdll DLL_cudart;
|
||
|
FUNPTR_cudaGetDeviceCount cudaGetDeviceCount;
|
||
|
FUNPTR_cudaGetDeviceProperties cudaGetDeviceProperties;
|
||
|
FUNPTR_cudaDeviceReset cudaDeviceReset;
|
||
|
FUNPTR_cudaMalloc cudaMalloc;
|
||
|
FUNPTR_cudaFree cudaFree;
|
||
|
FUNPTR_cudaMemcpy cudaMemcpy;
|
||
|
|
||
|
NLdll DLL_cublas;
|
||
|
cublasHandle_t HNDL_cublas;
|
||
|
FUNPTR_cublasCreate cublasCreate;
|
||
|
FUNPTR_cublasDestroy cublasDestroy;
|
||
|
FUNPTR_cublasGetVersion cublasGetVersion;
|
||
|
FUNPTR_cublasDdot cublasDdot;
|
||
|
FUNPTR_cublasDcopy cublasDcopy;
|
||
|
FUNPTR_cublasDaxpy cublasDaxpy;
|
||
|
FUNPTR_cublasDscal cublasDscal;
|
||
|
FUNPTR_cublasDnrm2 cublasDnrm2;
|
||
|
FUNPTR_cublasDdgmm cublasDdgmm;
|
||
|
FUNPTR_cublasDgemv cublasDgemv;
|
||
|
FUNPTR_cublasDtpsv cublasDtpsv;
|
||
|
|
||
|
NLdll DLL_cusparse;
|
||
|
cusparseHandle_t HNDL_cusparse;
|
||
|
FUNPTR_cusparseCreate cusparseCreate;
|
||
|
FUNPTR_cusparseDestroy cusparseDestroy;
|
||
|
FUNPTR_cusparseGetVersion cusparseGetVersion;
|
||
|
FUNPTR_cusparseCreateMatDescr cusparseCreateMatDescr;
|
||
|
FUNPTR_cusparseDestroyMatDescr cusparseDestroyMatDescr;
|
||
|
FUNPTR_cusparseSetMatType cusparseSetMatType;
|
||
|
FUNPTR_cusparseSetMatIndexBase cusparseSetMatIndexBase;
|
||
|
FUNPTR_cusparseDcsrmv cusparseDcsrmv;
|
||
|
FUNPTR_cusparseCreateHybMat cusparseCreateHybMat;
|
||
|
FUNPTR_cusparseDestroyHybMat cusparseDestroyHybMat;
|
||
|
FUNPTR_cusparseDcsr2hyb cusparseDcsr2hyb;
|
||
|
FUNPTR_cusparseDhybmv cusparseDhybmv;
|
||
|
|
||
|
int devID;
|
||
|
} CUDAContext;
|
||
|
|
||
|
static CUDAContext* CUDA() {
|
||
|
static CUDAContext context;
|
||
|
static NLboolean init = NL_FALSE;
|
||
|
if(!init) {
|
||
|
init = NL_TRUE;
|
||
|
memset(&context, 0, sizeof(context));
|
||
|
}
|
||
|
return &context;
|
||
|
}
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized_CUDA() {
|
||
|
if(
|
||
|
CUDA()->DLL_cudart == NULL ||
|
||
|
CUDA()->cudaGetDeviceCount == NULL ||
|
||
|
CUDA()->cudaGetDeviceProperties == NULL ||
|
||
|
CUDA()->cudaDeviceReset == NULL ||
|
||
|
CUDA()->cudaMalloc == NULL ||
|
||
|
CUDA()->cudaFree == NULL ||
|
||
|
CUDA()->cudaMemcpy == NULL ||
|
||
|
|
||
|
CUDA()->DLL_cublas == NULL ||
|
||
|
CUDA()->HNDL_cublas == NULL ||
|
||
|
CUDA()->cublasCreate == NULL ||
|
||
|
CUDA()->cublasDestroy == NULL ||
|
||
|
CUDA()->cublasGetVersion == NULL ||
|
||
|
CUDA()->cublasDdot == NULL ||
|
||
|
CUDA()->cublasDcopy == NULL ||
|
||
|
CUDA()->cublasDaxpy == NULL ||
|
||
|
CUDA()->cublasDscal == NULL ||
|
||
|
CUDA()->cublasDnrm2 == NULL ||
|
||
|
CUDA()->cublasDdgmm == NULL ||
|
||
|
|
||
|
CUDA()->DLL_cusparse == NULL ||
|
||
|
CUDA()->HNDL_cusparse == NULL ||
|
||
|
CUDA()->cusparseCreate == NULL ||
|
||
|
CUDA()->cusparseDestroy == NULL ||
|
||
|
CUDA()->cusparseGetVersion == NULL ||
|
||
|
CUDA()->cusparseCreateMatDescr == NULL ||
|
||
|
CUDA()->cusparseDestroyMatDescr == NULL ||
|
||
|
CUDA()->cusparseSetMatType == NULL ||
|
||
|
CUDA()->cusparseSetMatIndexBase == NULL ||
|
||
|
CUDA()->cusparseDcsrmv == NULL ||
|
||
|
CUDA()->cusparseCreateHybMat == NULL ||
|
||
|
CUDA()->cusparseDestroyHybMat == NULL ||
|
||
|
CUDA()->cusparseDcsr2hyb == NULL ||
|
||
|
CUDA()->cusparseDhybmv == NULL
|
||
|
) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
static void nlTerminateExtension_CUDA(void) {
|
||
|
if(!nlExtensionIsInitialized_CUDA()) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
CUDA()->cusparseDestroy(CUDA()->HNDL_cusparse);
|
||
|
nlCloseDLL(CUDA()->DLL_cusparse);
|
||
|
|
||
|
CUDA()->cublasDestroy(CUDA()->HNDL_cublas);
|
||
|
nlCloseDLL(CUDA()->DLL_cublas);
|
||
|
|
||
|
CUDA()->cudaDeviceReset();
|
||
|
nlCloseDLL(CUDA()->DLL_cudart);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
static int ConvertSMVer2Cores(int major, int minor) {
|
||
|
/* Defines for GPU Architecture types (using the SM version
|
||
|
to determine the # of cores per SM */
|
||
|
typedef struct {
|
||
|
int SM; /* 0xMm (hexadecimal notation),
|
||
|
M = SM Major version,
|
||
|
and m = SM minor version */
|
||
|
int Cores;
|
||
|
} sSMtoCores;
|
||
|
|
||
|
sSMtoCores nGpuArchCoresPerSM[] = {
|
||
|
{ 0x10, 8 }, /* Tesla Generation (SM 1.0) G80 class */
|
||
|
{ 0x11, 8 }, /* Tesla Generation (SM 1.1) G8x class */
|
||
|
{ 0x12, 8 }, /* Tesla Generation (SM 1.2) G9x class */
|
||
|
{ 0x13, 8 }, /* Tesla Generation (SM 1.3) GT200 class */
|
||
|
{ 0x20, 32 }, /* Fermi Generation (SM 2.0) GF100 class */
|
||
|
{ 0x21, 48 }, /* Fermi Generation (SM 2.1) GF10x class */
|
||
|
{ 0x30, 192}, /* Kepler Generation (SM 3.0) GK10x class */
|
||
|
{ 0x35, 192}, /* Kepler Generation (SM 3.5) GK11x class */
|
||
|
{ 0x50, 128}, /* Maxwell Generation (SM 5.0) GM10x class
|
||
|
(yes, #cores smaller than with 3.x) */
|
||
|
{ 0x52, 128}, /* Maxwell Generation (SM 5.2) GM20x class */
|
||
|
{ 0x60, 64 }, /* Pascal Generation (SM 6.0) GP100,GP102
|
||
|
(yes, 64, but GP100 has superfast double precision) */
|
||
|
{ 0x61, 128}, /* Pascal Generation (SM 6.1) GP104 class
|
||
|
(but FP64 runs as 1/32 FP32 speed) */
|
||
|
{ -1, -1 }
|
||
|
};
|
||
|
int index = 0;
|
||
|
while (nGpuArchCoresPerSM[index].SM != -1) {
|
||
|
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
|
||
|
return nGpuArchCoresPerSM[index].Cores;
|
||
|
}
|
||
|
index++;
|
||
|
}
|
||
|
/* If we don't find the values, we default use the
|
||
|
previous one to run properly */
|
||
|
nl_printf(
|
||
|
"MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
|
||
|
major, minor, nGpuArchCoresPerSM[8].Cores
|
||
|
);
|
||
|
return nGpuArchCoresPerSM[8].Cores;
|
||
|
}
|
||
|
|
||
|
static int getBestDeviceID() {
|
||
|
int current_device = 0, sm_per_multiproc = 0;
|
||
|
int max_compute_perf = 0, max_perf_device = 0;
|
||
|
int device_count = 0, best_SM_arch = 0;
|
||
|
int compute_perf = 0;
|
||
|
struct cudaDeviceProp deviceProp;
|
||
|
CUDA()->cudaGetDeviceCount(&device_count);
|
||
|
/* Find the best major SM Architecture GPU device */
|
||
|
while (current_device < device_count) {
|
||
|
CUDA()->cudaGetDeviceProperties(&deviceProp, current_device);
|
||
|
/* If this GPU is not running on Compute Mode prohibited,
|
||
|
then we can add it to the list */
|
||
|
if (deviceProp.computeMode != cudaComputeModeProhibited) {
|
||
|
if (deviceProp.major > 0 && deviceProp.major < 9999) {
|
||
|
best_SM_arch = MAX(best_SM_arch, deviceProp.major);
|
||
|
}
|
||
|
}
|
||
|
current_device++;
|
||
|
}
|
||
|
/* Find the best CUDA capable GPU device */
|
||
|
current_device = 0;
|
||
|
while (current_device < device_count) {
|
||
|
CUDA()->cudaGetDeviceProperties(&deviceProp, current_device);
|
||
|
/* If this GPU is not running on Compute Mode prohibited,
|
||
|
then we can add it to the list */
|
||
|
if (deviceProp.computeMode != cudaComputeModeProhibited) {
|
||
|
if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
|
||
|
sm_per_multiproc = 1;
|
||
|
} else {
|
||
|
sm_per_multiproc = ConvertSMVer2Cores(
|
||
|
deviceProp.major, deviceProp.minor
|
||
|
);
|
||
|
}
|
||
|
compute_perf =
|
||
|
deviceProp.multiProcessorCount *
|
||
|
sm_per_multiproc * deviceProp.clockRate;
|
||
|
if (compute_perf > max_compute_perf) {
|
||
|
/* If we find GPU with SM major > 2, search only these */
|
||
|
if (best_SM_arch > 2) {
|
||
|
/* If our device==dest_SM_arch, choose this, or else pass */
|
||
|
if (deviceProp.major == best_SM_arch) {
|
||
|
max_compute_perf = compute_perf;
|
||
|
max_perf_device = current_device;
|
||
|
}
|
||
|
} else {
|
||
|
max_compute_perf = compute_perf;
|
||
|
max_perf_device = current_device;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
++current_device;
|
||
|
}
|
||
|
|
||
|
|
||
|
return max_perf_device;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
#ifdef NL_OS_UNIX
|
||
|
# define LIBPREFIX "lib"
|
||
|
# ifdef NL_OS_APPLE
|
||
|
# define LIBEXTENSION ".dylib"
|
||
|
# else
|
||
|
# define LIBEXTENSION ".so"
|
||
|
# endif
|
||
|
#else
|
||
|
# define LIBPREFIX
|
||
|
# define LIBEXTENSION ".dll"
|
||
|
#endif
|
||
|
|
||
|
|
||
|
NLboolean nlInitExtension_CUDA(void) {
|
||
|
struct cudaDeviceProp deviceProp;
|
||
|
int cublas_version;
|
||
|
int cusparse_version;
|
||
|
NLenum flags = NL_LINK_LAZY | NL_LINK_GLOBAL;
|
||
|
if(nlCurrentContext == NULL || !nlCurrentContext->verbose) {
|
||
|
flags |= NL_LINK_QUIET;
|
||
|
}
|
||
|
|
||
|
if(nlExtensionIsInitialized_CUDA()) {
|
||
|
return NL_TRUE;
|
||
|
}
|
||
|
|
||
|
CUDA()->DLL_cudart = nlOpenDLL(
|
||
|
LIBPREFIX "cudart" LIBEXTENSION, flags
|
||
|
);
|
||
|
|
||
|
find_cuda_func(cudaGetDeviceCount);
|
||
|
find_cuda_func(cudaGetDeviceProperties);
|
||
|
find_cuda_func(cudaDeviceReset);
|
||
|
find_cuda_func(cudaMalloc);
|
||
|
find_cuda_func(cudaFree);
|
||
|
find_cuda_func(cudaMemcpy);
|
||
|
|
||
|
CUDA()->devID = getBestDeviceID();
|
||
|
|
||
|
if(CUDA()->cudaGetDeviceProperties(&deviceProp, CUDA()->devID)) {
|
||
|
nl_fprintf(stderr,"OpenNL CUDA: could not find a CUDA device\n");
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
nl_printf("OpenNL CUDA: Device ID = %d\n", CUDA()->devID);
|
||
|
nl_printf("OpenNL CUDA: Device name=%s\n", deviceProp.name);
|
||
|
nl_printf(
|
||
|
"OpenNL CUDA: Device has %d Multi-Processors, "
|
||
|
"%d cores per Multi-Processor, SM %d.%d compute capabilities\n",
|
||
|
deviceProp.multiProcessorCount,
|
||
|
ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
|
||
|
deviceProp.major, deviceProp.minor
|
||
|
);
|
||
|
|
||
|
nl_printf(
|
||
|
"OpenNL CUDA: %d kB shared mem. per block, %d per MP\n",
|
||
|
(int)(deviceProp.sharedMemPerBlock / 1024),
|
||
|
(int)(deviceProp.sharedMemPerMultiprocessor / 1024)
|
||
|
);
|
||
|
|
||
|
nl_printf(
|
||
|
"OpenNL CUDA: %d regs. per block, %d per MP\n",
|
||
|
deviceProp.regsPerBlock,
|
||
|
deviceProp.regsPerMultiprocessor
|
||
|
);
|
||
|
|
||
|
nl_printf(
|
||
|
"OpenNL CUDA: warpsize=%d\n",
|
||
|
deviceProp.warpSize
|
||
|
);
|
||
|
|
||
|
if ((deviceProp.major * 0x10 + deviceProp.minor) < 0x11) {
|
||
|
nl_fprintf(stderr, "OpenNL CUDA requires a minimum CUDA compute 1.1 capability\n");
|
||
|
CUDA()->cudaDeviceReset();
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
CUDA()->DLL_cublas = nlOpenDLL(
|
||
|
LIBPREFIX "cublas" LIBEXTENSION, flags
|
||
|
);
|
||
|
|
||
|
find_cublas_func(cublasCreate);
|
||
|
find_cublas_func(cublasDestroy);
|
||
|
find_cublas_func(cublasGetVersion);
|
||
|
find_cublas_func(cublasDdot);
|
||
|
find_cublas_func(cublasDaxpy);
|
||
|
find_cublas_func(cublasDcopy);
|
||
|
find_cublas_func(cublasDscal);
|
||
|
find_cublas_func(cublasDnrm2);
|
||
|
find_cublas_func(cublasDgemv);
|
||
|
find_cublas_func(cublasDtpsv);
|
||
|
find_cublas_func_v1(cublasDdgmm);
|
||
|
|
||
|
|
||
|
if(CUDA()->cublasCreate(&CUDA()->HNDL_cublas)) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
if(CUDA()->cublasGetVersion(CUDA()->HNDL_cublas, &cublas_version)) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
nl_printf("OpenNL CUDA: cublas version = %d\n", cublas_version);
|
||
|
|
||
|
CUDA()->DLL_cusparse = nlOpenDLL(
|
||
|
LIBPREFIX "cusparse" LIBEXTENSION, flags
|
||
|
);
|
||
|
find_cusparse_func(cusparseCreate);
|
||
|
find_cusparse_func(cusparseDestroy);
|
||
|
find_cusparse_func(cusparseGetVersion);
|
||
|
find_cusparse_func(cusparseCreateMatDescr);
|
||
|
find_cusparse_func(cusparseDestroyMatDescr);
|
||
|
find_cusparse_func(cusparseSetMatType);
|
||
|
find_cusparse_func(cusparseSetMatIndexBase);
|
||
|
find_cusparse_func(cusparseDcsrmv);
|
||
|
find_cusparse_func(cusparseCreateHybMat);
|
||
|
find_cusparse_func(cusparseDestroyHybMat);
|
||
|
find_cusparse_func(cusparseDcsr2hyb);
|
||
|
find_cusparse_func(cusparseDhybmv);
|
||
|
|
||
|
if(CUDA()->cusparseCreate(&CUDA()->HNDL_cusparse)) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
if(CUDA()->cusparseGetVersion(CUDA()->HNDL_cusparse, &cusparse_version)) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
nl_printf("OpenNL CUDA: cusparse version = %d\n", cusparse_version);
|
||
|
|
||
|
if(!nlExtensionIsInitialized_CUDA()) {
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
atexit(nlTerminateExtension_CUDA);
|
||
|
return NL_TRUE;
|
||
|
|
||
|
}
|
||
|
|
||
|
static void nlCUDACheckImpl(int status, int line) {
|
||
|
if(status != 0) {
|
||
|
nl_fprintf(stderr,"nl_cuda.c:%d fatal error %d\n",line, status);
|
||
|
CUDA()->cudaDeviceReset();
|
||
|
exit(-1);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#define nlCUDACheck(status) nlCUDACheckImpl(status, __LINE__)
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
NLuint n;
|
||
|
NLenum type;
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
cusparseMatDescr_t descr;
|
||
|
NLuint nnz;
|
||
|
int* colind;
|
||
|
int* rowptr;
|
||
|
double* val;
|
||
|
cusparseHybMat_t hyb;
|
||
|
} NLCUDASparseMatrix;
|
||
|
|
||
|
|
||
|
static void nlCRSMatrixCUDADestroyCRS(NLCUDASparseMatrix* Mcuda) {
|
||
|
if(Mcuda->colind != NULL) {
|
||
|
nlCUDACheck(CUDA()->cudaFree(Mcuda->colind));
|
||
|
Mcuda->colind = NULL;
|
||
|
}
|
||
|
if(Mcuda->rowptr != NULL) {
|
||
|
nlCUDACheck(CUDA()->cudaFree(Mcuda->rowptr));
|
||
|
Mcuda->rowptr = NULL;
|
||
|
}
|
||
|
if(Mcuda->val != NULL) {
|
||
|
nlCUDACheck(CUDA()->cudaFree(Mcuda->val));
|
||
|
Mcuda->val = NULL;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlCRSMatrixCUDADestroy(NLCUDASparseMatrix* Mcuda) {
|
||
|
if(Mcuda->hyb != NULL) {
|
||
|
nlCUDACheck(CUDA()->cusparseDestroyHybMat(Mcuda->hyb));
|
||
|
}
|
||
|
nlCRSMatrixCUDADestroyCRS(Mcuda);
|
||
|
nlCUDACheck(CUDA()->cusparseDestroyMatDescr(Mcuda->descr));
|
||
|
memset(Mcuda, 0, sizeof(*Mcuda));
|
||
|
}
|
||
|
|
||
|
static void nlCRSMatrixCUDAMult(
|
||
|
NLCUDASparseMatrix* Mcuda, const double* x, double* y
|
||
|
) {
|
||
|
const double one = 1;
|
||
|
const double zero = 0;
|
||
|
if(Mcuda->hyb != NULL) {
|
||
|
nlCUDACheck(
|
||
|
CUDA()->cusparseDhybmv(
|
||
|
CUDA()->HNDL_cusparse,
|
||
|
CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||
|
&one,
|
||
|
Mcuda->descr,
|
||
|
Mcuda->hyb,
|
||
|
x,
|
||
|
&zero,
|
||
|
y
|
||
|
)
|
||
|
);
|
||
|
} else {
|
||
|
nlCUDACheck(
|
||
|
CUDA()->cusparseDcsrmv(
|
||
|
CUDA()->HNDL_cusparse,
|
||
|
CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||
|
(int)Mcuda->m,
|
||
|
(int)Mcuda->n,
|
||
|
(int)Mcuda->nnz,
|
||
|
&one,
|
||
|
Mcuda->descr,
|
||
|
Mcuda->val,
|
||
|
Mcuda->rowptr,
|
||
|
Mcuda->colind,
|
||
|
x,
|
||
|
&zero,
|
||
|
y
|
||
|
)
|
||
|
);
|
||
|
}
|
||
|
nlCUDABlas()->flops += (NLulong)(2*Mcuda->nnz);
|
||
|
}
|
||
|
|
||
|
NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M_in) {
|
||
|
NLCUDASparseMatrix* Mcuda = NL_NEW(NLCUDASparseMatrix);
|
||
|
NLCRSMatrix* M = (NLCRSMatrix*)(M_in);
|
||
|
size_t colind_sz, rowptr_sz, val_sz;
|
||
|
nl_assert(M_in->type == NL_MATRIX_CRS);
|
||
|
nlCUDACheck(CUDA()->cusparseCreateMatDescr(&Mcuda->descr));
|
||
|
if(M->symmetric_storage) {
|
||
|
nlCUDACheck(CUDA()->cusparseSetMatType(
|
||
|
Mcuda->descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)
|
||
|
);
|
||
|
} else {
|
||
|
nlCUDACheck(CUDA()->cusparseSetMatType(
|
||
|
Mcuda->descr, CUSPARSE_MATRIX_TYPE_GENERAL)
|
||
|
);
|
||
|
}
|
||
|
nlCUDACheck(CUDA()->cusparseSetMatIndexBase(
|
||
|
Mcuda->descr, CUSPARSE_INDEX_BASE_ZERO)
|
||
|
);
|
||
|
Mcuda->m = M->m;
|
||
|
Mcuda->n = M->n;
|
||
|
Mcuda->nnz = nlCRSMatrixNNZ(M);
|
||
|
|
||
|
colind_sz = (size_t)Mcuda->nnz*sizeof(int);
|
||
|
rowptr_sz = (size_t)(Mcuda->m+1)*sizeof(int);
|
||
|
val_sz = (size_t)Mcuda->nnz*sizeof(double);
|
||
|
|
||
|
nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->colind,colind_sz));
|
||
|
nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->rowptr,rowptr_sz));
|
||
|
nlCUDACheck(CUDA()->cudaMalloc((void**)&Mcuda->val,val_sz));
|
||
|
nlCUDACheck(CUDA()->cudaMemcpy(
|
||
|
Mcuda->colind, M->colind, colind_sz, cudaMemcpyHostToDevice)
|
||
|
);
|
||
|
nlCUDACheck(CUDA()->cudaMemcpy(
|
||
|
Mcuda->rowptr, M->rowptr, rowptr_sz, cudaMemcpyHostToDevice)
|
||
|
);
|
||
|
nlCUDACheck(CUDA()->cudaMemcpy(
|
||
|
Mcuda->val, M->val, val_sz, cudaMemcpyHostToDevice)
|
||
|
);
|
||
|
Mcuda->hyb=NULL;
|
||
|
if(!M->symmetric_storage) {
|
||
|
nlCUDACheck(CUDA()->cusparseCreateHybMat(&Mcuda->hyb));
|
||
|
nlCUDACheck(CUDA()->cusparseDcsr2hyb(
|
||
|
CUDA()->HNDL_cusparse,
|
||
|
(int)M->m,
|
||
|
(int)M->n,
|
||
|
Mcuda->descr,
|
||
|
Mcuda->val,
|
||
|
Mcuda->rowptr,
|
||
|
Mcuda->colind,
|
||
|
Mcuda->hyb,
|
||
|
0,
|
||
|
CUSPARSE_HYB_PARTITION_AUTO
|
||
|
));
|
||
|
/* We no longer need the CRS part */
|
||
|
nlCRSMatrixCUDADestroyCRS(Mcuda);
|
||
|
}
|
||
|
Mcuda->type=NL_MATRIX_OTHER;
|
||
|
Mcuda->destroy_func=(NLDestroyMatrixFunc)nlCRSMatrixCUDADestroy;
|
||
|
Mcuda->mult_func=(NLMultMatrixVectorFunc)nlCRSMatrixCUDAMult;
|
||
|
return (NLMatrix)Mcuda;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
typedef struct {
|
||
|
NLuint m;
|
||
|
NLuint n;
|
||
|
NLenum type;
|
||
|
NLDestroyMatrixFunc destroy_func;
|
||
|
NLMultMatrixVectorFunc mult_func;
|
||
|
double* val;
|
||
|
} NLDiagonalMatrixCUDA;
|
||
|
|
||
|
static void nlDiagonalMatrixCUDADestroy(NLDiagonalMatrixCUDA* Mcuda) {
|
||
|
nlCUDACheck(CUDA()->cudaFree(Mcuda->val));
|
||
|
memset(Mcuda, 0, sizeof(*Mcuda));
|
||
|
}
|
||
|
|
||
|
static void nlDiagonalMatrixCUDAMult(
|
||
|
NLDiagonalMatrixCUDA* Mcuda, const double* x, double* y
|
||
|
) {
|
||
|
int N = (int)Mcuda->n;
|
||
|
/*
|
||
|
* vector x vector component-wise product implemented
|
||
|
* using diagonal matrix x matrix function.
|
||
|
*/
|
||
|
nlCUDACheck(CUDA()->cublasDdgmm(
|
||
|
CUDA()->HNDL_cublas, CUBLAS_SIDE_LEFT,
|
||
|
N, 1,
|
||
|
x, N,
|
||
|
Mcuda->val, 1,
|
||
|
y, N
|
||
|
));
|
||
|
nlCUDABlas()->flops += (NLulong)N;
|
||
|
}
|
||
|
|
||
|
static NLMatrix nlDiagonalMatrixCUDANew(const double* diag, NLuint n) {
|
||
|
NLDiagonalMatrixCUDA* Mcuda = NL_NEW(NLDiagonalMatrixCUDA);
|
||
|
Mcuda->m = n;
|
||
|
Mcuda->n = n;
|
||
|
Mcuda->type = NL_MATRIX_OTHER;
|
||
|
nlCUDACheck(CUDA()->cudaMalloc(
|
||
|
(void**)&Mcuda->val, n*sizeof(double))
|
||
|
);
|
||
|
nlCUDACheck(CUDA()->cudaMemcpy(
|
||
|
Mcuda->val, diag, n*sizeof(double), cudaMemcpyHostToDevice)
|
||
|
);
|
||
|
Mcuda->destroy_func=(NLDestroyMatrixFunc)nlDiagonalMatrixCUDADestroy;
|
||
|
Mcuda->mult_func=(NLMultMatrixVectorFunc)nlDiagonalMatrixCUDAMult;
|
||
|
return (NLMatrix)Mcuda;
|
||
|
}
|
||
|
|
||
|
NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M_in) {
|
||
|
NLuint N = M_in->n;
|
||
|
NLuint i,jj;
|
||
|
double* diag = NULL;
|
||
|
NLMatrix result = NULL;
|
||
|
NLCRSMatrix* M = (NLCRSMatrix*)(M_in);
|
||
|
nl_assert(M_in->type == NL_MATRIX_CRS);
|
||
|
diag = NL_NEW_ARRAY(double,N);
|
||
|
for(i=0; i<N; ++i) {
|
||
|
for(jj=M->rowptr[i]; jj<M->rowptr[i+1]; ++jj) {
|
||
|
if(M->colind[jj] == i) {
|
||
|
diag[i] = M->val[jj];
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
for(i=0; i<N; ++i) {
|
||
|
diag[i] = ((diag[i] == 0.0) ? 1.0 : 1.0 / diag[i]);
|
||
|
}
|
||
|
result = nlDiagonalMatrixCUDANew(diag, N);
|
||
|
NL_DELETE_ARRAY(diag);
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
static void* cuda_blas_malloc(
|
||
|
NLBlas_t blas, NLmemoryType type, size_t size
|
||
|
) {
|
||
|
void* result = NULL;
|
||
|
blas->used_ram[type] += (NLulong)size;
|
||
|
blas->max_used_ram[type] = MAX(
|
||
|
blas->max_used_ram[type],blas->used_ram[type]
|
||
|
);
|
||
|
if(type == NL_HOST_MEMORY) {
|
||
|
result = malloc(size);
|
||
|
} else {
|
||
|
nlCUDACheck(CUDA()->cudaMalloc(&result,size));
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
static void cuda_blas_free(
|
||
|
NLBlas_t blas, NLmemoryType type, size_t size, void* ptr
|
||
|
) {
|
||
|
blas->used_ram[type] -= (NLulong)size;
|
||
|
if(type == NL_HOST_MEMORY) {
|
||
|
free(ptr);
|
||
|
} else {
|
||
|
nlCUDACheck(CUDA()->cudaFree(ptr));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void cuda_blas_memcpy(
|
||
|
NLBlas_t blas,
|
||
|
void* to, NLmemoryType to_type,
|
||
|
void* from, NLmemoryType from_type,
|
||
|
size_t size
|
||
|
) {
|
||
|
enum cudaMemcpyKind kind = cudaMemcpyDefault;
|
||
|
nl_arg_used(blas);
|
||
|
if(from_type == NL_HOST_MEMORY) {
|
||
|
if(to_type == NL_HOST_MEMORY) {
|
||
|
kind = cudaMemcpyHostToHost;
|
||
|
} else {
|
||
|
kind = cudaMemcpyHostToDevice;
|
||
|
}
|
||
|
} else {
|
||
|
if(to_type == NL_HOST_MEMORY) {
|
||
|
kind = cudaMemcpyDeviceToHost;
|
||
|
} else {
|
||
|
kind = cudaMemcpyDeviceToDevice;
|
||
|
}
|
||
|
}
|
||
|
nlCUDACheck(CUDA()->cudaMemcpy(to, from, size, kind));
|
||
|
}
|
||
|
|
||
|
static void cuda_blas_dcopy(
|
||
|
NLBlas_t blas, int n, const double *x, int incx, double *y, int incy
|
||
|
) {
|
||
|
nl_arg_used(blas);
|
||
|
CUDA()->cublasDcopy(CUDA()->HNDL_cublas,n,x,incx,y,incy);
|
||
|
}
|
||
|
|
||
|
static double cuda_blas_ddot(
|
||
|
NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy
|
||
|
) {
|
||
|
double result = 0.0;
|
||
|
blas->flops += (NLulong)(2*n);
|
||
|
CUDA()->cublasDdot(CUDA()->HNDL_cublas,n,x,incx,y,incy,&result);
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
static double cuda_blas_dnrm2(
|
||
|
NLBlas_t blas, int n, const double *x, int incx
|
||
|
) {
|
||
|
double result = 0.0;
|
||
|
blas->flops += (NLulong)(2*n);
|
||
|
CUDA()->cublasDnrm2(CUDA()->HNDL_cublas,n,x,incx,&result);
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
static void cuda_blas_daxpy(
|
||
|
NLBlas_t blas, int n,
|
||
|
double a, const double *x, int incx, double *y, int incy
|
||
|
) {
|
||
|
blas->flops += (NLulong)(2*n);
|
||
|
CUDA()->cublasDaxpy(CUDA()->HNDL_cublas,n,&a,x,incx,y,incy);
|
||
|
}
|
||
|
|
||
|
static void cuda_blas_dscal(
|
||
|
NLBlas_t blas, int n, double a, double *x, int incx
|
||
|
) {
|
||
|
blas->flops += (NLulong)n;
|
||
|
CUDA()->cublasDscal(CUDA()->HNDL_cublas,n,&a,x,incx);
|
||
|
}
|
||
|
|
||
|
|
||
|
static void cuda_blas_dgemv(
|
||
|
NLBlas_t blas, MatrixTranspose trans, int m, int n, double alpha,
|
||
|
const double *A, int ldA, const double *x, int incx,
|
||
|
double beta, double *y, int incy
|
||
|
) {
|
||
|
nl_arg_used(blas);
|
||
|
/* TODO: update FLOPS */
|
||
|
CUDA()->cublasDgemv(
|
||
|
CUDA()->HNDL_cublas, (cublasOperation_t)trans,
|
||
|
m, n, &alpha, A, ldA, x, incx, &beta, y, incy
|
||
|
);
|
||
|
}
|
||
|
|
||
|
static void cuda_blas_dtpsv(
|
||
|
NLBlas_t blas, MatrixTriangle uplo, MatrixTranspose trans,
|
||
|
MatrixUnitTriangular diag, int n, const double *AP,
|
||
|
double *x, int incx
|
||
|
) {
|
||
|
nl_arg_used(blas);
|
||
|
/* TODO: update FLOPS */
|
||
|
CUDA()->cublasDtpsv(
|
||
|
CUDA()->HNDL_cublas,
|
||
|
(cublasFillMode_t)uplo,
|
||
|
(cublasOperation_t)trans,
|
||
|
(cublasDiagType_t)diag, n,
|
||
|
AP, x, incx
|
||
|
);
|
||
|
}
|
||
|
|
||
|
|
||
|
NLBlas_t nlCUDABlas() {
|
||
|
static NLboolean initialized = NL_FALSE;
|
||
|
static struct NLBlas blas;
|
||
|
if(!initialized) {
|
||
|
memset(&blas, 0, sizeof(blas));
|
||
|
blas.has_unified_memory = NL_FALSE;
|
||
|
blas.Malloc = cuda_blas_malloc;
|
||
|
blas.Free = cuda_blas_free;
|
||
|
blas.Memcpy = cuda_blas_memcpy;
|
||
|
blas.Dcopy = cuda_blas_dcopy;
|
||
|
blas.Ddot = cuda_blas_ddot;
|
||
|
blas.Dnrm2 = cuda_blas_dnrm2;
|
||
|
blas.Daxpy = cuda_blas_daxpy;
|
||
|
blas.Dscal = cuda_blas_dscal;
|
||
|
blas.Dgemv = cuda_blas_dgemv;
|
||
|
blas.Dtpsv = cuda_blas_dtpsv;
|
||
|
nlBlasResetStats(&blas);
|
||
|
initialized = NL_TRUE;
|
||
|
}
|
||
|
return &blas;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/******* extracted from nl_api.c *******/
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
static NLSparseMatrix* nlGetCurrentSparseMatrix() {
|
||
|
NLSparseMatrix* result = NULL;
|
||
|
switch(nlCurrentContext->matrix_mode) {
|
||
|
case NL_STIFFNESS_MATRIX: {
|
||
|
nl_assert(nlCurrentContext->M != NULL);
|
||
|
nl_assert(nlCurrentContext->M->type == NL_MATRIX_SPARSE_DYNAMIC);
|
||
|
result = (NLSparseMatrix*)(nlCurrentContext->M);
|
||
|
} break;
|
||
|
case NL_MASS_MATRIX: {
|
||
|
nl_assert(nlCurrentContext->B != NULL);
|
||
|
nl_assert(nlCurrentContext->B->type == NL_MATRIX_SPARSE_DYNAMIC);
|
||
|
result = (NLSparseMatrix*)(nlCurrentContext->B);
|
||
|
} break;
|
||
|
default:
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
NLboolean nlInitExtension(const char* extension) {
|
||
|
if(!strcmp(extension, "SUPERLU")) {
|
||
|
return nlInitExtension_SUPERLU();
|
||
|
} else if(!strcmp(extension, "CHOLMOD")) {
|
||
|
return nlInitExtension_CHOLMOD();
|
||
|
} else if(!strcmp(extension, "ARPACK")) {
|
||
|
/*
|
||
|
* SUPERLU is needed by OpenNL's ARPACK driver
|
||
|
* (factorizes the matrix for the shift-invert spectral
|
||
|
* transform).
|
||
|
*/
|
||
|
return nlInitExtension_SUPERLU() && nlInitExtension_ARPACK();
|
||
|
} else if(!strcmp(extension, "MKL")) {
|
||
|
return nlInitExtension_MKL();
|
||
|
} else if(!strcmp(extension, "CUDA")) {
|
||
|
return nlInitExtension_CUDA();
|
||
|
}
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
NLboolean nlExtensionIsInitialized(const char* extension) {
|
||
|
if(!strcmp(extension, "SUPERLU")) {
|
||
|
return nlExtensionIsInitialized_SUPERLU();
|
||
|
} else if(!strcmp(extension, "CHOLMOD")) {
|
||
|
return nlExtensionIsInitialized_CHOLMOD();
|
||
|
} else if(!strcmp(extension, "ARPACK")) {
|
||
|
/*
|
||
|
* SUPERLU is needed by OpenNL's ARPACK driver
|
||
|
* (factorizes the matrix for the shift-invert spectral
|
||
|
* transform).
|
||
|
*/
|
||
|
return nlExtensionIsInitialized_SUPERLU() && nlExtensionIsInitialized_ARPACK();
|
||
|
} else if(!strcmp(extension, "MKL")) {
|
||
|
return nlExtensionIsInitialized_MKL();
|
||
|
} else if(!strcmp(extension, "CUDA")) {
|
||
|
return nlExtensionIsInitialized_CUDA();
|
||
|
}
|
||
|
return NL_FALSE;
|
||
|
}
|
||
|
|
||
|
void nlInitialize(int argc, char** argv) {
|
||
|
int i=0;
|
||
|
char* ptr=NULL;
|
||
|
char extension[255];
|
||
|
/* Find all the arguments with the form:
|
||
|
* nl:<extension>=true|false
|
||
|
* and try to activate the corresponding extensions.
|
||
|
*/
|
||
|
for(i=1; i<argc; ++i) {
|
||
|
ptr = strstr(argv[i],"=true");
|
||
|
if(!strncmp(argv[i], "nl:", 3) &&
|
||
|
(strlen(argv[i]) > 3) &&
|
||
|
(ptr != NULL)) {
|
||
|
strncpy(extension, argv[i]+3, (size_t)(ptr-argv[i]-3));
|
||
|
extension[(size_t)(ptr-argv[i]-3)] = '\0';
|
||
|
if(nlInitExtension(extension)) {
|
||
|
nl_fprintf(stdout,"OpenNL %s: initialized\n", extension);
|
||
|
} else {
|
||
|
nl_fprintf(stderr,"OpenNL %s: could not initialize\n", extension);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Get/Set parameters */
|
||
|
|
||
|
void nlSolverParameterd(NLenum pname, NLdouble param) {
|
||
|
nlCheckState(NL_STATE_INITIAL);
|
||
|
switch(pname) {
|
||
|
case NL_THRESHOLD: {
|
||
|
nl_assert(param >= 0);
|
||
|
nlCurrentContext->threshold = (NLdouble)param;
|
||
|
nlCurrentContext->threshold_defined = NL_TRUE;
|
||
|
} break;
|
||
|
case NL_OMEGA: {
|
||
|
nl_range_assert(param,1.0,2.0);
|
||
|
nlCurrentContext->omega = (NLdouble)param;
|
||
|
} break;
|
||
|
default: {
|
||
|
nlError("nlSolverParameterd","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlSolverParameteri(NLenum pname, NLint param) {
|
||
|
nlCheckState(NL_STATE_INITIAL);
|
||
|
switch(pname) {
|
||
|
case NL_SOLVER: {
|
||
|
nlCurrentContext->solver = (NLenum)param;
|
||
|
} break;
|
||
|
case NL_NB_VARIABLES: {
|
||
|
nl_assert(param > 0);
|
||
|
nlCurrentContext->nb_variables = (NLuint)param;
|
||
|
} break;
|
||
|
case NL_NB_SYSTEMS: {
|
||
|
nl_assert(param > 0);
|
||
|
nlCurrentContext->nb_systems = (NLuint)param;
|
||
|
} break;
|
||
|
case NL_LEAST_SQUARES: {
|
||
|
nlCurrentContext->least_squares = (NLboolean)param;
|
||
|
} break;
|
||
|
case NL_MAX_ITERATIONS: {
|
||
|
nl_assert(param > 0);
|
||
|
nlCurrentContext->max_iterations = (NLuint)param;
|
||
|
nlCurrentContext->max_iterations_defined = NL_TRUE;
|
||
|
} break;
|
||
|
case NL_SYMMETRIC: {
|
||
|
nlCurrentContext->symmetric = (NLboolean)param;
|
||
|
} break;
|
||
|
case NL_INNER_ITERATIONS: {
|
||
|
nl_assert(param > 0);
|
||
|
nlCurrentContext->inner_iterations = (NLuint)param;
|
||
|
} break;
|
||
|
case NL_PRECONDITIONER: {
|
||
|
nlCurrentContext->preconditioner = (NLuint)param;
|
||
|
nlCurrentContext->preconditioner_defined = NL_TRUE;
|
||
|
} break;
|
||
|
default: {
|
||
|
nlError("nlSolverParameteri","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlGetBooleanv(NLenum pname, NLboolean* params) {
|
||
|
switch(pname) {
|
||
|
case NL_LEAST_SQUARES: {
|
||
|
*params = nlCurrentContext->least_squares;
|
||
|
} break;
|
||
|
case NL_SYMMETRIC: {
|
||
|
*params = nlCurrentContext->symmetric;
|
||
|
} break;
|
||
|
default: {
|
||
|
nlError("nlGetBooleanv","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlGetDoublev(NLenum pname, NLdouble* params) {
|
||
|
switch(pname) {
|
||
|
case NL_THRESHOLD: {
|
||
|
*params = nlCurrentContext->threshold;
|
||
|
} break;
|
||
|
case NL_OMEGA: {
|
||
|
*params = nlCurrentContext->omega;
|
||
|
} break;
|
||
|
case NL_ERROR: {
|
||
|
*params = nlCurrentContext->error;
|
||
|
} break;
|
||
|
case NL_ELAPSED_TIME: {
|
||
|
*params = nlCurrentContext->elapsed_time;
|
||
|
} break;
|
||
|
case NL_GFLOPS: {
|
||
|
if(nlCurrentContext->elapsed_time == 0) {
|
||
|
*params = 0.0;
|
||
|
} else {
|
||
|
*params = (NLdouble)(nlCurrentContext->flops) /
|
||
|
(nlCurrentContext->elapsed_time * 1e9);
|
||
|
}
|
||
|
} break;
|
||
|
default: {
|
||
|
nlError("nlGetDoublev","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlGetIntegerv(NLenum pname, NLint* params) {
|
||
|
switch(pname) {
|
||
|
case NL_SOLVER: {
|
||
|
*params = (NLint)(nlCurrentContext->solver);
|
||
|
} break;
|
||
|
case NL_NB_VARIABLES: {
|
||
|
*params = (NLint)(nlCurrentContext->nb_variables);
|
||
|
} break;
|
||
|
case NL_NB_SYSTEMS: {
|
||
|
*params = (NLint)(nlCurrentContext->nb_systems);
|
||
|
} break;
|
||
|
case NL_LEAST_SQUARES: {
|
||
|
*params = (NLint)(nlCurrentContext->least_squares);
|
||
|
} break;
|
||
|
case NL_MAX_ITERATIONS: {
|
||
|
*params = (NLint)(nlCurrentContext->max_iterations);
|
||
|
} break;
|
||
|
case NL_SYMMETRIC: {
|
||
|
*params = (NLint)(nlCurrentContext->symmetric);
|
||
|
} break;
|
||
|
case NL_USED_ITERATIONS: {
|
||
|
*params = (NLint)(nlCurrentContext->used_iterations);
|
||
|
} break;
|
||
|
case NL_PRECONDITIONER: {
|
||
|
*params = (NLint)(nlCurrentContext->preconditioner);
|
||
|
} break;
|
||
|
case NL_NNZ: {
|
||
|
*params = (NLint)(nlMatrixNNZ(nlCurrentContext->M));
|
||
|
} break;
|
||
|
default: {
|
||
|
nlError("nlGetIntegerv","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Enable / Disable */
|
||
|
|
||
|
void nlEnable(NLenum pname) {
|
||
|
switch(pname) {
|
||
|
case NL_NORMALIZE_ROWS: {
|
||
|
nl_assert(nlCurrentContext->state != NL_STATE_ROW);
|
||
|
nlCurrentContext->normalize_rows = NL_TRUE;
|
||
|
} break;
|
||
|
case NL_VERBOSE: {
|
||
|
nlCurrentContext->verbose = NL_TRUE;
|
||
|
} break;
|
||
|
case NL_VARIABLES_BUFFER: {
|
||
|
nlCurrentContext->user_variable_buffers = NL_TRUE;
|
||
|
} break;
|
||
|
default: {
|
||
|
nlError("nlEnable","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlDisable(NLenum pname) {
|
||
|
switch(pname) {
|
||
|
case NL_NORMALIZE_ROWS: {
|
||
|
nl_assert(nlCurrentContext->state != NL_STATE_ROW);
|
||
|
nlCurrentContext->normalize_rows = NL_FALSE;
|
||
|
} break;
|
||
|
case NL_VERBOSE: {
|
||
|
nlCurrentContext->verbose = NL_FALSE;
|
||
|
} break;
|
||
|
case NL_VARIABLES_BUFFER: {
|
||
|
nlCurrentContext->user_variable_buffers = NL_FALSE;
|
||
|
} break;
|
||
|
default: {
|
||
|
nlError("nlDisable","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
NLboolean nlIsEnabled(NLenum pname) {
|
||
|
NLboolean result = NL_FALSE;
|
||
|
switch(pname) {
|
||
|
case NL_NORMALIZE_ROWS: {
|
||
|
result = nlCurrentContext->normalize_rows;
|
||
|
} break;
|
||
|
case NL_VERBOSE: {
|
||
|
result = nlCurrentContext->verbose;
|
||
|
} break;
|
||
|
case NL_VARIABLES_BUFFER: {
|
||
|
result = nlCurrentContext->user_variable_buffers;
|
||
|
} break;
|
||
|
default: {
|
||
|
nlError("nlIsEnables","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* NL functions */
|
||
|
|
||
|
void nlSetFunction(NLenum pname, NLfunc param) {
|
||
|
switch(pname) {
|
||
|
case NL_FUNC_SOLVER:
|
||
|
nlCurrentContext->solver_func = (NLSolverFunc)(param);
|
||
|
nlCurrentContext->solver = NL_SOLVER_USER;
|
||
|
break;
|
||
|
case NL_FUNC_MATRIX:
|
||
|
nlDeleteMatrix(nlCurrentContext->M);
|
||
|
nlCurrentContext->M = nlMatrixNewFromFunction(
|
||
|
nlCurrentContext->n, nlCurrentContext->n,
|
||
|
(NLMatrixFunc)param
|
||
|
);
|
||
|
break;
|
||
|
case NL_FUNC_PRECONDITIONER:
|
||
|
nlDeleteMatrix(nlCurrentContext->P);
|
||
|
nlCurrentContext->P = nlMatrixNewFromFunction(
|
||
|
nlCurrentContext->n, nlCurrentContext->n,
|
||
|
(NLMatrixFunc)param
|
||
|
);
|
||
|
nlCurrentContext->preconditioner = NL_PRECOND_USER;
|
||
|
break;
|
||
|
case NL_FUNC_PROGRESS:
|
||
|
nlCurrentContext->progress_func = (NLProgressFunc)(param);
|
||
|
break;
|
||
|
default:
|
||
|
nlError("nlSetFunction","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlGetFunction(NLenum pname, NLfunc* param) {
|
||
|
switch(pname) {
|
||
|
case NL_FUNC_SOLVER:
|
||
|
*param = (NLfunc)(nlCurrentContext->solver_func);
|
||
|
break;
|
||
|
case NL_FUNC_MATRIX:
|
||
|
*param = (NLfunc)(nlMatrixGetFunction(nlCurrentContext->M));
|
||
|
break;
|
||
|
case NL_FUNC_PRECONDITIONER:
|
||
|
*param = (NLfunc)(nlMatrixGetFunction(nlCurrentContext->P));
|
||
|
break;
|
||
|
default:
|
||
|
nlError("nlGetFunction","Invalid parameter");
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Get/Set Lock/Unlock variables */
|
||
|
|
||
|
void nlSetVariable(NLuint index, NLdouble value) {
|
||
|
nlCheckState(NL_STATE_SYSTEM);
|
||
|
nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1);
|
||
|
NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[0],index) = value;
|
||
|
}
|
||
|
|
||
|
void nlMultiSetVariable(NLuint index, NLuint system, NLdouble value) {
|
||
|
nlCheckState(NL_STATE_SYSTEM);
|
||
|
nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables-1);
|
||
|
nl_debug_range_assert(system, 0, nlCurrentContext->nb_systems-1);
|
||
|
NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[system],index) = value;
|
||
|
}
|
||
|
|
||
|
NLdouble nlGetVariable(NLuint index) {
|
||
|
nl_assert(nlCurrentContext->state != NL_STATE_INITIAL);
|
||
|
nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1);
|
||
|
return NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[0],index);
|
||
|
}
|
||
|
|
||
|
NLdouble nlMultiGetVariable(NLuint index, NLuint system) {
|
||
|
nl_assert(nlCurrentContext->state != NL_STATE_INITIAL);
|
||
|
nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables-1);
|
||
|
nl_debug_range_assert(system, 0, nlCurrentContext->nb_systems-1);
|
||
|
return NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[system],index);
|
||
|
}
|
||
|
|
||
|
|
||
|
void nlLockVariable(NLuint index) {
|
||
|
nlCheckState(NL_STATE_SYSTEM);
|
||
|
nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1);
|
||
|
nlCurrentContext->variable_is_locked[index] = NL_TRUE;
|
||
|
}
|
||
|
|
||
|
void nlUnlockVariable(NLuint index) {
|
||
|
nlCheckState(NL_STATE_SYSTEM);
|
||
|
nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1);
|
||
|
nlCurrentContext->variable_is_locked[index] = NL_FALSE;
|
||
|
}
|
||
|
|
||
|
NLboolean nlVariableIsLocked(NLuint index) {
|
||
|
nl_assert(nlCurrentContext->state != NL_STATE_INITIAL);
|
||
|
nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1);
|
||
|
return nlCurrentContext->variable_is_locked[index];
|
||
|
}
|
||
|
|
||
|
|
||
|
/* System construction */
|
||
|
|
||
|
static void nlVariablesToVector() {
|
||
|
NLuint n=nlCurrentContext->n;
|
||
|
NLuint k,i,index;
|
||
|
NLdouble value;
|
||
|
|
||
|
nl_assert(nlCurrentContext->x != NULL);
|
||
|
for(k=0; k<nlCurrentContext->nb_systems; ++k) {
|
||
|
for(i=0; i<nlCurrentContext->nb_variables; ++i) {
|
||
|
if(!nlCurrentContext->variable_is_locked[i]) {
|
||
|
index = nlCurrentContext->variable_index[i];
|
||
|
nl_assert(index < nlCurrentContext->n);
|
||
|
value = NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],i);
|
||
|
nlCurrentContext->x[index+k*n] = value;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlVectorToVariables() {
|
||
|
NLuint n=nlCurrentContext->n;
|
||
|
NLuint k,i,index;
|
||
|
NLdouble value;
|
||
|
|
||
|
nl_assert(nlCurrentContext->x != NULL);
|
||
|
for(k=0; k<nlCurrentContext->nb_systems; ++k) {
|
||
|
for(i=0; i<nlCurrentContext->nb_variables; ++i) {
|
||
|
if(!nlCurrentContext->variable_is_locked[i]) {
|
||
|
index = nlCurrentContext->variable_index[i];
|
||
|
nl_assert(index < nlCurrentContext->n);
|
||
|
value = nlCurrentContext->x[index+k*n];
|
||
|
NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],i) = value;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
static void nlBeginSystem() {
|
||
|
NLuint k;
|
||
|
|
||
|
nlTransition(NL_STATE_INITIAL, NL_STATE_SYSTEM);
|
||
|
nl_assert(nlCurrentContext->nb_variables > 0);
|
||
|
|
||
|
nlCurrentContext->variable_buffer = NL_NEW_ARRAY(
|
||
|
NLBufferBinding, nlCurrentContext->nb_systems
|
||
|
);
|
||
|
|
||
|
if(nlCurrentContext->user_variable_buffers) {
|
||
|
nlCurrentContext->variable_value = NULL;
|
||
|
} else {
|
||
|
nlCurrentContext->variable_value = NL_NEW_ARRAY(
|
||
|
NLdouble,
|
||
|
nlCurrentContext->nb_variables * nlCurrentContext->nb_systems
|
||
|
);
|
||
|
for(k=0; k<nlCurrentContext->nb_systems; ++k) {
|
||
|
nlCurrentContext->variable_buffer[k].base_address =
|
||
|
nlCurrentContext->variable_value +
|
||
|
k * nlCurrentContext->nb_variables;
|
||
|
nlCurrentContext->variable_buffer[k].stride = sizeof(NLdouble);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
nlCurrentContext->variable_is_locked = NL_NEW_ARRAY(
|
||
|
NLboolean, nlCurrentContext->nb_variables
|
||
|
);
|
||
|
nlCurrentContext->variable_index = NL_NEW_ARRAY(
|
||
|
NLuint, nlCurrentContext->nb_variables
|
||
|
);
|
||
|
}
|
||
|
|
||
|
static void nlEndSystem() {
|
||
|
nlTransition(NL_STATE_MATRIX_CONSTRUCTED, NL_STATE_SYSTEM_CONSTRUCTED);
|
||
|
}
|
||
|
|
||
|
static void nlInitializeM() {
|
||
|
NLuint i;
|
||
|
NLuint n = 0;
|
||
|
NLenum storage = NL_MATRIX_STORE_ROWS;
|
||
|
|
||
|
|
||
|
for(i=0; i<nlCurrentContext->nb_variables; i++) {
|
||
|
if(!nlCurrentContext->variable_is_locked[i]) {
|
||
|
nlCurrentContext->variable_index[i] = n;
|
||
|
n++;
|
||
|
} else {
|
||
|
nlCurrentContext->variable_index[i] = (NLuint)~0;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
nlCurrentContext->n = n;
|
||
|
|
||
|
/*
|
||
|
* If the user trusts OpenNL and has left solver as NL_SOLVER_DEFAULT,
|
||
|
* then we setup reasonable parameters for him.
|
||
|
*/
|
||
|
if(nlCurrentContext->solver == NL_SOLVER_DEFAULT) {
|
||
|
if(nlCurrentContext->least_squares || nlCurrentContext->symmetric) {
|
||
|
nlCurrentContext->solver = NL_CG;
|
||
|
if(!nlCurrentContext->preconditioner_defined) {
|
||
|
nlCurrentContext->preconditioner = NL_PRECOND_JACOBI;
|
||
|
}
|
||
|
} else {
|
||
|
nlCurrentContext->solver = NL_BICGSTAB;
|
||
|
}
|
||
|
if(!nlCurrentContext->max_iterations_defined) {
|
||
|
nlCurrentContext->max_iterations = n*5;
|
||
|
}
|
||
|
if(!nlCurrentContext->threshold_defined) {
|
||
|
nlCurrentContext->threshold = 1e-6;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/* SSOR preconditioner requires rows and columns */
|
||
|
if(nlCurrentContext->preconditioner == NL_PRECOND_SSOR) {
|
||
|
storage = (storage | NL_MATRIX_STORE_COLUMNS);
|
||
|
}
|
||
|
|
||
|
/* a least squares problem results in a symmetric matrix */
|
||
|
if(nlCurrentContext->least_squares) {
|
||
|
nlCurrentContext->symmetric = NL_TRUE;
|
||
|
}
|
||
|
|
||
|
if(
|
||
|
nlCurrentContext->symmetric &&
|
||
|
nlCurrentContext->preconditioner == NL_PRECOND_SSOR
|
||
|
) {
|
||
|
/*
|
||
|
* For now, only used with SSOR preconditioner, because
|
||
|
* for other modes it is either unsupported (SUPERLU) or
|
||
|
* causes performance loss (non-parallel sparse SpMV)
|
||
|
*/
|
||
|
storage = (storage | NL_MATRIX_STORE_SYMMETRIC);
|
||
|
}
|
||
|
|
||
|
nlCurrentContext->M = (NLMatrix)(NL_NEW(NLSparseMatrix));
|
||
|
nlSparseMatrixConstruct(
|
||
|
(NLSparseMatrix*)(nlCurrentContext->M), n, n, storage
|
||
|
);
|
||
|
|
||
|
nlCurrentContext->x = NL_NEW_ARRAY(
|
||
|
NLdouble, n*nlCurrentContext->nb_systems
|
||
|
);
|
||
|
nlCurrentContext->b = NL_NEW_ARRAY(
|
||
|
NLdouble, n*nlCurrentContext->nb_systems
|
||
|
);
|
||
|
|
||
|
nlVariablesToVector();
|
||
|
|
||
|
nlRowColumnConstruct(&nlCurrentContext->af);
|
||
|
nlRowColumnConstruct(&nlCurrentContext->al);
|
||
|
|
||
|
nlCurrentContext->right_hand_side = NL_NEW_ARRAY(
|
||
|
double, nlCurrentContext->nb_systems
|
||
|
);
|
||
|
nlCurrentContext->current_row = 0;
|
||
|
}
|
||
|
|
||
|
static void nlEndMatrix() {
|
||
|
nlTransition(NL_STATE_MATRIX, NL_STATE_MATRIX_CONSTRUCTED);
|
||
|
|
||
|
nlRowColumnClear(&nlCurrentContext->af);
|
||
|
nlRowColumnClear(&nlCurrentContext->al);
|
||
|
|
||
|
if(!nlCurrentContext->least_squares) {
|
||
|
nl_assert(
|
||
|
nlCurrentContext->ij_coefficient_called || (
|
||
|
nlCurrentContext->current_row ==
|
||
|
nlCurrentContext->n
|
||
|
)
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlBeginRow() {
|
||
|
nlTransition(NL_STATE_MATRIX, NL_STATE_ROW);
|
||
|
nlRowColumnZero(&nlCurrentContext->af);
|
||
|
nlRowColumnZero(&nlCurrentContext->al);
|
||
|
}
|
||
|
|
||
|
static void nlScaleRow(NLdouble s) {
|
||
|
NLRowColumn* af = &nlCurrentContext->af;
|
||
|
NLRowColumn* al = &nlCurrentContext->al;
|
||
|
NLuint nf = af->size;
|
||
|
NLuint nl = al->size;
|
||
|
NLuint i,k;
|
||
|
for(i=0; i<nf; i++) {
|
||
|
af->coeff[i].value *= s;
|
||
|
}
|
||
|
for(i=0; i<nl; i++) {
|
||
|
al->coeff[i].value *= s;
|
||
|
}
|
||
|
for(k=0; k<nlCurrentContext->nb_systems; ++k) {
|
||
|
nlCurrentContext->right_hand_side[k] *= s;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void nlNormalizeRow(NLdouble weight) {
|
||
|
NLRowColumn* af = &nlCurrentContext->af;
|
||
|
NLRowColumn* al = &nlCurrentContext->al;
|
||
|
NLuint nf = af->size;
|
||
|
NLuint nl = al->size;
|
||
|
NLuint i;
|
||
|
NLdouble norm = 0.0;
|
||
|
for(i=0; i<nf; i++) {
|
||
|
norm += af->coeff[i].value * af->coeff[i].value;
|
||
|
}
|
||
|
for(i=0; i<nl; i++) {
|
||
|
norm += al->coeff[i].value * al->coeff[i].value;
|
||
|
}
|
||
|
norm = sqrt(norm);
|
||
|
nlScaleRow(weight / norm);
|
||
|
}
|
||
|
|
||
|
static void nlEndRow() {
|
||
|
NLRowColumn* af = &nlCurrentContext->af;
|
||
|
NLRowColumn* al = &nlCurrentContext->al;
|
||
|
NLSparseMatrix* M = nlGetCurrentSparseMatrix();
|
||
|
NLdouble* b = nlCurrentContext->b;
|
||
|
NLuint nf = af->size;
|
||
|
NLuint nl = al->size;
|
||
|
NLuint n = nlCurrentContext->n;
|
||
|
NLuint current_row = nlCurrentContext->current_row;
|
||
|
NLuint i,j,jj;
|
||
|
NLdouble S;
|
||
|
NLuint k;
|
||
|
nlTransition(NL_STATE_ROW, NL_STATE_MATRIX);
|
||
|
|
||
|
if(nlCurrentContext->normalize_rows) {
|
||
|
nlNormalizeRow(nlCurrentContext->row_scaling);
|
||
|
} else if(nlCurrentContext->row_scaling != 1.0) {
|
||
|
nlScaleRow(nlCurrentContext->row_scaling);
|
||
|
}
|
||
|
/*
|
||
|
* if least_squares : we want to solve
|
||
|
* A'A x = A'b
|
||
|
*/
|
||
|
|
||
|
if(nlCurrentContext->least_squares) {
|
||
|
for(i=0; i<nf; i++) {
|
||
|
for(j=0; j<nf; j++) {
|
||
|
nlSparseMatrixAdd(
|
||
|
M, af->coeff[i].index, af->coeff[j].index,
|
||
|
af->coeff[i].value * af->coeff[j].value
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
for(k=0; k<nlCurrentContext->nb_systems; ++k) {
|
||
|
S = -nlCurrentContext->right_hand_side[k];
|
||
|
for(jj=0; jj<nl; ++jj) {
|
||
|
j = al->coeff[jj].index;
|
||
|
S += al->coeff[jj].value *
|
||
|
NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],j);
|
||
|
}
|
||
|
for(jj=0; jj<nf; jj++) {
|
||
|
b[ k*n+af->coeff[jj].index ] -= af->coeff[jj].value * S;
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
for(jj=0; jj<nf; ++jj) {
|
||
|
nlSparseMatrixAdd(
|
||
|
M, current_row, af->coeff[jj].index, af->coeff[jj].value
|
||
|
);
|
||
|
}
|
||
|
for(k=0; k<nlCurrentContext->nb_systems; ++k) {
|
||
|
b[k*n+current_row] = nlCurrentContext->right_hand_side[k];
|
||
|
for(jj=0; jj<nl; ++jj) {
|
||
|
j = al->coeff[jj].index;
|
||
|
b[k*n+current_row] -= al->coeff[jj].value *
|
||
|
NL_BUFFER_ITEM(nlCurrentContext->variable_buffer[k],j);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
nlCurrentContext->current_row++;
|
||
|
for(k=0; k<nlCurrentContext->nb_systems; ++k) {
|
||
|
nlCurrentContext->right_hand_side[k] = 0.0;
|
||
|
}
|
||
|
nlCurrentContext->row_scaling = 1.0;
|
||
|
}
|
||
|
|
||
|
void nlCoefficient(NLuint index, NLdouble value) {
|
||
|
nlCheckState(NL_STATE_ROW);
|
||
|
nl_debug_range_assert(index, 0, nlCurrentContext->nb_variables - 1);
|
||
|
if(nlCurrentContext->variable_is_locked[index]) {
|
||
|
/*
|
||
|
* Note: in al, indices are NLvariable indices,
|
||
|
* within [0..nb_variables-1]
|
||
|
*/
|
||
|
nlRowColumnAppend(&(nlCurrentContext->al), index, value);
|
||
|
} else {
|
||
|
/*
|
||
|
* Note: in af, indices are system indices,
|
||
|
* within [0..n-1]
|
||
|
*/
|
||
|
nlRowColumnAppend(
|
||
|
&(nlCurrentContext->af),
|
||
|
nlCurrentContext->variable_index[index], value
|
||
|
);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlAddIJCoefficient(NLuint i, NLuint j, NLdouble value) {
|
||
|
NLSparseMatrix* M = nlGetCurrentSparseMatrix();
|
||
|
nlCheckState(NL_STATE_MATRIX);
|
||
|
nl_debug_range_assert(i, 0, nlCurrentContext->nb_variables - 1);
|
||
|
nl_debug_range_assert(j, 0, nlCurrentContext->nb_variables - 1);
|
||
|
#ifdef NL_DEBUG
|
||
|
for(NLuint i=0; i<nlCurrentContext->nb_variables; ++i) {
|
||
|
nl_debug_assert(!nlCurrentContext->variable_is_locked[i]);
|
||
|
}
|
||
|
#endif
|
||
|
nlSparseMatrixAdd(M, i, j, value);
|
||
|
nlCurrentContext->ij_coefficient_called = NL_TRUE;
|
||
|
}
|
||
|
|
||
|
void nlAddIRightHandSide(NLuint i, NLdouble value) {
|
||
|
nlCheckState(NL_STATE_MATRIX);
|
||
|
nl_debug_range_assert(i, 0, nlCurrentContext->nb_variables - 1);
|
||
|
#ifdef NL_DEBUG
|
||
|
for(NLuint i=0; i<nlCurrentContext->nb_variables; ++i) {
|
||
|
nl_debug_assert(!nlCurrentContext->variable_is_locked[i]);
|
||
|
}
|
||
|
#endif
|
||
|
nlCurrentContext->b[i] += value;
|
||
|
nlCurrentContext->ij_coefficient_called = NL_TRUE;
|
||
|
}
|
||
|
|
||
|
void nlMultiAddIRightHandSide(NLuint i, NLuint k, NLdouble value) {
|
||
|
NLuint n = nlCurrentContext->n;
|
||
|
nlCheckState(NL_STATE_MATRIX);
|
||
|
nl_debug_range_assert(i, 0, nlCurrentContext->nb_variables - 1);
|
||
|
nl_debug_range_assert(k, 0, nlCurrentContext->nb_systems - 1);
|
||
|
#ifdef NL_DEBUG
|
||
|
for(NLuint i=0; i<nlCurrentContext->nb_variables; ++i) {
|
||
|
nl_debug_assert(!nlCurrentContext->variable_is_locked[i]);
|
||
|
}
|
||
|
#endif
|
||
|
nlCurrentContext->b[i + k*n] += value;
|
||
|
nlCurrentContext->ij_coefficient_called = NL_TRUE;
|
||
|
}
|
||
|
|
||
|
void nlRightHandSide(NLdouble value) {
|
||
|
nlCurrentContext->right_hand_side[0] = value;
|
||
|
}
|
||
|
|
||
|
void nlMultiRightHandSide(NLuint k, NLdouble value) {
|
||
|
nl_debug_range_assert(k, 0, nlCurrentContext->nb_systems - 1);
|
||
|
nlCurrentContext->right_hand_side[k] = value;
|
||
|
}
|
||
|
|
||
|
void nlRowScaling(NLdouble value) {
|
||
|
nlCheckState(NL_STATE_MATRIX);
|
||
|
nlCurrentContext->row_scaling = value;
|
||
|
}
|
||
|
|
||
|
void nlBegin(NLenum prim) {
|
||
|
switch(prim) {
|
||
|
case NL_SYSTEM: {
|
||
|
nlBeginSystem();
|
||
|
} break;
|
||
|
case NL_MATRIX: {
|
||
|
nlTransition(NL_STATE_SYSTEM, NL_STATE_MATRIX);
|
||
|
if(
|
||
|
nlCurrentContext->matrix_mode == NL_STIFFNESS_MATRIX &&
|
||
|
nlCurrentContext->M == NULL
|
||
|
) {
|
||
|
nlInitializeM();
|
||
|
}
|
||
|
} break;
|
||
|
case NL_ROW: {
|
||
|
nlBeginRow();
|
||
|
} break;
|
||
|
default: {
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlEnd(NLenum prim) {
|
||
|
switch(prim) {
|
||
|
case NL_SYSTEM: {
|
||
|
nlEndSystem();
|
||
|
} break;
|
||
|
case NL_MATRIX: {
|
||
|
nlEndMatrix();
|
||
|
} break;
|
||
|
case NL_ROW: {
|
||
|
nlEndRow();
|
||
|
} break;
|
||
|
default: {
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/* nlSolve() driver routine */
|
||
|
|
||
|
NLboolean nlSolve() {
|
||
|
NLboolean result;
|
||
|
nlCheckState(NL_STATE_SYSTEM_CONSTRUCTED);
|
||
|
nlCurrentContext->start_time = nlCurrentTime();
|
||
|
nlCurrentContext->elapsed_time = 0.0;
|
||
|
nlCurrentContext->flops = 0;
|
||
|
result = nlCurrentContext->solver_func();
|
||
|
nlVectorToVariables();
|
||
|
nlCurrentContext->elapsed_time = nlCurrentTime() - nlCurrentContext->start_time;
|
||
|
nlTransition(NL_STATE_SYSTEM_CONSTRUCTED, NL_STATE_SOLVED);
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
void nlUpdateRightHandSide(NLdouble* values) {
|
||
|
/*
|
||
|
* If we are in the solved state, get back to the
|
||
|
* constructed state.
|
||
|
*/
|
||
|
nl_assert(nlCurrentContext->nb_systems == 1);
|
||
|
if(nlCurrentContext->state == NL_STATE_SOLVED) {
|
||
|
nlTransition(NL_STATE_SOLVED, NL_STATE_SYSTEM_CONSTRUCTED);
|
||
|
}
|
||
|
nlCheckState(NL_STATE_SYSTEM_CONSTRUCTED);
|
||
|
memcpy(nlCurrentContext->x, values, nlCurrentContext->n * sizeof(double));
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Buffers management */
|
||
|
|
||
|
void nlBindBuffer(
|
||
|
NLenum buffer, NLuint k, void* addr, NLuint stride
|
||
|
) {
|
||
|
nlCheckState(NL_STATE_SYSTEM);
|
||
|
nl_assert(nlIsEnabled(buffer));
|
||
|
nl_assert(buffer == NL_VARIABLES_BUFFER);
|
||
|
nl_assert(k<nlCurrentContext->nb_systems);
|
||
|
if(stride == 0) {
|
||
|
stride = sizeof(NLdouble);
|
||
|
}
|
||
|
nlCurrentContext->variable_buffer[k].base_address = addr;
|
||
|
nlCurrentContext->variable_buffer[k].stride = stride;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Eigen solver */
|
||
|
|
||
|
void nlMatrixMode(NLenum matrix) {
|
||
|
NLuint n = 0;
|
||
|
NLuint i;
|
||
|
nl_assert(
|
||
|
nlCurrentContext->state == NL_STATE_SYSTEM ||
|
||
|
nlCurrentContext->state == NL_STATE_MATRIX_CONSTRUCTED
|
||
|
);
|
||
|
nlCurrentContext->state = NL_STATE_SYSTEM;
|
||
|
nlCurrentContext->matrix_mode = matrix;
|
||
|
nlCurrentContext->current_row = 0;
|
||
|
nlCurrentContext->ij_coefficient_called = NL_FALSE;
|
||
|
switch(matrix) {
|
||
|
case NL_STIFFNESS_MATRIX: {
|
||
|
/* Stiffness matrix is already constructed. */
|
||
|
} break ;
|
||
|
case NL_MASS_MATRIX: {
|
||
|
if(nlCurrentContext->B == NULL) {
|
||
|
for(i=0; i<nlCurrentContext->nb_variables; ++i) {
|
||
|
if(!nlCurrentContext->variable_is_locked[i]) {
|
||
|
++n;
|
||
|
}
|
||
|
}
|
||
|
nlCurrentContext->B = (NLMatrix)(NL_NEW(NLSparseMatrix));
|
||
|
nlSparseMatrixConstruct(
|
||
|
(NLSparseMatrix*)(nlCurrentContext->B),
|
||
|
n, n, NL_MATRIX_STORE_ROWS
|
||
|
);
|
||
|
}
|
||
|
} break ;
|
||
|
default:
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
void nlEigenSolverParameterd(
|
||
|
NLenum pname, NLdouble val
|
||
|
) {
|
||
|
switch(pname) {
|
||
|
case NL_EIGEN_SHIFT: {
|
||
|
nlCurrentContext->eigen_shift = val;
|
||
|
} break;
|
||
|
case NL_EIGEN_THRESHOLD: {
|
||
|
nlSolverParameterd(pname, val);
|
||
|
} break;
|
||
|
default:
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlEigenSolverParameteri(
|
||
|
NLenum pname, NLint val
|
||
|
) {
|
||
|
switch(pname) {
|
||
|
case NL_EIGEN_SOLVER: {
|
||
|
nlCurrentContext->eigen_solver = (NLenum)val;
|
||
|
} break;
|
||
|
case NL_SYMMETRIC:
|
||
|
case NL_NB_VARIABLES:
|
||
|
case NL_NB_EIGENS:
|
||
|
case NL_EIGEN_MAX_ITERATIONS: {
|
||
|
nlSolverParameteri(pname, val);
|
||
|
} break;
|
||
|
default:
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void nlEigenSolve() {
|
||
|
if(nlCurrentContext->eigen_value == NULL) {
|
||
|
nlCurrentContext->eigen_value = NL_NEW_ARRAY(
|
||
|
NLdouble,nlCurrentContext->nb_systems
|
||
|
);
|
||
|
}
|
||
|
|
||
|
nlMatrixCompress(&nlCurrentContext->M);
|
||
|
if(nlCurrentContext->B != NULL) {
|
||
|
nlMatrixCompress(&nlCurrentContext->B);
|
||
|
}
|
||
|
|
||
|
switch(nlCurrentContext->eigen_solver) {
|
||
|
case NL_ARPACK_EXT:
|
||
|
nlEigenSolve_ARPACK();
|
||
|
break;
|
||
|
default:
|
||
|
nl_assert_not_reached;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
double nlGetEigenValue(NLuint i) {
|
||
|
nl_debug_assert(i < nlCurrentContext->nb_variables);
|
||
|
return nlCurrentContext->eigen_value[i];
|
||
|
}
|