diff --git a/CMakeLists.txt b/CMakeLists.txt index c82883cd93c..cb123d3bd32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -750,8 +750,15 @@ endif() add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests) -# must link libraries after target 'opmsimulators' has been defined +# must activate avx2 flags and C11 for mixed precision. +if(HAVE_AVX2_EXTENSION) + # Trying to set the standard using `set(CMAKE_C_STANDARD 11)` had no effect, still used -std=c99 + # Hence we are requesting C17. (-std=c99 -D_ISOC11_SOURCE also seems to work) + set_property(SOURCE ${AVX2_SOURCE_FILES} PROPERTY COMPILE_FLAGS "${AVX2_FLAGS}" APPEND_STRING) + set_property(TARGET opmsimulators PROPERTY C_STANDARD 17) +endif() +# must link libraries after target 'opmsimulators' has been defined if(CUDA_FOUND) if (NOT USE_HIP) target_link_libraries(opmsimulators diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index c1254cc1f40..bca21262dde 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -262,6 +262,15 @@ list (APPEND MAIN_SOURCE_FILES opm/simulators/wells/WGState.cpp ) +if (HAVE_AVX2_EXTENSION) + set (AVX2_SOURCE_FILES + opm/simulators/linalg/mixed/bsr.c + opm/simulators/linalg/mixed/prec.c + opm/simulators/linalg/mixed/bslv.c) + list (APPEND MAIN_SOURCE_FILES + ${AVX2_SOURCE_FILES}) +endif() + if (HAVE_ECL_INPUT) list (APPEND MAIN_SOURCE_FILES opm/simulators/utils/satfunc/GasPhaseConsistencyChecks.cpp @@ -1252,6 +1261,16 @@ if (HAVE_ECL_INPUT) ) endif() +if (HAVE_AVX2_EXTENSION) + list (APPEND PUBLIC_HEADER_FILES + opm/simulators/linalg/mixed/bslv.h + opm/simulators/linalg/mixed/bsr.h + opm/simulators/linalg/mixed/prec.h + opm/simulators/linalg/mixed/vec.h + opm/simulators/linalg/mixed/wrapper.hpp + ) +endif() + if (Damaris_FOUND AND MPI_FOUND AND USE_DAMARIS_LIB) list (APPEND PUBLIC_HEADER_FILES opm/simulators/utils/DamarisKeywords.hpp diff --git a/opm-simulators-prereqs.cmake b/opm-simulators-prereqs.cmake index 58bb27ec9ef..826ae0a912e 100644 --- a/opm-simulators-prereqs.cmake +++ b/opm-simulators-prereqs.cmake @@ -6,6 +6,7 @@ set (opm-simulators_CONFIG_VAR HAVE_MPI HAVE_PETSC COMPILE_GPU_BRIDGE + HAVE_AVX2_EXTENSION HAVE_CUDA HAVE_OPENCL HAVE_OPENCL_HPP @@ -37,6 +38,9 @@ if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.30.0) set(_Boost_CONFIG_MODE CONFIG) endif() +include(CheckAVX2) +check_for_avx2() + # dependencies set (opm-simulators_DEPS # Compile with C99 support if available diff --git a/opm/models/discretization/common/fvbaselinearizer.hh b/opm/models/discretization/common/fvbaselinearizer.hh index 1e34f1a4358..c5c4e33dfe0 100644 --- a/opm/models/discretization/common/fvbaselinearizer.hh +++ b/opm/models/discretization/common/fvbaselinearizer.hh @@ -288,8 +288,55 @@ public: GlobalEqVector& residual() { return residual_; } - void setLinearizationType(LinearizationType linearizationType) - { linearizationType_ = linearizationType; } + void printVector(GlobalEqVector&, const char *name="x") + { + return; + } + + void printResidual(const char *name="r") + { + printVector(residual_); + } + + void printSparsity(const char *name="s") + { + return; + } + + void printNonzeros(const char *name="d") + { + return; + } + + void printJacobian() + { + return; + } + + void exportSystem(int idx, char *tag, const char *path="export") + { + return; + } + + void exportVector(GlobalEqVector &x, const char *tag="", const char *name="export/x") + { + printf("n = %lu\n",x.dim()); + } + + void exportSparsity(const char *path=".") + { + return; + } + + void exportNonzeros(const char *tag="", const 
char *path=".") + { + return; + } + + + void setLinearizationType(LinearizationType linearizationType){ + linearizationType_ = linearizationType; + }; const LinearizationType& getLinearizationType() const { return linearizationType_; } diff --git a/opm/models/discretization/common/tpfalinearizer.hh b/opm/models/discretization/common/tpfalinearizer.hh index 4da966e1071..0a727c43231 100644 --- a/opm/models/discretization/common/tpfalinearizer.hh +++ b/opm/models/discretization/common/tpfalinearizer.hh @@ -236,6 +236,8 @@ public: { simulatorPtr_ = nullptr; separateSparseSourceTerms_ = Parameters::Get(); + exportIndex_=-1; + exportCount_=-1; } /*! @@ -412,6 +414,201 @@ public: GlobalEqVector& residual() { return residual_; } + /*! + * \brief Print first 16 block elements of a vector. + */ + void printVector(GlobalEqVector &x, const char *name="x") + { + int count = 1; + printf("%s =\n[\n",name); + for (auto block = x.begin(); block != x.end(); block++) + { + for (auto i=block->begin(); i!=block->end(); i++) + { + printf(" %+.4e",*i); + } + printf("\n"); + count++; + if(count>16) break; + } + printf("]\n"); + } + + /*! + * \brief Print first 16 block elements of residual. + */ + void printResidual(const char *name="r") + { + printVector(residual_, name); + } + + /*! + * \brief Print sparsity pattern of the first 16 rows + * of the jacobian block matrix + */ + void printSparsity(const char *name="s") + { + auto& A = jacobian_->istlMatrix(); + + printf("nrows = %lu\n",A.N()); + printf("ncols = %lu\n",A.M()); + printf("nnz = %lu\n",A.nonzeroes()); + + printf("%s =\n[\n",name); + int count=1; + int offset=0; + for(auto row=A.begin(); row!=A.end(); row++) + { + printf("%4d: ",offset); + for(unsigned int i=0;igetsize();i++) + { + printf(" %4lu",row->getindexptr()[i]); + } + printf("\n"); + offset+=row->getsize(); + count++; + if(count>16) break; + } + printf("]\n"); + } + + /*! + * \brief Print block elements of the first 6 rows of the + * j acobian block matrix + */ + void printNonzeros(const char *name="d") + { + auto& A = jacobian_->istlMatrix(); + printf("%s =\n[\n",name); + int count=1; + for(auto row=A.begin();row!=A.end();row++) + { + for(unsigned int j=0;jgetsize();j++) + { + printf("|"); + auto mat = row->getptr()[j]; + for(auto vec=mat.begin();vec!=mat.end();vec++) + { + for(auto k=vec->begin();k!=vec->end();k++) + { + printf(" %+.4e",*k); + } + printf(" |"); + } + printf("\n"); + } + count++; + if(count>6) break; + printf("\n"); + } + printf("]\n"); + } + + /*! + * \brief Print sparsity pattern and nonzeros of jacobian block matrix + */ + void printJacobian() + { + printSparsity(); + printNonzeros(); + } + + /*! + * \brief Export blocks-sparse linear system. + */ + void exportSystem(int idx, char *tag, const char *path="export") + { + // export sparsity only once + if(exportIndex_==-1) exportSparsity(path); + + // increment indices and generate tag + exportCount_ = exportIndex_==idx ? ++exportCount_ : 0; + exportIndex_ = idx; + sprintf(tag,"_%03d_%02d",exportIndex_, exportCount_); + + printf("index = %d\n", exportIndex_); + printf("count = %d\n", exportCount_); + + // export matrix + exportNonzeros(tag,path); + + // export residual + char name[256]; + sprintf(name,"%s/r",path); + exportVector(residual_,tag,name); + } + + /*! + * \brief Export block vector. 
+ */ + void exportVector(GlobalEqVector &x, const char *tag="", const char *name="export/x") + { + // assume double precision and contiguous data + const double *data = &x[0][0]; + + char filename[512]; + sprintf(filename,"%s%s.f64",name,tag); + FILE *out =fopen(filename,"w"); + fwrite(data, sizeof(double), x.dim(),out); + fclose(out); + } + + /*! + * \brief Export nonzero blocks of jacobian block-sparse matrix + */ + void exportNonzeros(const char *tag="", const char *path=".") + { + auto& A = jacobian_->istlMatrix(); + + // assume double precision and contiguous data + const double *data = &A[0][0][0][0]; + size_t dim = A[0][0].N()*A[0][0].M()*A.nonzeroes(); + + char filename[256]; + sprintf(filename,"%s/data%s.f64",path,tag); + FILE *out =fopen(filename,"w"); + fwrite(data, sizeof(double), dim,out); + fclose(out); + } + + /*! + * \brief Export sparsity pattern of jacobian block-sparse matrix + */ + void exportSparsity(const char *path=".") + { + //assemble csr graph + auto& A = jacobian_->istlMatrix(); + auto rows = std::make_unique(A.N()+1); + auto cols = std::make_unique(A.nonzeroes()); + + int irow=0; + int icol=0; + rows[0]=0; + for(auto row=A.begin(); row!=A.end(); row++) + { + for(unsigned int i=0;igetsize();i++) + { + cols[icol++]=row->getindexptr()[i]; + } + rows[irow+1]= rows[irow]+row->getsize(); + irow++; + } + + //export arrays + FILE *out; + char filename[256]; + + sprintf(filename,"%s/rows.i32",path); + out=fopen(filename,"w"); + fwrite(rows, sizeof(int), A.N()+1,out); + fclose(out); + + sprintf(filename,"%s/cols.i32",path); + out=fopen(filename,"w"); + fwrite(cols, sizeof(int), A.nonzeroes(),out); + fclose(out); + } + void setLinearizationType(LinearizationType linearizationType) { linearizationType_ = linearizationType; } @@ -1063,6 +1260,9 @@ private: bool separateSparseSourceTerms_ = false; FullDomain<> fullDomain_; + + int exportIndex_; + int exportCount_; }; } // namespace Opm diff --git a/opm/simulators/linalg/FlexibleSolver_impl.hpp b/opm/simulators/linalg/FlexibleSolver_impl.hpp index 4e1d0d1e9c2..b67fc0b12f1 100644 --- a/opm/simulators/linalg/FlexibleSolver_impl.hpp +++ b/opm/simulators/linalg/FlexibleSolver_impl.hpp @@ -32,6 +32,10 @@ #include #include +#if HAVE_AVX2_EXTENSION +#include +#endif + #include #include #include @@ -181,6 +185,22 @@ namespace Dune tol, // desired residual reduction factor maxiter, // maximum number of iterations verbosity); +#if HAVE_AVX2_EXTENSION + } else if (solver_type == "mixed-bicgstab") { + if constexpr (!Opm::is_gpu_operator_v) { + const std::string prec_type = prm.get("preconditioner.type", "error"); + bool use_mixed_dilu= (prec_type=="mixed-dilu"); + using MatrixType = decltype(linearoperator_for_solver_->getmat()); + linsolver_ = std::make_shared>( + linearoperator_for_solver_->getmat(), + tol, + maxiter, + use_mixed_dilu + ); + } else { + OPM_THROW(std::invalid_argument, "mixed-bicgstab solver not supported for GPU operators."); + } +#endif } else if (solver_type == "loopsolver") { linsolver_ = std::make_shared>(*linearoperator_for_solver_, *scalarproduct_, @@ -197,7 +217,6 @@ namespace Dune restart, maxiter, // maximum number of iterations verbosity); - } else { if constexpr (!Opm::is_gpu_operator_v) { if (solver_type == "flexgmres") { diff --git a/opm/simulators/linalg/StandardPreconditioners_serial.hpp b/opm/simulators/linalg/StandardPreconditioners_serial.hpp index b45c7bfb802..0fe680f957d 100644 --- a/opm/simulators/linalg/StandardPreconditioners_serial.hpp +++ 
b/opm/simulators/linalg/StandardPreconditioners_serial.hpp @@ -31,6 +31,19 @@ namespace Opm { +template +class TrivialPreconditioner : public Dune::PreconditionerWithUpdate +{ + public: + TrivialPreconditioner(){}; + virtual void update() override {}; + virtual bool hasPerfectUpdate() const override {return true;} + virtual void pre ([[maybe_unused]] X& x, [[maybe_unused]] Y& y) override {}; + virtual void post ([[maybe_unused]] X& x) override {}; + virtual void apply ([[maybe_unused]] X& x, [[maybe_unused]] const Y& y) override {}; + virtual Dune::SolverCategory::Category category() const override { return Dune::SolverCategory::sequential; }; +}; + template struct StandardPreconditioners>> @@ -71,6 +84,16 @@ struct StandardPreconditioners>(op.getmat()); }); + F::addCreator("mixed-ilu0", [](const O& op, const P& prm, const std::function&, std::size_t) { + DUNE_UNUSED_PARAMETER(prm); + DUNE_UNUSED_PARAMETER(op); + return std::make_shared>(); + }); + F::addCreator("mixed-dilu", [](const O& op, const P& prm, const std::function&, std::size_t) { + DUNE_UNUSED_PARAMETER(prm); + DUNE_UNUSED_PARAMETER(op); + return std::make_shared>(); + }); F::addCreator("jac", [](const O& op, const P& prm, const std::function&, std::size_t) { const int n = prm.get("repeats", 1); const double w = prm.get("relaxation", 1.0); diff --git a/opm/simulators/linalg/mixed/README.md b/opm/simulators/linalg/mixed/README.md new file mode 100644 index 00000000000..988ff58e913 --- /dev/null +++ b/opm/simulators/linalg/mixed/README.md @@ -0,0 +1,23 @@ +# Mixed-precision linear solvers +This folder contains mixed-precision building blocks for Krylov subspace methods +and a highly optimized mixed-precision implementation of ILU0 preconditioned bicgstab. +Hopefully, this will inspire the exploration of mixed-precision algorithms in OPM. + +The initial implementation is specialized for 3x3 block-sparse matrices due to their +importance in reservoir simulation and restricted to serial runs. Extending the work +to parallel runs is a matter of extending OPM's parallel infrastructure and should be +relatively straight-forward. + +The mixed-precision solver is selected by the command-line option `--linear-solver=mixed-ilu0`. +The command-line option `--matrix-add-well-contributions=true` must also be set as the +mixed-precision solver operates directly on block-sparse matrices, not on linear operators as +other OPM solvers do. For convenience, a wrapper similar to the one below can be used. 
+ +``` bash +OMP_NUM_THREADS=1 mpirun -np 1 --map-by numa --bind-to core build/bin/flow \ + --matrix-add-well-contributions=true \ + --linear-solver=mixed-ilu0 \ + --linear-solver-reduction=1e-3 \ + --linear-solver-max-iter=1024 \ + $@ +``` diff --git a/opm/simulators/linalg/mixed/bslv.c b/opm/simulators/linalg/mixed/bslv.c new file mode 100644 index 00000000000..97264bed7f8 --- /dev/null +++ b/opm/simulators/linalg/mixed/bslv.c @@ -0,0 +1,243 @@ +#include "bslv.h" +#include "vec.h" + +#include +#include +#include +#include + + +bslv_memory *bslv_alloc() +{ + bslv_memory *mem = malloc(sizeof(bslv_memory)); + assert(mem); + mem->e = NULL; + mem->dtmp = NULL; + return mem; +} + +void bslv_free(bslv_memory *mem) +{ + if(mem == NULL) return; + + if(mem->e !=NULL) free(mem->e); + if(mem->dtmp !=NULL) + { + int narrays=7; + for(int i=0;idtmp[i] != NULL) free(mem->dtmp[i]); + } + free(mem->dtmp); + } + if(mem->P !=NULL) prec_free(mem->P); + + free(mem); + mem=NULL; +} + +void bslv_info(bslv_memory *mem, int count) +{ + double * restrict e = mem->e; + printf("bslv_info: iterations=%d reduction=%.2e\n",count,e[count]); +} + + +void bslv_init(bslv_memory *mem, double tol, int max_iter, bsr_matrix const *A, bool use_dilu) +{ + int n=(A->nrows * A->b); + + mem->use_dilu = use_dilu; + + mem->tol = tol; + mem->max_iter = max_iter; + mem->n = n; + + mem->e = (double*) malloc(max_iter*sizeof(double)); + assert(mem->e); + + int narrays=7; + mem->dtmp = (double**) malloc(narrays*sizeof(double*)); + assert(mem->dtmp); + + int np = 8*((n+7)/8); // padded to nearest multiple of 8 + for(int i=0;idtmp[i] = aligned_alloc(64,np*sizeof(double)); + assert(mem->dtmp[i]); + for(int k=8*(n/8);kdtmp[i][k] = 0.0; //zeroing out padded section + } + + mem->P = prec_alloc(); + prec_init(mem->P, A); // initialize structure of L,D,U components of P +} + +double __attribute__((noinline)) vec_inner2(const double *a, const double *b, int n) +{ + const double *x = __builtin_assume_aligned(a,64); + const double *y = __builtin_assume_aligned(b,64); + + int const N=8; + double agg[N]; + for(int i=0;itol; + int max_iter = mem->max_iter; + int n = mem->n; + + double * restrict e = mem->e; + const double *r0 = b; + //const double * restrict r0 = mem->dtmp[0]; //access randomly initialized one-dimensional shadow space + double * restrict p_j = mem->dtmp[1]; + double * restrict q_j = mem->dtmp[2]; + double * restrict r_j = mem->dtmp[3]; + double * restrict s_j = mem->dtmp[4]; + double * restrict t_j = mem->dtmp[5]; + double * restrict v_j = mem->dtmp[6]; + double * restrict x_j = x; + + prec_t * restrict P = mem->P; + mem->use_dilu ? 
prec_dilu_factorize(P,A) : prec_ilu0_factorize(P,A); // choose dilu or ilu0 + prec_downcast(P); + + vec_fill(x_j,0.0,n); + vec_copy(r_j,b,n); + vec_copy(p_j,b,n); + + vec_copy(q_j,p_j,n); + //double norm_0 = sqrt(vec_inner(r_j,r_j,n)); + double norm_0 = sqrt(vec_inner2(r_j,r_j,n)); + + //double rho_j = vec_inner(r0,r_j,n); + double rho_j = vec_inner2(r0,r_j,n); + int j; + for(j=0;jtol; + int max_iter = mem->max_iter; + int n = mem->n; + + double * restrict e = mem->e; + const double *r0 = b; + //const double * restrict r0 = mem->dtmp[0]; //access randomly initialized one-dimensional shadow space + double * restrict p_j = mem->dtmp[1]; + double * restrict q_j = mem->dtmp[2]; + double * restrict r_j = mem->dtmp[3]; + double * restrict s_j = mem->dtmp[4]; + double * restrict t_j = mem->dtmp[5]; + double * restrict v_j = mem->dtmp[6]; + double * restrict x_j = x; + + prec_t * restrict P = mem->P; + mem->use_dilu ? prec_dilu_factorize(P,A) : prec_ilu0_factorize(P,A); // choose dilu or ilu0 + prec_downcast(P); + + vec_fill(x_j,0.0,n); + vec_copy(r_j,b,n); + vec_copy(p_j,b,n); + + vec_copy(q_j,p_j,n); + //double norm_0 = sqrt(vec_inner(r_j,r_j,n)); + double norm_0 = sqrt(vec_inner2(r_j,r_j,n)); + + //double rho_j = vec_inner(r0,r_j,n); + double rho_j = vec_inner2(r0,r_j,n); + int j; + for(j=0;j + +// Solver memory struct +typedef +struct bslv_memory +{ + bool use_dilu; + + double tol; + int max_iter; + double *e; + + int n; + double **dtmp; + + prec_t *P; +} +bslv_memory; + +/** + * @brief Create empty solver memory object. + * + * @return Pointer to solver memory object. + */ +bslv_memory *bslv_alloc(); + +/** + * @brief Delete solver memory object. + * + * @param mem Pointer to solver memory object. + */ +void bslv_free(bslv_memory *mem); + +/** + * @brief Display convergence information. + * + * @param mem Pointer to solver memory object. + * @param count Number of linear iterations (returned by linear solver) + */ +void bslv_info(bslv_memory *mem, int count); + +/** + * @brief Initialize solver memory object. + * + * @param mem Pointer to solver memory object. + * @param tol Linear solver tolerance. + * @param max_iter Maximum number of linear iterations. + * @param A Pointer to coefficient matrix in bsr format. + * @param use_dilu Select DILU preconditioner if true. Otherwise use ILU0. + */ +void bslv_init(bslv_memory *mem, double tol, int max_iter, bsr_matrix const *A, bool use_dilu); + +/** + * @brief Preconditioned bicgstab in mixed-precision. + * + * @note Preconditioner is either ILU0 or DILU based on value of mem->use_dilu + * + * @param mem Pointer to solver memory object. + * @param A Pointer to coefficient matrix in bsr format + * @param b Pointer to right-hand side vector + * @param x Pointer to solution vector + * + * @return Number of linear iterations. + */ +int bslv_pbicgstab3m(bslv_memory *mem, bsr_matrix *A, const double *b, double *x); + +/** + * @brief Preconditioned bicgstab in double-precision. + * + * @note Preconditioner is either ILU0 or DILU based on value of mem->use_dilu + * + * @param mem Pointer to solver memory object. + * @param A Pointer to coefficient matrix in bsr format + * @param b Pointer to right-hand side vector + * @param x Pointer to solution vector + * + * @return Number of linear iterations.
+ */ +int bslv_pbicgstab3d(bslv_memory *mem, bsr_matrix *A, const double *b, double *x); + +#ifdef __cplusplus +} +#endif diff --git a/opm/simulators/linalg/mixed/bsr.c b/opm/simulators/linalg/mixed/bsr.c new file mode 100644 index 00000000000..ead89998575 --- /dev/null +++ b/opm/simulators/linalg/mixed/bsr.c @@ -0,0 +1,206 @@ +#include "bsr.h" + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <assert.h> + +#include <immintrin.h> + + +bsr_matrix* bsr_alloc() +{ + bsr_matrix *A=malloc(sizeof(bsr_matrix)); + assert(A); + A->nrows = 0; + A->ncols = 0; + A->nnz = 0; + A->b = 0; + + A->rowptr = NULL; + A->colidx = NULL; + A->dbl = NULL; + A->flt = NULL; + + return A; +} + +void bsr_free(bsr_matrix *A) +{ + if(A==NULL) return; + + if(A->flt != NULL) free(A->flt); + if(A->dbl != NULL) free(A->dbl); + if(A->colidx != NULL) free(A->colidx); + if(A->rowptr != NULL) free(A->rowptr); + + free(A); + A=NULL; +} + +void bsr_init(bsr_matrix *A, int nrows, int nnz, int b) +{ + A->nrows=nrows; + A->ncols=nrows; + A->nnz=nnz; + A->b=b; + + A->rowptr = malloc((nrows+1)*sizeof(int)); + A->colidx = malloc(nnz*sizeof(int)); + A->dbl = malloc(b*b*nnz*sizeof(double)); + A->flt = malloc(b*b*nnz*sizeof(float)); + + assert(A->rowptr); + assert(A->colidx); + assert(A->dbl); + assert(A->flt); +} + +void bsr_info(bsr_matrix *A) +{ + printf("nrows = %d\n",A->nrows); + printf("ncols = %d\n",A->ncols); + printf("nnz = %d\n",A->nnz); + printf("b = %d\n",A->b); + + printf("rowptr= 0x%08lX\n",(uint64_t)A->rowptr); + printf("colidx= 0x%08lX\n",(uint64_t)A->colidx); + printf("dbl = 0x%08lX\n",(uint64_t)A->dbl); + + printf("\n"); +} + +void bsr_vmspmv3(bsr_matrix *A, const double *x, double *y) +{ + int nrows = A->nrows; + int *rowptr=A->rowptr; + int *colidx=A->colidx; + const float *data=A->flt; + + const int b=3; + + __m256d mm_zeros =_mm256_setzero_pd(); + for(int i=0;inrows; + int *rowptr=A->rowptr; + int *colidx=A->colidx; + const double *data=A->dbl; + + const int b=3; + + __m256d mm_zeros =_mm256_setzero_pd(); + for(int i=0;innz; + int b = M->b; + + assert(M->flt); + for(int i=0;i<b*b*nnz;i++) M->flt[i]=M->dbl[i]; +} + + +void bsr_sparsity(const bsr_matrix *A, const char *name) +{ + printf("%s =\n[\n",name); + int count=1; + int offset=0; + for(int i=0; i<A->nrows; i++) + { + printf("%4d: ",offset); + for(int j=A->rowptr[i];j<A->rowptr[i+1];j++) + { + printf(" %4d",A->colidx[j]); + offset++; + } + printf("\n"); + count++; + if(count>16) break; + } + printf("]\n"); +} + +void bsr_nonzeros(bsr_matrix *A, const char *name) +{ + printf("%s =\n[\n",name); + int count=1; + int b=A->b; + int bb=b*b; + for(int i=0; i<A->nrows; i++) + { + for(int j=A->rowptr[i];j<A->rowptr[i+1];j++) + { + printf("|"); + for(int m=0;m<b;m++) + { + for(int n=0;n<b;n++) + { + printf(" %+.4e",A->dbl[j*bb + m*b + n]); + } + printf(" |"); + } + printf("\n"); + } + count++; + if(count>6) break; + printf("\n"); + } + printf("]\n"); +} + diff --git a/opm/simulators/linalg/mixed/bsr.h b/opm/simulators/linalg/mixed/bsr.h new file mode 100644 index 00000000000..f58a0221790 --- /dev/null +++ b/opm/simulators/linalg/mixed/bsr.h @@ -0,0 +1,103 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +// bsr matrix struct +typedef +struct bsr_matrix +{ + int nrows; + int ncols; + int nnz; + int b; + + int *rowptr; + int *colidx; + double *dbl; + float *flt; + +} bsr_matrix; + +/** + * @brief Create empty bsr matrix. + * + * @return Pointer to bsr matrix. + */ +bsr_matrix* bsr_alloc(); + +/** + * @brief Delete bsr matrix. + * + * @param A Pointer to bsr matrix. + */ +void bsr_free(bsr_matrix *A); + +/** + * @brief Initialize bsr matrix. + * + * @param A Pointer to bsr matrix. 
+ * @param nrows Number of rows. + * @param nnz Number of nonzero blocks. + * @param b Block size. + */ +void bsr_init(bsr_matrix *A, int nrows, int nnz, int b); + +/** + * @brief Sparse matrix-vector multiplication in mixed precision. + * + * @note Function is specialized for 3x3 block-sparse matrices. + * @note Function uses AVX2 intrinsics. + * + * @param A Pointer to bsr matrix. + * @param x Pointer to input vector. + * @param y Pointer to output vector. + */ +void bsr_vmspmv3(bsr_matrix *A, const double *x, double *y); + +/** + * @brief Sparse matrix-vector multiplication in double precision. + * + * @note Function is specialized for 3x3 block-sparse matrices. + * @note Function uses AVX2 intrinsics. + * + * @param A Pointer to bsr matrix. + * @param x Pointer to input vector. + * @param y Pointer to output vector. + */ +void bsr_vdspmv3(bsr_matrix *A, const double *x, double *y); + +/** + * @brief Make single-precision copy of double-precision values. + * + * @param M Pointer to bsr matrix + */ +void bsr_downcast(bsr_matrix *M); + +/** + * @brief Display matrix statistics. + * + * @param A Pointer to bsr matrix + */ +void bsr_info(bsr_matrix *A); + +/** + * @brief Display sparsity pattern of first few rows. + * + * @param A Pointer to bsr matrix + * @param name String with desired name of bsr_matrix + */ +void bsr_sparsity(const bsr_matrix *A, const char *name); + +/** + * @brief Display nonzero blocks of first few rows. + * + * @param A Pointer to bsr matrix + * @param name String with desired name of bsr_matrix + */ +void bsr_nonzeros(bsr_matrix *A, const char *name); + +#ifdef __cplusplus +} +#endif diff --git a/opm/simulators/linalg/mixed/prec.c b/opm/simulators/linalg/mixed/prec.c new file mode 100644 index 00000000000..f0b9ea6d010 --- /dev/null +++ b/opm/simulators/linalg/mixed/prec.c @@ -0,0 +1,627 @@ +#include "prec.h" + +#include +#include +#include + + +prec_t *prec_alloc() +{ + prec_t *P = malloc(sizeof(prec_t)); + assert(P); + P->L=bsr_alloc(); + P->D=bsr_alloc(); + P->U=bsr_alloc(); + + P->noffsets=-1; + P->offsets=NULL; + + return P; +} + +void prec_free(prec_t *P) +{ + if(P==NULL) return; + + if(P->offsets!=NULL) free(P->offsets); + bsr_free(P->U); + bsr_free(P->D); + bsr_free(P->L); + + free(P); + P=NULL; +} + +int prec_analyze(bsr_matrix *M, int (*offsets)[3]) +{ + int count=0; + for(int i=0;i<M->nrows;i++) + { + for(int z=M->rowptr[i];z<M->rowptr[i+1];z++) + { + int j = M->colidx[z]; + int match=0; + for(int m=M->rowptr[j];m<M->rowptr[j+1];m++) + { + int k = M->colidx[m]; + for(int n=M->rowptr[i];n<M->rowptr[i+1];n++) + { + int jjj=M->colidx[n]; + match += (k==jjj); + if(k==jjj) + { + if(offsets) + { + offsets[count][0]=z;//ij + offsets[count][1]=n;//ik + offsets[count][2]=m;//jk + } + count++; + } + } + } + } + } + return count; +} + + +void prec_init(prec_t *P, bsr_matrix const *A) +{ + int b = A->b; + int nrows = A->nrows; + int nnz = A->nnz; + + bsr_matrix *L=P->L; + bsr_matrix *D=P->D; + bsr_matrix *U=P->U; + + bsr_init(P->L, nrows, (nnz-nrows)/2, b); + bsr_init(P->D, nrows, nrows, b); + bsr_init(P->U, nrows, (nnz-nrows)/2, b); + + // Sparsity of L,D, and U components of matrix A. + // Sparsity of L is expressed for its transpose + // which is identical to the sparsity of U due to + // the assumption of structural symmetry. 
nnz=0; + for (int i=0;i<nrows;i++) + { + L->rowptr[i]=nnz; + D->rowptr[i]=i; + U->rowptr[i]=nnz; + for (int k=A->rowptr[i];k<A->rowptr[i+1];k++) + { + int j=A->colidx[k]; + if(j>i) + { + L->colidx[nnz]=j; + U->colidx[nnz]=j; + nnz++; + } + } + D->colidx[i]=i; + } + L->rowptr[nrows]=nnz; + D->rowptr[nrows]=nrows; + U->rowptr[nrows]=nnz; + + // replace with asserts? + L->nnz=nnz; + D->nnz=nrows; + U->nnz=nnz; + + + // offsets for off-diagonal updates + int count; + count = prec_analyze(U,NULL); + P->offsets = malloc(3*(count+1)*sizeof(int)); + assert(P->offsets); + count = prec_analyze(U,P->offsets); + P->offsets[count][0]=U->nnz; + P->noffsets=count; +} + +void mat3_matmul(double *C, const double *A, const double *B) +{ + // assume 3x3 column-major matrices + // account for possibility of C and A referring to same memory location + double M[9]; + for(int k=0;k<9;k++) M[k]=0.0; + for(int j=0;j<3;j++) + { + double *m_j = M+3*j; // j-th column of M + for(int k=0;k<3;k++) + { + double b_kj = B[k+3*j]; // kj-th element of B + double const *a_k = A+3*k; // k-th column of A + for(int i=0;i<3;i++) m_j[i] += a_k[i]*b_kj; // |m_j> += |a_k> * b_kj; + } + } + for(int k=0;k<9;k++) C[k]=M[k]; +} + +void mat3_matfms(double *C, const double *A, const double *B) +{ + // assume 3x3 column-major matrices + // account for possibility of C and A referring to same memory location + double M[9]; + for(int k=0;k<9;k++) M[k]=0.0; + for(int j=0;j<3;j++) + { + double *m_j = M+3*j; // j-th column of M + for(int k=0;k<3;k++) + { + double b_kj = B[k+3*j]; // kj-th element of B + double const *a_k = A+3*k; // k-th column of A + for(int i=0;i<3;i++) m_j[i] += a_k[i]*b_kj; // |m_j> += |a_k> * b_kj; + } + } + for(int k=0;k<9;k++) C[k]-=M[k]; +} + + +void mat3_inv(double *invA, const double *A) +{ + double M[9]; + M[0] = +(A[4]*A[8]-A[5]*A[7]); + M[3] = -(A[3]*A[8]-A[5]*A[6]); + M[6] = +(A[3]*A[7]-A[4]*A[6]); + M[1] = -(A[1]*A[8]-A[2]*A[7]); + M[4] = +(A[0]*A[8]-A[2]*A[6]); + M[7] = -(A[0]*A[7]-A[1]*A[6]); + M[2] = +(A[1]*A[5]-A[2]*A[4]); + M[5] = -(A[0]*A[5]-A[2]*A[3]); + M[8] = +(A[0]*A[4]-A[1]*A[3]); + + double detA = A[0]*M[0]+A[1]*M[3]+A[2]*M[6]; + for(int k=0;k<9;k++) invA[k]=M[k]/detA; +} + +inline void vec_copy9(double *y, double const *x) +{ + for(int i=0;i<9;i++) y[i]=x[i]; +} + +void mat3_rmul(double *A, double const *B) +{ + // load left hand matrix + __m256d vA[3]; + vA[0] = _mm256_loadu_pd(A+0); + vA[1] = _mm256_loadu_pd(A+3); + vA[2] = _mm256_loadu_pd(A+6); + + for(int j=0;j<3;j++) + { + // load column j of B matrix + __m256d vbj = _mm256_loadu_pd(B+3*j); + + // multiply matrix A with column j of matrix B + __m256d vAB[3]; + vAB[0] = vA[0]*_mm256_permute4x64_pd(vbj,0b00000000); //0b01010101 + vAB[1] = vA[1]*_mm256_permute4x64_pd(vbj,0b01010101); //0b01010101 + vAB[2] = vA[2]*_mm256_permute4x64_pd(vbj,0b10101010); //0b01010101 + + __m256d vz = vAB[0] + vAB[1] + vAB[2]; + + // Store result in column j of matrix A + double z[4]; + _mm256_store_pd(z,vz); + for(int k=0;k<3;k++) A[3*j+k]=z[k]; + } +} + +void mat3_lmul(double const *A, double *B) +{ + // load left hand matrix + __m256d vA[3]; + vA[0] = _mm256_loadu_pd(A+0); + vA[1] = _mm256_loadu_pd(A+3); + vA[2] = _mm256_loadu_pd(A+6); + + for(int j=0;j<3;j++) + { + // load column j of B matrix + __m256d vbj = _mm256_loadu_pd(B+3*j); + + // multiply matrix A with column j of matrix B + __m256d vAB[3]; + vAB[0] = vA[0]*_mm256_permute4x64_pd(vbj,0b00000000); //0b01010101 + vAB[1] = vA[1]*_mm256_permute4x64_pd(vbj,0b01010101); //0b01010101 + vAB[2] = 
vA[2]*_mm256_permute4x64_pd(vbj,0b10101010); //0b01010101 + + __m256d vz = vAB[0] + vAB[1] + vAB[2]; + + // Store result in column j of matrix B + double z[4]; + _mm256_store_pd(z,vz); + for(int k=0;k<3;k++) B[3*j+k]=z[k]; + } +} + +void mat3_vfms(double *C, double const *A, double const *B) +{ + // load left hand matrix + __m256d vA[3]; + vA[0] = _mm256_loadu_pd(A+0); + vA[1] = _mm256_loadu_pd(A+3); + vA[2] = _mm256_loadu_pd(A+6); + + for(int j=0;j<3;j++) + { + // load column j of B matrix + __m256d vbj = _mm256_loadu_pd(B+3*j); + + // multiply matrix A with column j of matrix B + __m256d vAB[3]; + vAB[0] = vA[0]*_mm256_permute4x64_pd(vbj,0b00000000); //0b01010101 + vAB[1] = vA[1]*_mm256_permute4x64_pd(vbj,0b01010101); //0b01010101 + vAB[2] = vA[2]*_mm256_permute4x64_pd(vbj,0b10101010); //0b01010101 + + __m256d vz = vAB[0] + vAB[1] + vAB[2]; + + // Store result in column j of matrix A + double z[4]; + _mm256_store_pd(z,vz); + for(int k=0;k<3;k++) C[3*j+k]-=z[k]; + } +} + + +void prec_dilu_factorize(prec_t *P, bsr_matrix *A) +{ + int nrows = A->nrows; + int b = A->b; + int bb = b*b; + + bsr_matrix *L=P->L; + bsr_matrix *D=P->D; + bsr_matrix *U=P->U; + + // Splitting values of A into L, D, and U, respectively + int kU=0; + for(int i=0;irowptr[i];krowptr[i+1];k++) + { + int j=A->colidx[k]; + if(jrowptr[j]; + vec_copy9(L->dbl + bb*kL, A->dbl + bb*k); + L->rowptr[j]++; + } + else if(j==i) // struct-copy of D + { + vec_copy9(D->dbl + bb*i, A->dbl + bb*k); + } + else if(j>i) // struct-copy of U + { + vec_copy9(U->dbl + bb*kU, A->dbl + bb*k); + kU++; + } + } + } + // reset rowptr of L + for(int i=nrows;i>0;i--) L->rowptr[i]=L->rowptr[i-1]; + L->rowptr[0]=0; + + // Factorizing + double scale[9]; //hard-coded to 3x3 blocks for now + for(int i=0;inrows;i++) + { + mat3_inv(scale,D->dbl+i*bb); + vec_copy9(D->dbl+bb*i, scale); //store inverse instead to simplify application + for(int k=L->rowptr[i];krowptr[i+1];k++) + { + //scale column i of L + mat3_rmul(L->dbl+k*bb,scale); + + //update diagonal of U + int j=L->colidx[k]; + mat3_vfms(D->dbl+j*bb,L->dbl+k*bb,U->dbl+k*bb); + + //scale row i of U + mat3_lmul(scale,U->dbl+k*bb); + + //NOT IMPLEMENTED! 
+ for(int m=L->rowptr[j];mrowptr[j+1];m++) + { + if(L->colidx[m]==j) + { + printf("ILU OFF_DIAGONALS NOT IMPLEMENTED!\n"); + printf("(%d,%d)",m,j); + getchar(); + } + } + } + } +} + +void prec_ilu0_factorize(prec_t *P, bsr_matrix *A) +{ + int nrows = A->nrows; + int b = A->b; + int bb = b*b; + + bsr_matrix *L=P->L; + bsr_matrix *D=P->D; + bsr_matrix *U=P->U; + + // Splitting values of A into L, D, and U, respectively + int kU=0; + for(int i=0;irowptr[i];krowptr[i+1];k++) + { + int j=A->colidx[k]; + if(jrowptr[j]; + vec_copy9(L->dbl + bb*kL, A->dbl + bb*k); + L->rowptr[j]++; + } + else if(j==i) // struct-copy of D + { + vec_copy9(D->dbl + bb*i, A->dbl + bb*k); + } + else if(j>i) // struct-copy of U + { + vec_copy9(U->dbl + bb*kU, A->dbl + bb*k); + kU++; + } + } + } + // reset rowptr of L + for(int i=nrows;i>0;i--) L->rowptr[i]=L->rowptr[i-1]; + L->rowptr[0]=0; + + // Factorizing + int idx=0; + int next = P->offsets[idx][0]; + double scale[9]; //hard-coded to 3x3 blocks for now + for(int i=0;inrows;i++) + { + mat3_inv(scale,D->dbl+i*bb); + vec_copy9(D->dbl+bb*i, scale); //store inverse instead to simplify application + for(int k=L->rowptr[i];krowptr[i+1];k++) + { + //scale column i of L + mat3_rmul(L->dbl+k*bb,scale); + + //update diagonal D + int j=L->colidx[k]; + mat3_vfms(D->dbl+j*bb,L->dbl+k*bb,U->dbl+k*bb); + } + + while(nextrowptr[i+1]) + { + int ij = P->offsets[idx][0]; + int ik = P->offsets[idx][1]; + int jk = P->offsets[idx][2]; + + //update off-diagonals L and U + mat3_vfms(U->dbl+jk*bb,L->dbl+ij*bb,U->dbl+ik*bb); + mat3_vfms(L->dbl+jk*bb,L->dbl+ik*bb,U->dbl+ij*bb); + + //update marker + next=P->offsets[++idx][0]; + } + + + for(int k=L->rowptr[i];krowptr[i+1];k++) + { + //scale row i of U + mat3_lmul(scale,U->dbl+k*bb); + } + + } +} + + +inline void mat3_vecmul(const double *A, double *x) +{ + const int b=3; + double z[3]; + for(int k=0;k<3;k++) z[k]=0; + for(int c=0;cL; + bsr_matrix *D = P->D; + bsr_matrix *U = P->U; + + int b=L->b; + int bb=b*b; + + __m256d mm256_zero_pd =_mm256_setzero_pd(); + + // Lower triangular solve assuming ones on diagonal + for(int i=0;incols;i++) + { + __m256d vA[3], vx[3]; + + double *xi = x+b*i; + __m256d vxi = _mm256_loadu_pd(xi); + + vx[0] = _mm256_permute4x64_pd(vxi,0b00000000); //0b01010101 + vx[1] = _mm256_permute4x64_pd(vxi,0b01010101); //0b01010101 + vx[2] = _mm256_permute4x64_pd(vxi,0b10101010); //0b01010101 + for(int k=L->rowptr[i];krowptr[i+1];k++) + { + const float *A = L->flt+k*bb; + int j=U->colidx[k]; // should be L, but does not matter die to structural symmetry? 
+ vA[0] = _mm256_cvtps_pd(_mm_loadu_ps(A+0))*vx[0]; + vA[1] = _mm256_cvtps_pd(_mm_loadu_ps(A+3))*vx[1]; + vA[2] = _mm256_cvtps_pd(_mm_loadu_ps(A+6))*vx[2]; + + double *xj = x+b*j; + __m256d vxj = _mm256_loadu_pd(xj); + __m256d vz = (vxj - vA[0]) - (vA[1] + vA[2]); + + //vz =_mm256_blend_pd(vxj,vz,0x7); // 4th element unchanged + //_mm256_storeu_pd(xj,vz); + double z[4]; + _mm256_store_pd(z,vz); + for(int n=0;n<3;n++) xj[n]=z[n]; + } + + // Muliply by (inverse) diagonal block + const float *A = D->flt+i*bb; + vA[0] = _mm256_cvtps_pd(_mm_loadu_ps(A+0))*vx[0]; //0b01010101 + vA[1] = _mm256_cvtps_pd(_mm_loadu_ps(A+3))*vx[1]; //0b01010101 + vA[2] = _mm256_cvtps_pd(_mm_loadu_ps(A+6))*vx[2]; //0b01010101 + __m256d vz = vA[0] + vA[1] + vA[2]; + + //vz =_mm256_blend_pd(vxi,vz,0x7); // 4th element unchanged + //_mm256_storeu_pd(xi,vz); + double z[4]; + _mm256_store_pd(z,vz); + for(int k=0;k<3;k++) xi[k]=z[k]; + } + + // Upper triangular solve assuming nonzeros stored in original order + for(int i=U->ncols;i>0;i--) + { + __m256d vA[3]; + for(int k=0;k<3;k++) vA[k]=mm256_zero_pd; + for(int k=U->rowptr[i]-1;k>U->rowptr[i-1]-1;k--) + { + const float *A = U->flt+k*bb; + int j=U->colidx[k]; + __m256d vxj = _mm256_loadu_pd(x+b*j); + vA[0] += _mm256_cvtps_pd(_mm_loadu_ps(A+0))*_mm256_permute4x64_pd(vxj,0b00000000); //0b01010101 + vA[1] += _mm256_cvtps_pd(_mm_loadu_ps(A+3))*_mm256_permute4x64_pd(vxj,0b01010101); //0b01010101 + vA[2] += _mm256_cvtps_pd(_mm_loadu_ps(A+6))*_mm256_permute4x64_pd(vxj,0b10101010); //0b01010101 + } + double *xi = x+b*(i-1); + __m256d vxi = _mm256_loadu_pd(xi); + __m256d vz = (vxi - vA[0]) - (vA[1] + vA[2]); + vz =_mm256_blend_pd(vxi,vz,0x7); // 4th element unchanged + _mm256_storeu_pd(xi,vz); + + //double z[4]; + //_mm256_store_pd(z,vz); + //for(int k=0;k<3;k++) xi[k]=z[k]; + } +} + +void prec_dapply3c(prec_t *restrict P, double *x) +{ + bsr_matrix *L = P->L; + bsr_matrix *D = P->D; + bsr_matrix *U = P->U; + + int b=L->b; + int bb=b*b; + + __m256d mm256_zero_pd =_mm256_setzero_pd(); + + // Lower triangular solve assuming ones on diagonal + for(int i=0;incols;i++) + { + __m256d vA[3], vx[3]; + + double *xi = x+b*i; + __m256d vxi = _mm256_loadu_pd(xi); + + vx[0] = _mm256_permute4x64_pd(vxi,0b00000000); //0b01010101 + vx[1] = _mm256_permute4x64_pd(vxi,0b01010101); //0b01010101 + vx[2] = _mm256_permute4x64_pd(vxi,0b10101010); //0b01010101 + for(int k=L->rowptr[i];krowptr[i+1];k++) + { + const double *A = L->dbl+k*bb; + int j=U->colidx[k]; + vA[0] = _mm256_loadu_pd(A+0)*vx[0]; + vA[1] = _mm256_loadu_pd(A+3)*vx[1]; + vA[2] = _mm256_loadu_pd(A+6)*vx[2]; + + double *xj = x+b*j; + __m256d vxj = _mm256_loadu_pd(xj); + __m256d vz = (vxj - vA[0]) - (vA[1] + vA[2]); + + //vz =_mm256_blend_pd(vxj,vz,0x7); // 4th element unchanged + //_mm256_storeu_pd(xj,vz); + double z[4]; + _mm256_store_pd(z,vz); + for(int kk=0;kk<3;kk++) xj[kk]=z[kk]; + } + + // Muliply by (inverse) diagonal block + const double *A = D->dbl+i*bb; + vA[0] = _mm256_loadu_pd(A+0)*vx[0]; //0b01010101 + vA[1] = _mm256_loadu_pd(A+3)*vx[1]; //0b01010101 + vA[2] = _mm256_loadu_pd(A+6)*vx[2]; //0b01010101 + __m256d vz = vA[0] + vA[1] + vA[2]; + + //vz =_mm256_blend_pd(vxi,vz,0x7); // 4th element unchanged + //_mm256_storeu_pd(xi,vz); + double z[4]; + _mm256_store_pd(z,vz); + for(int k=0;k<3;k++) xi[k]=z[k]; + } + + // Upper triangular solve assuming nonzeros stored in original order + for(int i=U->ncols;i>0;i--) + { + __m256d vA[3]; + for(int k=0;k<3;k++) vA[k]=mm256_zero_pd; + for(int k=U->rowptr[i]-1;k>U->rowptr[i-1]-1;k--) + { + 
const double *A = U->dbl+k*bb; + int j=U->colidx[k]; + __m256d vxj = _mm256_loadu_pd(x+b*j); + vA[0] += _mm256_loadu_pd(A+0)*_mm256_permute4x64_pd(vxj,0b00000000); //0b01010101 + vA[1] += _mm256_loadu_pd(A+3)*_mm256_permute4x64_pd(vxj,0b01010101); //0b01010101 + vA[2] += _mm256_loadu_pd(A+6)*_mm256_permute4x64_pd(vxj,0b10101010); //0b01010101 + } + double *xi = x+b*(i-1); + __m256d vxi = _mm256_loadu_pd(xi); + __m256d vz = (vxi - vA[0]) - (vA[1] + vA[2]); + vz =_mm256_blend_pd(vxi,vz,0x7); // 4th element unchanged + _mm256_storeu_pd(xi,vz); + + //double z[4]; + //_mm256_store_pd(z,vz); + //for(int k=0;k<3;k++) xi[k]=z[k]; + } +} + +void prec_downcast(prec_t *P) +{ + bsr_downcast(P->L); + bsr_downcast(P->D); + bsr_downcast(P->U); +} + +void prec_info(prec_t *P) +{ + bsr_info(P->L); + bsr_info(P->D); + bsr_info(P->U); +} diff --git a/opm/simulators/linalg/mixed/prec.h b/opm/simulators/linalg/mixed/prec.h new file mode 100644 index 00000000000..b442c7f7af1 --- /dev/null +++ b/opm/simulators/linalg/mixed/prec.h @@ -0,0 +1,105 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include "bsr.h" + +// preconditioner struct +typedef +struct prec_t +{ + bsr_matrix *L; + bsr_matrix *D; + bsr_matrix *U; + int noffsets; + int(*offsets)[3]; +} +prec_t; + +/** + * @brief Create empty preconditioner object. + * + * @return Pointer to preconditioner object. + */ +prec_t *prec_alloc(); + +/** + * @brief Delete preconditioner object. + * + * @param P Pointer to preconditioner object. + */ +void prec_free(prec_t *P); + +/** + * @brief Initialize preconditioner object. + * + * @param P Pointer to preconditioner object. + * @param A Pointer to bsr matrix. + */ +void prec_init(prec_t *P, bsr_matrix const *A); + +/** + * @brief Identify off-diagonal ILU0 targets. + * + * @param M Pointer to bsr matrix. + * @param offsets Pointer to offsets identifying off-diagonal targets. + * + * @return Number of off-diagonal targets. + */ +int prec_analyze(bsr_matrix *M, int (*offsets)[3]); + +/** + * @brief DILU factorization. + * + * @param P Pointer to preconditioner object. + * @param A Pointer to bsr matrix. + */ +void prec_dilu_factorize(prec_t *P, bsr_matrix *A); + +/** + * @brief ILU0 factorization. + * + * @param P Pointer to preconditioner object. + * @param A Pointer to bsr matrix. + */ +void prec_ilu0_factorize(prec_t *P, bsr_matrix *A); + +/** + * @brief Preconditioner application in mixed-precision. + * + * @note Algorithm consists of lower and upper triangular solves + * + * @param P Pointer to preconditioner object. + * @param x Pointer to input/output vector + */ +void prec_mapply3c(prec_t *P, double *x); + +/** + * @brief Preconditioner application in double-precision. + * + * @note Algorithm consists of lower and upper triangular solves + * + * @param P Pointer to preconditioner object. + * @param x Pointer to input/output vector + */ +void prec_dapply3c(prec_t *P, double *x); + +/** + * @brief Make single-precision copy of double-precision values. + * + * @param P Pointer to preconditioner object. + */ +void prec_downcast(prec_t *P); + +/** + * @brief Display preconditioner statistics. + * + * @param P Pointer to preconditioner object. 
+ */ +void prec_info(prec_t *P); + +#ifdef __cplusplus +} +#endif diff --git a/opm/simulators/linalg/mixed/vec.h b/opm/simulators/linalg/mixed/vec.h new file mode 100644 index 00000000000..0c5a4bdad37 --- /dev/null +++ b/opm/simulators/linalg/mixed/vec.h @@ -0,0 +1,21 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + + +void vec_copy(double *y, double const * x, int n) +{ + for(int i=0;i +#include + +#include "bsr.h" +#include "bslv.h" + +namespace Dune +{ +template +class MixedSolver : public InverseOperator +{ + public: + + MixedSolver(const M &A, double tol, int maxiter, bool use_dilu) + { + // verify that well contributions are added to the matrix + if (!Opm::Parameters::Get()) { + OPM_THROW(std::logic_error, "Well operators are currently not supported for mixed precision. " + "Use --matrix-add-well-contributions=true to add well contributions to the matrix instead.");} + + int nrows = A.N(); + int nnz = A.nonzeroes(); + int b = A[0][0].N(); + + // verify that block size is 3x3 + if (b!=3) {OPM_THROW(std::logic_error, "Block sizes other than 3x3 are not supported by mixed precision.");} + + // create jacobian matrix object and allocate various arrays + jacobian_ = bsr_alloc(); + bsr_init(jacobian_,nrows,nnz,b); + + // initialize sparsity pattern + int *rows = jacobian_->rowptr; + int *cols = jacobian_->colidx; + + int irow = 0; + int icol = 0; + rows[0] = 0; + for(auto row=A.begin(); row!=A.end(); row++) + { + for(unsigned int i=0; igetsize(); i++) + { + cols[icol++] = row->getindexptr()[i]; + } + rows[irow+1] = rows[irow]+row->getsize(); + irow++; + } + + // allocate and intialize solver memory + mem_ = bslv_alloc(); + bslv_init(mem_, tol, maxiter, jacobian_, use_dilu); + + //pointer to nonzero blocks + data_ = &A[0][0][0][0]; + } + + ~MixedSolver() + { + bsr_free(jacobian_); + bslv_free(mem_); + + } + + virtual void apply (X& x, X& b, InverseOperatorResult& res) override + { + // transpose each dense block to make them column-major + double B[9]; + for(int k=0;knnz;k++) + { + for(int i=0;i<3;i++) for(int j=0;j<3;j++) B[3*j+i] = data_[9*k + 3*i + j]; + for(int i=0;i<9;i++) jacobian_->dbl[9*k + i] = B[i]; + } + + // downcast to allow mixed precision + bsr_downcast(jacobian_); + + // solve linear system + int count = bslv_pbicgstab3m(mem_, jacobian_, &b[0][0], &x[0][0]); + //int count = bslv_pbicgstab3d(mem_, jacobian_, &b[0][0], &x[0][0]); + + // return convergence information + res.converged = (mem_->e[count] < mem_->tol); + res.reduction = mem_->e[count]; + res.iterations = count; + } + + virtual void apply (X& x, X& b, double reduction, InverseOperatorResult& res) override + { + x=0; + b=0; + res.reduction = reduction; + OPM_THROW(std::invalid_argument, "MixedSolver::apply(...) 
not implemented yet."); + + } + + virtual Dune::SolverCategory::Category category() const override { return Dune::SolverCategory::sequential; }; + + private: + bsr_matrix *jacobian_; + bslv_memory *mem_; + double const *data_; +}; + +} + diff --git a/opm/simulators/linalg/setupPropertyTree.cpp b/opm/simulators/linalg/setupPropertyTree.cpp index d104cc39a27..3697258dbcb 100644 --- a/opm/simulators/linalg/setupPropertyTree.cpp +++ b/opm/simulators/linalg/setupPropertyTree.cpp @@ -241,6 +241,16 @@ setupPropertyTree(FlowLinearSolverParameters p, // Note: copying the parameters return setupILU(conf, p); } + // mixed-precision ILU0 + if (conf == "mixed-ilu0") { + return setupMixedILU(conf, p); + } + + // mixed-precision DILU + if (conf == "mixed-dilu") { + return setupMixedDILU(conf, p); + } + if (conf == "dilu") { return setupDILU(conf, p); } @@ -394,6 +404,33 @@ setupILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParamet return prm; } +PropertyTree +setupMixedILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParameters& p) +{ + using namespace std::string_literals; + PropertyTree prm; + prm.put("tol", p.linear_solver_reduction_); + prm.put("maxiter", p.linear_solver_maxiter_); + prm.put("verbosity", p.linear_solver_verbosity_); + prm.put("solver", "mixed-bicgstab"s); + prm.put("preconditioner.type", "mixed-ilu0"s); + return prm; +} + +PropertyTree +setupMixedDILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParameters& p) +{ + using namespace std::string_literals; + PropertyTree prm; + prm.put("tol", p.linear_solver_reduction_); + prm.put("maxiter", p.linear_solver_maxiter_); + prm.put("verbosity", p.linear_solver_verbosity_); + prm.put("solver", "mixed-bicgstab"s); + prm.put("preconditioner.type", "mixed-dilu"s); + return prm; +} + + PropertyTree setupDILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParameters& p) { diff --git a/opm/simulators/linalg/setupPropertyTree.hpp b/opm/simulators/linalg/setupPropertyTree.hpp index 9c80185a8d0..2cb1cc20f22 100644 --- a/opm/simulators/linalg/setupPropertyTree.hpp +++ b/opm/simulators/linalg/setupPropertyTree.hpp @@ -37,7 +37,9 @@ PropertyTree setupCPRW(const std::string& conf, const FlowLinearSolverParameters PropertyTree setupCPR(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupAMG(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupILU(const std::string& conf, const FlowLinearSolverParameters& p); +PropertyTree setupMixedILU(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupDILU(const std::string& conf, const FlowLinearSolverParameters& p); +PropertyTree setupMixedDILU(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupUMFPack(const std::string& conf, const FlowLinearSolverParameters& p); } // namespace Opm
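For reviewers who want to exercise the new C API outside of Flow, here is a minimal standalone sketch built only from the declarations in `bsr.h` and `bslv.h` above. The tiny 2x2 block test system, the tolerance, and the 64-byte-aligned, padded allocations are illustrative assumptions rather than part of this PR; the file would have to be compiled and linked against `bsr.c`, `prec.c` and `bslv.c` with the AVX2 flags and C standard added in `CMakeLists.txt`.

``` c
/* Illustrative sketch only: solve a small 2x2 block system with 3x3 blocks
 * using the mixed-precision ILU0-preconditioned bicgstab from this PR.
 * Matrix values, tolerance and padding are made up for demonstration. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

#include "opm/simulators/linalg/mixed/bsr.h"
#include "opm/simulators/linalg/mixed/bslv.h"

int main(void)
{
    const int nrows = 2, nnz = 4, b = 3, n = nrows * b;

    // block-sparse matrix [[2I, -I], [-I, 2I]] in BSR form
    bsr_matrix *A = bsr_alloc();
    bsr_init(A, nrows, nnz, b);
    const int rowptr[] = {0, 2, 4};
    const int colidx[] = {0, 1, 0, 1};
    memcpy(A->rowptr, rowptr, sizeof(rowptr));
    memcpy(A->colidx, colidx, sizeof(colidx));
    for (int k = 0; k < b * b * nnz; k++) A->dbl[k] = 0.0;
    for (int row = 0; row < nrows; row++)
        for (int k = A->rowptr[row]; k < A->rowptr[row + 1]; k++) {
            double v = (A->colidx[k] == row) ? 2.0 : -1.0;  // 2I on, -I off the block diagonal
            for (int i = 0; i < b; i++) A->dbl[k * b * b + i * b + i] = v;
        }
    bsr_downcast(A);  // create the single-precision copy used by the mixed kernels

    // rhs and solution, 64-byte aligned and padded to a multiple of 8 doubles
    double *rhs = aligned_alloc(64, 8 * sizeof(double));
    double *x   = aligned_alloc(64, 8 * sizeof(double));
    for (int i = 0; i < 8; i++) { rhs[i] = (i < n) ? 1.0 : 0.0; x[i] = 0.0; }

    bslv_memory *mem = bslv_alloc();
    bslv_init(mem, 1e-3, 100, A, false);           // false selects ILU0, true selects DILU
    int iters = bslv_pbicgstab3m(mem, A, rhs, x);  // mixed-precision solve
    bslv_info(mem, iters);
    for (int i = 0; i < n; i++) printf("x[%d] = %g\n", i, x[i]);

    bslv_free(mem);
    bsr_free(A);
    free(rhs);
    free(x);
    return 0;
}
```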