From e90d438842ac767038018c80f663eaf38ed74c03 Mon Sep 17 00:00:00 2001
From: christian <Christian Negre>
Date: Tue, 26 Feb 2019 23:20:54 -0700
Subject: [PATCH 1/4] Attempt to add test for orthogonal TB

---
 src/bodirectprogress.F90          |  5 +++
 tests/energy.fullscf.ortho.out    | 21 +++++++++
 tests/fullscf.ortho.dat           | 28 ++++++++++++
 tests/latte.fullscf.ortho.in      | 75 +++++++++++++++++++++++++++++++
 tests/ortho.params/bondints.ortho | 41 +++++++++++++++++
 tests/ortho.params/electrons.dat  |  7 +++
 tests/ortho.params/ppots.ortho    | 12 +++++
 tests/run_test.sh                 |  3 +-
 8 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 tests/energy.fullscf.ortho.out
 create mode 100644 tests/fullscf.ortho.dat
 create mode 100644 tests/latte.fullscf.ortho.in
 create mode 100755 tests/ortho.params/bondints.ortho
 create mode 100644 tests/ortho.params/electrons.dat
 create mode 100644 tests/ortho.params/ppots.ortho
diff --git a/src/bodirectprogress.F90 b/src/bodirectprogress.F90
index 32d7875..e0e2b9d 100644
--- a/src/bodirectprogress.F90
+++ b/src/bodirectprogress.F90
@@ -54,6 +54,11 @@ SUBROUTINE BOEVECSPRG
 
   !! Convert Hamiltonian to bml format
   !! H should be in orthogonal form, ORTHOH
+
+  IF (BASISTYPE == "ORTHO") THEN
+      ORTHOH = H 
+  ENDIF 
+
   CALL BML_ZERO_MATRIX(BML_MATRIX_DENSE, BML_ELEMENT_REAL, &
        LATTEPREC, HDIM, HDIM, ORTHOH_BML)
   CALL BML_ZERO_MATRIX(BML_MATRIX_DENSE, BML_ELEMENT_REAL, &
diff --git a/tests/energy.fullscf.ortho.out b/tests/energy.fullscf.ortho.out
new file mode 100644
index 0000000..3a72149
--- /dev/null
+++ b/tests/energy.fullscf.ortho.out
@@ -0,0 +1,21 @@
+-404.503855
+-404.525030
+-404.725674
+-405.582288
+-408.059757
+-410.620284
+-406.007885
+-402.776780
+-403.819356
+-403.608768
+-403.804797
+-408.527217
+-409.741533
+-406.748274
+-405.063504
+-404.568807
+-404.498053
+-404.513288
+-404.526646
+-404.534376
+-404.620906
diff --git a/tests/fullscf.ortho.dat b/tests/fullscf.ortho.dat
new file mode 100644
index 0000000..f07c23b
--- /dev/null
+++ b/tests/fullscf.ortho.dat
@@ -0,0 +1,28 @@
+          24
+   6.26700   0.00000   0.00000
+   0.00000   6.26700   0.00000
+   0.00000   0.00000   6.26700
+O    3.08800   3.70000   3.12400
+H    4.05800   3.70000   3.12400
+H    2.76400   3.13200   3.84100
+O    2.47000   0.39000   1.36000
+H    1.54000   0.37000   1.73000
+H    2.48000   0.00000   0.44000
+O    1.99300   0.41700   5.25000
+H    2.39300   1.32700   5.16000
+H    0.99300   0.49700   5.31000
+O    2.05300   6.09700   3.48000
+H    2.12300   5.20700   3.02000
+H    1.11300   0.17000   3.40000
+O    4.90000   5.37700   2.14000
+H    5.51000   6.17700   2.18000
+H    3.95000   5.68700   2.21000
+O    0.92000   3.82700   0.56000
+H    0.00000   3.54700   0.27000
+H    1.23000   4.59700   0.00000
+O    0.89000   2.03700   3.41000
+H    0.72000   2.86700   2.87000
+H    1.79000   1.66700   3.19000
+O    4.45000   4.61700   5.43000
+H    4.75000   3.89700   4.81000
+H    4.06000   4.21700   6.26000
diff --git a/tests/latte.fullscf.ortho.in b/tests/latte.fullscf.ortho.in
new file mode 100644
index 0000000..ab19f72
--- /dev/null
+++ b/tests/latte.fullscf.ortho.in
@@ -0,0 +1,75 @@
+LATTE INPUT FILE
+================
+#This input file summarizes the content of MDcontroller and TBparam/control.in
+#The parser will only read it if it is present inside the running folder.
+#If this file is not present, LATTE falls back to reading those two files.
+#The order of the keywords is not important in this file.
+
+#General controls
+CONTROL{
+  XCONTROL= 1
+  BASISTYPE= ORTHO
+  PARAMPATH= "tests/ortho.params"
+  VERBOSE= 0
+  DEBUGON= 0
+  FERMIM= 6
+  CGORLIB= 1 CGTOL= 1.0e-6
+  KBT= 0.0
+  NORECS= 1
+  ENTROPYKIND= 1
+  PPOTON= 1 VDWON= 0
+  SPINON= 0 SPINTOL= 1.0e-4
+  ELECTRO= 1 ELECMETH= 0 ELEC_ETOL= 0.001 ELEC_QTOL= 1.0e-4
+  COULACC= 1.0e-6 COULCUT= -500.0 COULR1= 500.0
+  MAXSCF= 250
+  BREAKTOL= 1.0E-12 MINSP2ITER= 22 SP2CONV= REL
+  FULLQCONV= 1 QITER= 0
+  QMIX= 0.25 SPINMIX= 0.25 MDMIX= 0.25
+  ORDERNMOL= 0
+  SPARSEON= 0 THRESHOLDON= 1 NUMTHRESH= 1.0e-6 FILLINSTOP= 100 BLKSZ= 4
+  MSPARSE= 3000
+  LCNON= 0 LCNITER= 4 CHTOL= 0.01
+  SKIN= 1.0
+  RELAX= 0 RELAXTYPE= SD MAXITER= 100 RLXFTOL= 0.00001
+  MDON= 1
+  PBCON= 1
+  RESTART= 0
+  CHARGE= 0
+  XBO= 1
+  XBODISON= 1
+  XBODISORDER= 5
+  NGPU= 2
+  KON= 0
+  COMPFORCE= 1
+  DOSFIT= 0 INTS2FIT= 1 BETA= 1000.0 NFITSTEP= 5000 QFIT= 0 MCSIGMA= 0.2
+  PPFITON=  0
+  ALLFITON= 0
+  PPSTEP= 500 BISTEP= 500 PP2FIT= 2 BINT2FIT= 6 
+  PPBETA= 1000.0 PPSIGMA= 0.01 PPNMOL= 10 PPNGEOM= 200
+  PARREP= 0
+  ER= 1.0
+}
+
+#Controls for QMD 
+MDCONTROL{
+MAXITER= 20
+UDNEIGH= 1
+DT= 0.25
+TEMPERATURE= 1.0e-30 RNDIST= GAUSSIAN SEEDINIT= UNIFORM
+DUMPFREQ= 250
+RSFREQ= 500
+WRTFREQ= 1
+TOINITTEMP5= 1
+THERMPER= 500
+THERMRUN= 50000
+NVTON= 0 NPTON= 0 AVEPER= 1000  FRICTION= 1000.0 SEED= 54
+PTARGET= 0.0 NPTTYPE= ISO
+SHOCKON= 0
+SHOCKSTART= 100000
+SHOCKDIR= 1
+UPARTICLE= 500.0 USHOCK= -4590.0 C0= 1300.0
+MDADAPT= 0
+GETHUG= 0 E0= -795.725  V0= 896.984864 P0= 0.083149
+}
+
+
diff --git a/tests/ortho.params/bondints.ortho b/tests/ortho.params/bondints.ortho
new file mode 100755
index 0000000..5a4554b
--- /dev/null
+++ b/tests/ortho.params/bondints.ortho
@@ -0,0 +1,41 @@
+Noints= 39
+Element1 Element2  Kind     H0   B1      B2        B3           B4         B5  R1     Rcut H0   B1      B2        B3           B4         B5  R1     Rcut
+N O sss  -9.360078 -1.293118 -0.379415 0.000000 0.000000 1.200000 3.500000 4.000000 0.340064 -1.703613 -0.622348 0.036738 -0.040158 1.200000 3.500000 4.000000 
+N O sps  10.309052 -0.981652 -0.828497 0.000000 0.000000 1.200000 3.500000 4.000000 -0.370946 -1.040947 -0.931097 0.252441 -0.115450 1.200000 3.500000 4.000000 
+O N sps  10.723048 -0.454312 -0.916563 0.000000 0.000000 1.200000 3.500000 4.000000 -0.420014 -1.107918 -0.905594 0.188424 -0.088365 1.200000 3.500000 4.000000 
+N O pps  9.259131 -0.734112 -1.023762 0.000000 0.000000 1.200000 3.500000 4.000000 -0.314073 0.499050 -2.914288 2.067657 -0.738439 1.200000 3.500000 4.000000 
+N O ppp  -4.532623 -1.999631 -0.286275 0.000000 0.000000 1.200000 3.500000 4.000000 0.223937 -1.991867 -0.537630 -0.081270 -0.004130 1.200000 3.500000 4.000000 
+C N sss  -7.409712 -1.940942 -0.219762 0.000000 0.000000 1.500000 3.500000 4.000000 0.263438 -1.754525 -0.584215 -0.007801 -0.021729 1.500000 3.500000 4.000000 
+C N sps  7.501761 -1.211169 -0.373905 0.000000 0.000000 1.500000 3.500000 4.000000 -0.326609 -1.197485 -0.807786 0.134891 -0.084373 1.500000 3.500000 4.000000 
+N C sps  8.697591 -1.267240 -0.178484 0.000000 0.000000 1.500000 3.500000 4.000000 -0.337943 -1.335442 -0.769693 0.119373 -0.079493 1.500000 3.500000 4.000000 
+C N pps  6.954600 -1.188456 -0.808043 0.000000 0.000000 1.500000 3.500000 4.000000 -0.350240 -0.467439 -1.849316 1.854403 -0.988471 1.500000 3.500000 4.000000 
+C N ppp  -2.921605 -2.203548 -0.409424 0.000000 0.000000 1.500000 3.500000 4.000000 0.158424 -2.114409 -0.582346 -0.051076 -0.006183 1.500000 3.500000 4.000000 
+C O sss  -13.986685 -1.931973 -0.432011 0.000000 0.000000 1.200000 3.500000 4.000000 0.375339 -1.547372 -0.642492 0.020614 -0.026699 1.200000 3.500000 4.000000 
+C O sps  10.718738 -1.389459 -0.182128 0.000000 0.000000 1.200000 3.500000 4.000000 -0.373027 -0.776043 -1.019920 0.257539 -0.102838 1.200000 3.500000 4.000000 
+O C sps  14.194791 -1.371650 -0.248285 0.000000 0.000000 1.200000 3.500000 4.000000 -0.458068 -1.035067 -0.937868 0.190562 -0.077841 1.200000 3.500000 4.000000 
+C O pps  8.622023 -0.557144 -0.938551 0.000000 0.000000 1.200000 3.500000 4.000000 -0.322293 0.795473 -3.476601 2.589965 -0.897800 1.200000 3.500000 4.000000 
+C O ppp  -5.327397 -2.190160 -0.089303 0.000000 0.000000 1.200000 3.500000 4.000000 0.244570 -1.922717 -0.573671 -0.057280 -0.004108 1.200000 3.500000 4.000000 
+N N sss  -7.165811 -2.348869 -0.541905 0.000000 0.000000 1.500000 3.500000 4.000000 0.231654 -1.879002 -0.572765 -0.004579 -0.031106 1.500000 3.500000 4.000000 
+N N sps  8.212268 -1.499123 -0.526440 0.000000 0.000000 1.500000 3.500000 4.000000 -0.305271 -1.385158 -0.751032 0.114531 -0.090839 1.500000 3.500000 4.000000 
+N N pps  7.102331 -1.252366 -0.552533 0.000000 0.000000 1.500000 3.500000 4.000000 -0.324668 -0.547805 -1.638658 1.495168 -0.827868 1.500000 3.500000 4.000000 
+N N ppp  -2.828938 -2.376886 -0.560898 0.000000 0.000000 1.500000 3.500000 4.000000 0.142909 -2.162036 -0.571942 -0.071640 -0.004682 1.500000 3.500000 4.000000 
+O O sss  -14.387756 -2.244278 -1.645605 0.000000 0.000000 1.200000 3.500000 4.000000 0.296445 -1.911896 -0.663451 0.038054 -0.046608 1.200000 3.500000 4.000000 
+O O sps  13.699127 -1.602358 -0.114474 0.000000 0.000000 1.200000 3.500000 4.000000 -0.362143 -1.285274 -0.939591 0.204641 -0.106438 1.200000 3.500000 4.000000 
+O O pps  9.235469 -1.131474 -0.924535 0.000000 0.000000 1.200000 3.500000 4.000000 -0.312044 0.121814 -2.519352 1.681266 -0.644566 1.200000 3.500000 4.000000 
+O O ppp  -4.526526 -2.487174 -0.201464 0.000000 0.000000 1.200000 3.500000 4.000000 0.193010 -2.168462 -0.580629 -0.105104 0.004891 1.200000 3.500000 4.000000 
+H O sss  -12.189103 -1.800097 -0.325933 0.000000 0.000000 1.000000 3.500000 4.000000 0.404725 -1.702546 -0.707938 0.074904 -0.039922 1.000000 3.500000 4.000000 
+H O sps  9.518733 -1.333235 -0.393710 0.000000 0.000000 1.000000 3.500000 4.000000 -0.447660 -0.952979 -1.163537 0.400616 -0.156965 1.000000 3.500000 4.000000 
+H N sss  -12.631030 -1.585597 -0.250969 0.000000 0.000000 1.000000 3.500000 4.000000 0.446693 -1.500463 -0.657448 0.065741 -0.037004 1.000000 3.500000 4.000000 
+H N sps  9.837852 -1.234850 -0.324283 0.000000 0.000000 1.000000 3.500000 4.000000 -0.501530 -0.785734 -1.123232 0.394878 -0.148501 1.000000 3.500000 4.000000 
+C C sss  -9.197237 -1.607050 -0.535057 0.000000 0.000000 1.400000 3.500000 4.000000 0.346977 -1.519820 -0.570812 -0.013518 -0.015829 1.400000 3.500000 4.000000 
+C C sps  8.562436 -0.980182 -0.646929 0.000000 0.000000 1.400000 3.500000 4.000000 -0.400467 -0.984048 -0.853949 0.157178 -0.073381 1.400000 3.500000 4.000000 
+C C pps  6.614756 -0.528591 -0.951460 0.000000 0.000000 1.400000 3.500000 4.000000 -0.382417 0.102889 -2.786680 2.646356 -1.134320 1.400000 3.500000 4.000000 
+C C ppp  -3.678302 -1.881668 -0.255951 0.000000 0.000000 1.400000 3.500000 4.000000 0.214357 -1.948923 -0.578323 -0.034356 -0.007257 1.400000 3.500000 4.000000 
+H C sss  -9.235812 -1.372683 -0.408433 0.000000 0.000000 1.100000 3.500000 4.000000 0.416003 -1.459596 -0.654874 0.009140 -0.012658 1.100000 3.500000 4.000000 
+H C sps  8.104851 -0.936099 -0.626219 0.000000 0.000000 1.100000 3.500000 4.000000 -0.495695 -0.901626 -1.007214 0.189808 -0.057087 1.100000 3.500000 4.000000 
+H H sss  -9.400000 -1.145903 -0.391777 0.000000 0.000000 0.750000 3.500000 4.000000 0.575007 -1.391261 -0.778831 0.080209 -0.017759 0.750000 3.500000 4.000000
+W W sss -2.63332044 -0.71100562 -0.27081645  0.03306840 -0.00393097 2.73 6.00 7.00  0.28895119 -0.91180491 -0.22236240  0.00062879  0.00223537 2.73 6.00 7.00
+W W sds -1.48671751 -0.42509143 -0.65004572  0.29520069 -0.06194951 2.73 6.00 7.00  0.14376383 -0.45157088 -0.59271709  0.18666281 -0.03366047 2.73 6.00 7.00
+W W dds -1.70672948 -1.10134419 -0.01972556 -0.04301193  0.00482517 2.73 6.00 7.00  0.11587117 -0.53362062 -0.52285554  0.14742450 -0.02563750 2.73 6.00 7.00
+W W ddp  1.41731714 -1.52300320 -0.03534115 -0.03114721  0.00393409 2.73 6.00 7.00 -0.11180069 -1.33376345 -0.26519523  0.05629322 -0.01154228 2.73 6.00 7.00
+W W ddd -0.32269993 -1.97500297 -0.00061059 -0.03674107  0.00594063 2.73 6.00 7.00  0.02602319 -1.93947083 -0.12231373  0.00539735 -0.00134363 2.73 6.00 7.00 
diff --git a/tests/ortho.params/electrons.dat b/tests/ortho.params/electrons.dat
new file mode 100644
index 0000000..9946312
--- /dev/null
+++ b/tests/ortho.params/electrons.dat
@@ -0,0 +1,7 @@
+Noelem=  5
+Element   basis    Numel   Es          Ep        Ed          Ef    Mass     HubbardU  Wss        Wpp      Wdd        Wff
+N sp 5.000000 -18.556500 -7.062500 0.000000 0.000000 14.006700 17.372900 0.000000 -0.693400 0.000000 0.000000 
+O sp 6.000000 -23.937700 -9.003500 0.000000 0.000000 15.999400 11.876141 0.000000 -0.7576500 0.000000 0.000000 
+H s 1.000000 -6.483500 0.000000 0.000000 0.000000 1.007900 12.054683 -2.23400 0.000000 0.000000 0.000000 
+C sp 4.000000 -13.719900 -5.254100 0.000000 0.000000 12.010000 14.240811 0.000000 -0.6181000 0.000000 0.000000
+W sd 6.0 -4.05 0.0 -2.12 0.0 183.84 7.048 0.0 0.0 0.0 0.0 
diff --git a/tests/ortho.params/ppots.ortho b/tests/ortho.params/ppots.ortho
new file mode 100644
index 0000000..6d0f806
--- /dev/null
+++ b/tests/ortho.params/ppots.ortho
@@ -0,0 +1,12 @@
+Nopps= 10
+Ele1  Ele2  A0        A1      A2      A3       A4      A5       A6      C    R1  Rcut
+C C 3.927770 24.439989 -51.156433 39.032536 -11.321277 0.000000 0.000000 0.000000 1.600000 1.700000
+N O 14.005908 19.769009 -46.607006 38.399015 -12.656658 0.000000 0.000000 0.000000 1.600000 1.700000 
+C N 98.283078 10.289077 -27.709052 22.099235 -6.796462 0.000000 0.000000 0.000000 1.600000 1.700000 
+N N 40.335850 14.958977 -36.644093 29.219613 -8.918783 0.000000 0.000000 0.000000 1.600000 1.700000 
+C O 0.916287 30.115416 -59.612502 45.114207 -13.200384 0.000000 0.000000 0.000000 1.500000 1.600000 
+O O 11.833452 19.281518 -45.763767 37.924165 -12.006535 0.000000 0.000000 0.000000 1.500000 1.600000 
+N H 0.664002 28.086622 -63.415978 53.301425 -17.343446 0.000000 0.000000 0.000000 1.300000 1.400000
+O H 0.484351 33.176296 -81.154354 74.931992 -26.796460 0.000000 0.000000 0.000000 1.200000 1.300000 
+C H 1.094168 28.606497 -71.558353 65.967464 -23.372892 0.000000 0.000000 0.000000 1.200000 1.300000 
+H H 8.194700 16.371100 -75.246500 106.703000 -59.105700 0.000000 0.000000 0.000000 0.800000 0.900000 
diff --git a/tests/run_test.sh b/tests/run_test.sh
index 6db34e8..4c88cde 100755
--- a/tests/run_test.sh
+++ b/tests/run_test.sh
@@ -16,6 +16,7 @@ performanceExpectedTimes["tableread"]=1.700
 performanceExpectedTimes["0scf"]=0.127
 performanceExpectedTimes["2scf"]=0.120
 performanceExpectedTimes["fullscf"]=0.158
+performanceExpectedTimes["fullscf.ortho"]=1.050
 performanceExpectedTimes["fullscf.etemp"]=0.160
 performanceExpectedTimes["sp2"]=0.161
 performanceExpectedTimes["sp2.sparse"]=0.579
@@ -125,7 +126,7 @@ done
 
 # Testing for MD simulations:
 
-for name in tableread 0scf 2scf fullscf fullscf.etemp sp2 sp2.sparse fullscf.nvt \
+for name in tableread 0scf 2scf fullscf fullscf.ortho fullscf.etemp sp2 sp2.sparse fullscf.nvt \
        	fullscf.npt fullscf.vdw fullscf.spin fullscf.kon fullscf.rspace ; do
 
   INLATTEFILE="latte."$name".in"

From 9c0bf2edfdb13248ee5f54a054d28fadae75f076 Mon Sep 17 00:00:00 2001
From: cnegre <christianfannegre@gmail.com>
Date: Wed, 27 Feb 2019 17:37:58 -0500
Subject: [PATCH 2/4] Added missing files

---
 MATRIX/F_mmlatte.cu     | 28 +++++++++++++++++++++
 MATRIX/M_Multiply.cu    | 18 +++++++++++++
 MATRIX/Makefile         |  2 +-
 MATRIX/Matrix.h         |  3 +++
 MATRIX/SolveMatrixCG.cu |  4 ++-
 MATRIX/genmatmult.cu    | 56 +++++++++++++++++++++++++++++++++++++++++
 Makefile                |  2 +-
 makefile.CHOICES        | 50 +++++++++++-------------------------
 8 files changed, 125 insertions(+), 38 deletions(-)
 create mode 100644 MATRIX/F_mmlatte.cu
 create mode 100644 MATRIX/genmatmult.cu

diff --git a/MATRIX/F_mmlatte.cu b/MATRIX/F_mmlatte.cu
new file mode 100644
index 0000000..7745a6a
--- /dev/null
+++ b/MATRIX/F_mmlatte.cu
@@ -0,0 +1,28 @@
+/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! Copyright 2010.  Los Alamos National Security, LLC. This material was    !
+! produced under U.S. Government contract DE-AC52-06NA25396 for Los Alamos !
+! National Laboratory (LANL), which is operated by Los Alamos National     !
+! Security, LLC for the U.S. Department of Energy. The U.S. Government has !
+! rights to use, reproduce, and distribute this software.  NEITHER THE     !
+! GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,     !
+! EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS         !
+! SOFTWARE.  If software is modified to produce derivative works, such     !
+! modified software should be clearly marked, so as not to confuse it      !
+! with the version available from LANL.                                    !
+!                                                                          !
+! Additionally, this program is free software; you can redistribute it     !
+! and/or modify it under the terms of the GNU General Public License as    !
+! published by the Free Software Foundation; version 2.0 of the License.   !
+! Accordingly, this program is distributed in the hope that it will be     !
+! useful, but WITHOUT ANY WARRANTY; without even the implied warranty of   !
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General !
+! Public License for more details.                                         !
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
+
+#include "Matrix.h"
+
+// C-linkage entry point: dereferences the by-reference hdim/tposea/tposeb/alpha/beta arguments and forwards everything to genmatmult.
+// NOTE(review): scalars and matrices are read as double while genmatmult is declared with REAL - confirm REAL is double in this build.
+extern "C" void mmlatte_(int *hdim, int *tposea, int *tposeb, void *alpha, void *beta, void *amat_pointer, void *bmat_pointer, void *cmat_pointer) {
+  genmatmult(*hdim, *tposea, *tposeb, *static_cast<double *>(alpha), *static_cast<double *>(beta), static_cast<double *>(amat_pointer), static_cast<double *>(bmat_pointer), static_cast<double *>(cmat_pointer));
+}
diff --git a/MATRIX/M_Multiply.cu b/MATRIX/M_Multiply.cu
index 630e5de..60fa8c7 100644
--- a/MATRIX/M_Multiply.cu
+++ b/MATRIX/M_Multiply.cu
@@ -154,6 +154,24 @@ void M_Multiply(REAL *scalar1, Matrix A, Matrix B, REAL *scalar2, Matrix C) {
 
 }
 
+void M_Multiply(int tposea, int tposeb, REAL *alpha, Matrix A, Matrix B, REAL *beta, Matrix C) {
+
+  // C = alpha * op(A) * op(B) + beta * C through cublasDgemm on device 0.
+  //   tposea, tposeb : 0 -> op(X) = X, non-zero -> op(X) = X^T
+  //   alpha, beta    : pointers to the scalars, handed to cuBLAS unchanged
+  //   A, B, C        : operands; the Device[0] buffers are used, with DM as
+  //                    the leading dimension
+  // All four transpose combinations are dispatched, including both operands
+  // transposed. m, n and k come from the stored (untransposed) shapes, so a
+  // transposed operand is expected to be square.
+  cublasOperation_t opa = (tposea != 0) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t opb = (tposeb != 0) ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+  cudaSetDevice(0);
+  cublasDgemm(handle[0], opa, opb, A.DM, B.DN, A.DN, alpha,
+              A.Device[0], A.DM, B.Device[0], B.DM, beta, C.Device[0], C.DM);
+}
+
 void M_Multiply(REAL k, Matrix A, Matrix B) {
 
   int msize = A.DM * A.DN;
diff --git a/MATRIX/Makefile b/MATRIX/Makefile
index 14e4154..af097cc 100644
--- a/MATRIX/Makefile
+++ b/MATRIX/Makefile
@@ -19,7 +19,7 @@ KERNEL_SOURCES=$(wildcard Kernels/*.cu)
 KERNEL_OBJECTS=$(KERNEL_SOURCES:.cu=.$(REALSIZE).o)
 
 #MATRIX_SOURCES=$(wildcard M_*.cu) $(wildcard F_*.cu) SP2Pure.cu SP2Fermi.cu SolveMatrixCG.cu Allocate.cu TestMultiply.cu
-MATRIX_SOURCES=$(wildcard M_*.cu) $(wildcard F_*.cu) SP2Pure.cu SP2Fermi.cu SolveMatrixCG.cu Allocate.cu runmatmult.cu
+MATRIX_SOURCES=$(wildcard M_*.cu) $(wildcard F_*.cu) SP2Pure.cu SP2Fermi.cu SolveMatrixCG.cu Allocate.cu runmatmult.cu genmatmult.cu
 MATRIX_CUDA_OBJECTS=$(MATRIX_SOURCES:.cu=.cuda.$(REALSIZE).o)
 
 all: libmatrix_cuda.$(PRECISION).a
diff --git a/MATRIX/Matrix.h b/MATRIX/Matrix.h
index 4f13f16..de511cc 100644
--- a/MATRIX/Matrix.h
+++ b/MATRIX/Matrix.h
@@ -108,6 +108,7 @@ void M_MultiplyTranspose(Matrix A, Matrix B, Matrix C);
 void M_Multiply(REAL *scalar, Matrix A, Matrix B, REAL *scalar2, Matrix C);
 void M_MultiplyMgpu(REAL *scalar, Matrix A, Matrix B, REAL *scalar2, Matrix C);
 void M_Multiply(REAL scalar, Matrix A, Matrix B); // B=scalar*A
+void M_Multiply(int tposea, int tposeb, REAL *scalar1, Matrix A, Matrix B, REAL *scalar2, Matrix C);
 void M_MultiplyAdd(REAL scalar, Matrix A, REAL scalar2, Matrix B, Matrix C); // C = scalar*A + scalar2*B
 void M_MultiplySub(REAL scalar, Matrix A, REAL scalar2, Matrix B, Matrix C); // C = scalar*A - scalar2*B
 void M_MultiplyAdd(REAL scalar, Matrix A, Matrix B, Matrix C); // C = scalar*A + B
@@ -153,6 +154,8 @@ void *Allocate(const char Label[], void *Pointer, size_t Size);
 
 void runmatmult(int hdim, REAL *bo_pointer, REAL *h_pointer);
 
+void genmatmult(int hdim, int tposea, int tposeb, REAL alpha, REAL beta, REAL *amat_pointer, REAL *bmat_pointer, REAL *cmat_pointer);
+
 void sp2pure_nospin3(REAL bndfil, int  hdim, REAL *bo_pointer, REAL maxeval, REAL *h_pointer, REAL maxminusmin, int minsp2iter, int sp2convint);
 
 void sp2pure_nospin4(REAL bndfil, int  hdim, REAL *bo_pointer, REAL maxeval, REAL *h_pointer, REAL maxminusmin, int minsp2iter, int sp2convint);
diff --git a/MATRIX/SolveMatrixCG.cu b/MATRIX/SolveMatrixCG.cu
index dc2b724..b02b060 100644
--- a/MATRIX/SolveMatrixCG.cu
+++ b/MATRIX/SolveMatrixCG.cu
@@ -69,6 +69,8 @@ void solve_matrix_cg(REAL *bo_ptr, int hdim, REAL cgtol2, int fermim) {
 
       iter++;
 
+//      printf("%d \n", iter);
+
       // A * P0 - intermediate term used in CG
       M_Multiply( a, p0, tmpmat);
 
@@ -105,7 +107,7 @@ void solve_matrix_cg(REAL *bo_ptr, int hdim, REAL cgtol2, int fermim) {
       // p0 = -1.0 * r0 + p0
       M_MultiplyScalarSum( &MINUS1, r0, p0);
 
-      //printf("iter = %d error2 = %e cgtol2= %e \n", iter, error2, cgtol2);
+//      printf("iter = %d error2 = %e cgtol2= %e \n", iter, error2, cgtol2);
 
       if (error2 < cgtol2) breakloop = 1;
 
diff --git a/MATRIX/genmatmult.cu b/MATRIX/genmatmult.cu
new file mode 100644
index 0000000..f638ce7
--- /dev/null
+++ b/MATRIX/genmatmult.cu
@@ -0,0 +1,56 @@
+/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! Copyright 2010.  Los Alamos National Security, LLC. This material was    !
+! produced under U.S. Government contract DE-AC52-06NA25396 for Los Alamos !
+! National Laboratory (LANL), which is operated by Los Alamos National     !
+! Security, LLC for the U.S. Department of Energy. The U.S. Government has !
+! rights to use, reproduce, and distribute this software.  NEITHER THE     !
+! GOVERNMENT NOR LOS ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY,     !
+! EXPRESS OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS         !
+! SOFTWARE.  If software is modified to produce derivative works, such     !
+! modified software should be clearly marked, so as not to confuse it      !
+! with the version available from LANL.                                    !
+!                                                                          !
+! Additionally, this program is free software; you can redistribute it     !
+! and/or modify it under the terms of the GNU General Public License as    !
+! published by the Free Software Foundation; version 2.0 of the License.   !
+! Accordingly, this program is distributed in the hope that it will be     !
+! useful, but WITHOUT ANY WARRANTY; without even the implied warranty of   !
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General !
+! Public License for more details.                                         !
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
+
+#include <math.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <stdint.h>
+
+#include "Matrix.h"
+
+extern int ndevices;
+extern int nblocks;
+
+void genmatmult(int hdim, int tposea, int tposeb, REAL alpha, REAL beta, REAL *amat_pointer, REAL *bmat_pointer, REAL *cmat_pointer) {
+  // GPU matrix product C = alpha * op(A) * op(B) + beta * C for hdim x hdim
+  // arrays; tposea/tposeb are forwarded to M_Multiply to choose op().
+  // M_Pull(cmat) fetches the result (presumably into cmat_pointer - confirm).
+  Matrix amat, bmat, cmat;
+
+  M_InitWithLocal(amat, amat_pointer, hdim, hdim);
+  M_InitWithLocal(bmat, bmat_pointer, hdim, hdim);
+  M_InitWithLocal(cmat, cmat_pointer, hdim, hdim);
+
+  // Copy both operands to the GPU(s).
+  M_Push(amat);
+  M_Push(bmat);
+
+  // C is a GEMM input whenever beta is non-zero, however small, so it is
+  // copied in every such case; only an exact zero lets the copy be skipped.
+  if (beta != 0.0) M_Push(cmat);
+
+  M_Multiply(tposea, tposeb, &alpha, amat, bmat, &beta, cmat);
+  M_Pull(cmat);
+
+  M_DeallocateDevice(amat);
+  M_DeallocateDevice(bmat);
+  M_DeallocateDevice(cmat);
+}
diff --git a/Makefile b/Makefile
index ff06030..44ddf4e 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ endif
 
 lammps : 
 	(rm liblatte.a; cd src; make; cd ..)
-	(cd $(HOME)/lammps/src; touch fix_latte.cpp; make serial; cd -)
+	(cd $(HOME)/lammps/src; touch fix_latte.cpp; make mpi; cd -)
 	
 src : 
 	(rm liblatte.a; cd src; make; cd ..)
diff --git a/makefile.CHOICES b/makefile.CHOICES
index 9000389..3a1b9ea 100644
--- a/makefile.CHOICES
+++ b/makefile.CHOICES
@@ -15,15 +15,15 @@ RANLIB = /usr/bin/ranlib
 
 # Use PROGRESS and BML libraries
 PROGRESS = OFF
-PROGRESS_PATH= $(HOME)/qmd-progress/install/lib
-BML_PATH= $(HOME)/bml/install/lib
+PROGRESS_PATH= $(HOME)/qmd-progress/install/lib64
+BML_PATH= $(HOME)/bml/install/lib64
 
 # Use METIS library for graph partitioning
 METIS = OFF
 METIS_PATH= $(HOME)/metis/metis-5.1.0/install
 
 # GPU available - OFF or ON
-GPUOPT = OFF
+GPUOPT = ON
 
 # Using DBCSR library from cp2k? OFF or ON
 DBCSR_OPT = OFF
@@ -35,35 +35,17 @@ MPIOPT = OFF
 # CPU Fortran options
 #
 
-#For GNU compiler:
-#FC = mpif90
-FC = gfortran
+FC = xlf90_r
 FCL = $(FC)
-FFLAGS = -O3 -fopenmp -cpp
-#FFLAGS =  -fast -Mpreprocess -mp
-LINKFLAG = -fopenmp
+# Optimization flags:
+FFLAGS = -qessl -qstrict=all -qsmp=omp -O2 -qextname 
+# Debug flags:
+FFLAGS += -g0 -qreport
+LINKFLAG = -qsmp=omp 
 
-#For intel compiler:
-#FC = ifort
-#FCL = $(FC)
-#FFLAGS =  -O3 -fpp -qopenmp
-#LINKFLAG = -qopenmp
-#LIB = -mkl=parallel
-
-#GNU BLAS/LAPACK libraries:
-LIB = -llapack -lblas
-
-#Intel MKL BLAS/LAPACK libraries:
-#LIB = -Wl,--no-as-needed -L${MKLROOT}/lib/intel64 \
-# -lmkl_lapack95_lp64 -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core \
-# -lmkl_gnu_thread -lmkl_core -ldl -lpthread -lm
-
-#Alternative flags for MKL:
-#LIB += -mkl=parallel
-
-#Other BLAS/LAPACK vendors:
-#LIB = -framework Accelerate
-#LIB = -L/usr/projects/hpcsoft/toss2/common/acml/5.3.1/gfortran64/lib -lacml
+#BLAS/LAPACK libraries:
+LIB = -L${OLCF_ESSL_ROOT}/lib64/ -lessl -lesslsmp -lesslsmpcuda -qextname
+LIB += -lxlopt -lxlf90_r -lxlfmath -lxl -lxlsmp 
 
 # Uncomment for coverage
 #CVR = OFF
@@ -82,12 +64,10 @@ ifeq ($(GRAPH), ON)
 	FFLAGS += -I$(METIS_PATH)/include
 endif
 
-#DBCSR_LIB = -L/home/cawkwell/cp2k/lib/cawkwell/popt -lcp2k_dbcsr_lib
-#DBCSR_MOD = -I/home/cawkwell/cp2k/obj/cawkwell/popt
-
 #
 # GPU options
 #
 
-GPU_CUDA_LIB = -L/opt/cudatoolkit-5.5/lib64 -lcublas -lcudart
-GPU_ARCH = sm_20 
+GPU_CUDA_LIB = -L${OLCF_CUDA_ROOT}/lib64/ -lcublas -lcudart
+
+GPU_ARCH = sm_70 

From 585c009f132af9a8a986c304c1f6ddf5dfb7638e Mon Sep 17 00:00:00 2001
From: cnegre <christianfannegre@gmail.com>
Date: Wed, 27 Feb 2019 17:59:36 -0500
Subject: [PATCH 3/4] Added Makefile and makefile.CHOICES back

---
 Makefile         |  2 +-
 makefile.CHOICES | 50 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 44ddf4e..ff06030 100644
--- a/Makefile
+++ b/Makefile
@@ -14,7 +14,7 @@ endif
 
 lammps : 
 	(rm liblatte.a; cd src; make; cd ..)
-	(cd $(HOME)/lammps/src; touch fix_latte.cpp; make mpi; cd -)
+	(cd $(HOME)/lammps/src; touch fix_latte.cpp; make serial; cd -)
 	
 src : 
 	(rm liblatte.a; cd src; make; cd ..)
diff --git a/makefile.CHOICES b/makefile.CHOICES
index 3a1b9ea..9000389 100644
--- a/makefile.CHOICES
+++ b/makefile.CHOICES
@@ -15,15 +15,15 @@ RANLIB = /usr/bin/ranlib
 
 # Use PROGRESS and BML libraries
 PROGRESS = OFF
-PROGRESS_PATH= $(HOME)/qmd-progress/install/lib64
-BML_PATH= $(HOME)/bml/install/lib64
+PROGRESS_PATH= $(HOME)/qmd-progress/install/lib
+BML_PATH= $(HOME)/bml/install/lib
 
 # Use METIS library for graph partitioning
 METIS = OFF
 METIS_PATH= $(HOME)/metis/metis-5.1.0/install
 
 # GPU available - OFF or ON
-GPUOPT = ON
+GPUOPT = OFF
 
 # Using DBCSR library from cp2k? OFF or ON
 DBCSR_OPT = OFF
@@ -35,17 +35,35 @@ MPIOPT = OFF
 # CPU Fortran options
 #
 
-FC = xlf90_r
+#For GNU compiler:
+#FC = mpif90
+FC = gfortran
 FCL = $(FC)
-# Optimization flags:
-FFLAGS = -qessl -qstrict=all -qsmp=omp -O2 -qextname 
-# Debug flags:
-FFLAGS += -g0 -qreport
-LINKFLAG = -qsmp=omp 
+FFLAGS = -O3 -fopenmp -cpp
+#FFLAGS =  -fast -Mpreprocess -mp
+LINKFLAG = -fopenmp
 
-#BLAS/LAPACK libraries:
-LIB = -L${OLCF_ESSL_ROOT}/lib64/ -lessl -lesslsmp -lesslsmpcuda -qextname
-LIB += -lxlopt -lxlf90_r -lxlfmath -lxl -lxlsmp 
+#For intel compiler:
+#FC = ifort
+#FCL = $(FC)
+#FFLAGS =  -O3 -fpp -qopenmp
+#LINKFLAG = -qopenmp
+#LIB = -mkl=parallel
+
+#GNU BLAS/LAPACK libraries:
+LIB = -llapack -lblas
+
+#Intel MKL BLAS/LAPACK libraries:
+#LIB = -Wl,--no-as-needed -L${MKLROOT}/lib/intel64 \
+# -lmkl_lapack95_lp64 -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core \
+# -lmkl_gnu_thread -lmkl_core -ldl -lpthread -lm
+
+#Alternative flags for MKL:
+#LIB += -mkl=parallel
+
+#Other BLAS/LAPACK vendors:
+#LIB = -framework Accelerate
+#LIB = -L/usr/projects/hpcsoft/toss2/common/acml/5.3.1/gfortran64/lib -lacml
 
 # Uncomment for coverage
 #CVR = OFF
@@ -64,10 +82,12 @@ ifeq ($(GRAPH), ON)
 	FFLAGS += -I$(METIS_PATH)/include
 endif
 
+#DBCSR_LIB = -L/home/cawkwell/cp2k/lib/cawkwell/popt -lcp2k_dbcsr_lib
+#DBCSR_MOD = -I/home/cawkwell/cp2k/obj/cawkwell/popt
+
 #
 # GPU options
 #
 
-GPU_CUDA_LIB = -L${OLCF_CUDA_ROOT}/lib64/ -lcublas -lcudart
-
-GPU_ARCH = sm_70 
+GPU_CUDA_LIB = -L/opt/cudatoolkit-5.5/lib64 -lcublas -lcudart
+GPU_ARCH = sm_20 

From ee9c677bbf4fc472dd80289614400c36f7571e8c Mon Sep 17 00:00:00 2001
From: cnegre <christianfannegre@gmail.com>
Date: Mon, 4 Mar 2019 13:56:41 -0500
Subject: [PATCH 4/4] Added missing lines from getforce left after merging with
 MJC_GPU

---
 src/getforce.F90 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/getforce.F90 b/src/getforce.F90
index d39950f..3311523 100644
--- a/src/getforce.F90
+++ b/src/getforce.F90
@@ -24,20 +24,20 @@ SUBROUTINE GETFORCE
   USE CONSTANTS_MOD
   USE SETUPARRAY
   USE MYPRECISION
+  USE TIMER_MOD
 
   IMPLICIT NONE
+  REAL(LATTEPREC) :: MLSI
   IF (EXISTERROR) RETURN
 
   FTOT = ZERO
 
   IF (KON .EQ. 0) THEN
-
      IF (SPONLY .EQ. 0) THEN
         CALL GRADHSP
      ELSE
         CALL GRADH
      ENDIF
-
      FTOT = TWO * F
 
      IF (BASISTYPE .EQ. "NONORTHO") THEN