diff --git a/qcd/part_cpu/LICENCE b/qcd/part_cpu/LICENCE
new file mode 100644
index 0000000000000000000000000000000000000000..3eff5c81f79472018c6e4406ee1510fbada7c20e
--- /dev/null
+++ b/qcd/part_cpu/LICENCE
@@ -0,0 +1,42 @@
+Copyright and Disclaimer
+Copyright (C) 2008, Forschungszentrum Juelich GmbH, Federal Republic of Germany. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials provided
+ with the distribution.
+
+ * Any publications that result from the use of this software shall
+ reasonably refer to the Research Centre's development.
+
+ * All advertising materials mentioning features or use of this
+ software must display the following acknowledgement:
+
+ This product includes software developed by Forschungszentrum
+ Juelich GmbH, Federal Republic of Germany.
+
+ * Forschungszentrum Juelich GmbH is not obligated to provide the
+ user with any support, consulting, training or assistance of any
+ kind with regard to the use, operation and performance of this
+ software or to provide the user with any updates, revisions or
+ new versions.
+
+THIS SOFTWARE IS PROVIDED BY FORSCHUNGSZENTRUM JUELICH GMBH "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL FORSCHUNGSZENTRUM JUELICH
+GMBH BE LIABLE FOR ANY SPECIAL, DIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE ACCESS, USE OR PERFORMANCE OF
+THIS SOFTWARE.
diff --git a/qcd/part_cpu/applications/QCD/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE b/qcd/part_cpu/applications/QCD/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE
new file mode 100644
index 0000000000000000000000000000000000000000..138b332d8f10d1b0b554247e1f155edc79a8f609
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE
@@ -0,0 +1,108 @@
+#################
+################# UEABS - QCD - BENCHMARKSUITE -- QUICK-USERGUIDE
+#################
+
+This is a very short summary of the general step, which has
+to be performed, to run the UEABS QCD Benchmarksuite on a new
+machine. More information can be found in the documentation of
+the UEABS-QCD BENCHMARKSUITE which is located in in the folder
+./PABS/doc/*
+or under the web-link
+
+http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt
+http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt
+
+The suite works with Jube, which will handle the compilation,
+the submission and the analysis of the Benchmarksuite. On a new
+machine several xml-files has to be added or created.
+This guide will give a short and very quick overview about
+the different steps.
+
+The FIRST STEP on a new machine is to add information about the
+system to the platform-folder located in:
+./PABS/platform
+Here, the new platform has to be added to the xml-file "platform.xml"
+similar to the already xml-templates:
+
+..
+
+
+
+
+The SECOND STEP is to provide a dummy-submit script which has to
+added to a new subdirectory given by:
+
+./PABS/platform/"NEW-PLATFORM"
+
+In the THIRD STEP: Go to the home-directory of the UEABS-QCD-Benchmarksuite
+located in:
+./PABS/applications/QCD/
+Note that the source-files of the kernels are located in "./PABS/applications/QCD/src".
+Here, similar to STEP ONE the xml-files:
+
+compile.xml, execute.xml and analyse.xml
+
+has to be edit, i.e. new xml-templates with the new platform-information
+has to be added.
+
+In the FOURTH STEP the runs will be setup by creating runs-scripts similar to
+"prace-functional-NEW-PLATORM.xml" for a functional test
+and
+"prace-scaling-NEW-PLATORM.xml" for a scaling run.
+Here, several limits of the different codes has to be taken into account, see for
+this the section "Limitation" at the end of this quick-userguide.
+
+In the FIFTH STEP the benchmark can be compiled and ran by using the command:
+
+perl ../../bench/jube prace-functional-"NEW-PLATFORM".xml
+
+in the directory:
+"./PABS/applications/QCD/".
+This will generate a folder "tmp" with subfolder in "./PABS/applications/QCD/"
+where the source-file will be compiled and executed. If the compilation or the submission
+fails, more information can be found in the subdirectories of "tmp". In any cases
+after the generation of the folder "tmp", compilation and submition can be done,
+in principle, without Jube.
+
+In the LAST STEP, the scaling results can be analyzed, by using
+perl ../../bench/jube analyse.xml
+
+LIMITATION:
+
+The different kernels consists of lattice QCD production codes and have several limitations
+in parallelization and lattice volume. Kernel A,C,D and E using a four dimensional
+lattice while in case of kernel B a three dimensional lattice is used. All kernels
+can be parallelized in all direction. The different lattice sizes and parallelization
+has to be declared in the scripts: 'prace-functional-"NEW-PLATFORM".xml' or
+'prace-scaling-NEW-PLATORM.xml'. The limitation for the different kernel are given by:
+
+"pt * px * py * pz = task"
+
+and additional for the Kernel A, D and E
+
+" nt / pt modulo 2 = 0 " and " nt => 4 "
+
+and the same condition for the other pairs
+"{nx,px}, {ny,py}, {nz,pz}". Moreover
+the lattice extends nt, nx, ny and nx has to be even and larger
+than 4.
+
+#######
+####### Please see for further information the Readme-files
+####### which are provided under
+#######
+####### http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt
+####### http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt
+####### or in
+####### ./PABS/doc/*
+#######
+####### Jacob Finkenrath, 2017
+#######
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/README b/qcd/part_cpu/applications/QCD/README
new file mode 100644
index 0000000000000000000000000000000000000000..e5ca95368bcab8545334c73216e2b6619d114301
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/README
@@ -0,0 +1,1007 @@
+The lattice quatum chromodynamics (LQCD) benchmark is a compilation
+of up to five (three in the moment) LQCD kernels. The kernels are:
+
+label : kernel_A
+short label : KA
+kernel origin : Berlin Quantum ChromoDynamics program (BQCD),
+ DEISA benchmark suite
+kernel contact person : Hinnerk Stueben
+kernel code status : 2008/08/25
+problem size parameter : KA_N{X,Y,Z,T}, 4D lattice
+problem run time parameter : KA_MAXITER, iteration steps
+other needed parameter : KA_P{X,Y,Z,T}, distribution of processes in 4D
+ KA_LIBCOMM, see readme section
+ KA_LIBCLOVER, see readme section
+ KA_LIBD, see readme section
+notes :
+
+
+label : kernel_B
+short label : KB
+kernel origin : University of Oulu, Finland
+ DEISA benchmark suite
+kernel contact person : Kari Rummukainen
+kernel code status : 2008/08/22
+problem size parameter : KB_NX, x component of the 3D grid
+ KB_NY, y component of the 3D grid
+ KB_NZ, z component of the 3D grid
+problem run time parameter : KB_MAXITER, iteration steps
+other needed parameter :
+notes : number of processes needs to be a power of 2
+
+
+label : kernel_C
+short label : KC
+kernel origin : privat communication
+
+kernel contact person : Jacob Finkenrath
+kernel code status : 2016/08/24
+problem size parameter : KC_L{X,Y,Z,T}, local size of the 4D grid in {x,y,z,t}-direction
+problem run time parameter :
+other needed parameter : KC_P{X,Y,Z,T}, number of processes in {x,y,z,t}-direction
+notes :
+
+
+label : kernel_D
+short label : KD
+kernel origin : privat communication
+
+kernel contact person : Jacob Finkenrath
+kernel code status : 2016/08/24
+problem size parameter : K_L{X,Y,Z,T}, size of the 4D grid in {x,y,z,t}-direction
+problem run time parameter :
+other needed parameter : K_N{X,Y,Z,T}, number of processes in {x,y,z,t}-direction
+notes :
+
+
+label : kernel_E
+short label : KE
+kernel origin : privat communication
+
+kernel contact person : Stefan Krieg
+kernel code status : 2008/11/10
+problem size parameter : K_L{X,Y,Z,T}, size of the 4D grid in {x,y,z,t}-direction
+problem run time parameter :
+other needed parameter : K_N{X,Y,Z,T}, number of processes in {x,y,z,t}-direction
+notes :
+
+
+######################################################################
+kernel_A README
+
+-----------
+BQCD readme
+-----------
+
+
+
+Note: all base information taken from the
+BQCD document; updated with JuBE and new ported platforms
+
+Subdirectories in src:
+~~~~~~~~~~~~~~~~~~~~
+
+clover routines for the clover improvement
+
+comm communication routines
+
+d multiplication of a vector with "D-slash"
+
+modules (some) Fortran90 modules
+
+platform Makefiles and service routines for various platforms
+
+
+General remarks
+~~~~~~~~~~~~~~
+
+BQCD has been ported to various platforms (see platform/Makefile-*.var):
+
+# Makefile-altix.var - settings on SGI-Altix 3700 and SGI-Altix 4700
+# Makefile-bgl.var - settings on BlueGene/L
+# Makefile-cray.var - settings on Cray T3E and Cray XT4
+# Makefile-hitachi-omp.var - settings on Hitachi SR8000
+# Makefile-hitachi.var - settings on Hitachi SR8000 (pure MPI version)
+# Makefile-hp.var - settings for HP-UX Fortran Compiler
+# Makefile-ibm.var - settings on IBM
+# Makefile-intel.var - settings for Intel Fortran Compiler
+# Makefile-nec.var - settings on NEC SX-8
+# Makefile-sun.var - settings on Sun
+
+The corresponding files
+
+ platform/service-*.F90
+
+contain interfaces to service routines / system calls.
+
+Not all of these files have been used recently. There are kept as a
+starting point.
+
+A "Makefile.var" and a "service.F90" have to be provided in the source
+directory that work correctly with your system.
+The contents of these files is explained in:
+
+ platform/Makefile-EXPLAINED.var
+ platform/service-EXPLAINED.var
+
+"gmake prep-" will create symbolic links accordingly:
+
+berni1> gmake prep-ibm
+gmake prep PLATFORM=ibm
+rm -f Makefile.var service.F90
+ln -s platform/Makefile-ibm.var Makefile.var
+ln -s platform/service-ibm.F90 service.F90
+
+
+
+Resource requirements
+~~~~~~~~~~~~~~~~~~~~
+
+The resource requirements are approximately:
+
+benchmark lattice total memory size of output execution time
+---------------------------------------------------------------------------
+MPP 48*48*48*96 497 GByte 4 GByte 268.2 s at 758.52 GFlop/s
+SMP 24*24*24*48 37 GByte 0.25 GByte 44.4 s at 608.96 GFlop/s
+
+
+Standart porting
+~~~~~~~~~~~~~~~
+
+*** make
+
+The Makefiles use the makro $(MAKE) and the "include" statement. Some
+of Makefile-*.var are quite standard, some require GNU-make.
+
+"make fast" can be used for a parallel "make".
+
+"make fast" builds the binary "bqcd."
+
+Without "make fast" one has to enter:
+
+ make Modules
+ make libs
+ make bqcd
+
+JuBE porting
+~~~~~~~~~~~
+For Altix:
+ Change the following lines in the execution file bensh:
+ the first line: #!/usr/local/bin/perl -w
+ to
+ #!/usr/bin/perl -w
+ the line 1235: $cmd="cp -rp $srcdir/$file $dir/src/";
+to
+ $cmd="cp -rp $srcdir/* $dir/src/";
+
+
+
+
+
+
+*** ANSI C preprocessor
+
+The C preprocessor is needed for building the source. The C
+preprocessor must be able to handle the string concatenation macro "##".
+
+Recent versions of the GNU C Proprocesse do not work because they
+refuse to process the Fortran90 separator "%".
+
+
+*** Service routines and "stderr"
+
+Service routines are needed for aborting, measuring CPU-time, to get
+arguments from the command line, etc. The corresponding routines have
+to be inserted in the file service.F90.
+
+It is assumed that Fortran unit 0 is pre-connected to stderr. If this
+is not the case on your machine you should re-#define STDERR in "defs.h".
+
+For the time measurements it is important to use a time function with
+high resolution in the function "sekunden".
+
+
+
+*** Message passing / Communication library
+
+Originally the communication was programmed with the shmem library on
+a Cray T3E.
+
+Now MPI is mainly used. There is also a single CPU version (that
+needs no communication library) and a combination of shmem for the
+most time consuming part and MPI.
+
+See $(LIBCOMM) in platform/Makefile-EXPLAINED.var and "Hints for
+optimisation" below.
+
+
+*** OpenMP
+
+In addition to setting your compiler's OpenMP option you have to add
+"-D_OPENMP" in "Makefile.var":
+
+ MYFLAGS = ... -D_OPENMP
+
+
+
+Verification
+~~~~~~~~~~~
+
+*** Random numbers
+
+Correctness of random numbers can be checked by:
+
+ make the_ranf_test
+
+The test is done by comparison with reference output. On most
+platforms there is no difference. However, on Intel "diff"
+usually reports differences in the last digit of the floating point
+representation of the random numbers; the integer representations
+match exactly, eg:
+
+< 1 4711 0.5499394951912783
+---
+> 1 4711 0.5499394951912784
+
+
+*** Argments from the command line
+
+Try option -V:
+
+berni1> ./bqcd -V
+ This is bqcd benchmark2
+ input format: 4
+ conf info format: 3
+ MAX_TEMPER: 50
+ real kind: 8
+ version of D: 2
+ D3: buffer vol: 0
+ communication: single_pe + OpenMP
+
+
+
+*** BQCD
+
+To check that the BQCD works correctly execute the following sequence
+of commands:
+
+berni1> cd work
+berni1> ../bqcd input.TEST > out.TEST
+berni1> grep ' %[fim][atc]' out.TEST > out.tmp
+berni1> grep ' %[fim][atc]' out.TEST.reference | diff - out.tmp
+18c18
+< %fa -1 1 0.4319366404 1.0173348431 43 407 38
+---
+> %fa -1 1 0.4319366404 1.0173348433 43 407 38
+
+The test can be run for any domain decomposition and any number of
+threads. In any case result should agree. Floating point numbers
+might differ in the last digit as shown above.
+(In total 20 lines containing floating point numbers are compared.)
+
+
+*** Check sums
+
+BQCD writes restart files in the working directory. The extension of
+the file containing information on the run is ".info". It contains
+check sums of the binary data files (the example was run after the
+test run):
+
+berni1> tail -6 bqcd.000.1.info
+ >BeginCheckSum
+ bqcd.000.1.00.u 286125633 24576
+ bqcd.000.1.01.u 804770858 24576
+ bqcd.000.1.02.u 657813015 24576
+ bqcd.000.1.03.u 3802083338 24576
+ >EndCheckSum
+
+These check sums should be identical to check sums calculated by the
+"cksum" command:
+
+berni1> cksum bqcd.000.1.*.u | awk '{print $3, $1, $2}'
+bqcd.000.1.00.u 286125633 24576
+bqcd.000.1.01.u 804770858 24576
+bqcd.000.1.02.u 657813015 24576
+bqcd.000.1.03.u 3802083338 24576
+
+
+
+Structure of the input
+~~~~~~~~~~~~~~~~~~~~~
+
+run 204 names of restart files will contain "run"
+ can be set to 0
+
+lattice 24 24 24 48 lattice size, can e.g. be modified for
+ weak scaling analysis
+
+processes 1 2 2 4 number of MPI-proceses per direction
+ (1 1 1 1 in the pure OpenMP case)
+
+boundary_conditions_fermions 1 1 1 -1 do not change
+
+beta 5 do not change
+kappa 0.13 do not change
+csw 2.3327 do not change
+
+hmc_test 0 do not change
+hmc_model C do not change
+hmc_rho 0.1 do not change
+hmc_trajectory_length 0.2 do not change
+hmc_steps 10 can be lowered -> shorter execution time
+hmc_accept_first 1 do not change
+hmc_m_scale 3 do not change
+
+start_configuration cold do not change
+start_random default do not change
+
+mc_steps 1 do not change
+mc_total_steps 100 do not change
+
+solver_rest 1e-99 do not change
+solver_maxiter 100 can be lowered -> shorter execution time
+solver_ignore_no_convergence 2 do not change (CG will not converge,
+ the numbers of iterations per call
+ will be exactly "solver_maxiter")
+solver_mre_vectors 7 do not change
+
+
+
+
+
+Hints on optimisation
+~~~~~~~~~~~~~~~~~~~~
+
+Before starting any optimisation one should find the fastest variant
+in the existing code. There are two libraries to look at: $(LIBD) and
+$(LIBCOMM).
+
+
+
+*** LIBCOMM ("communication", directory: comm)
+
+There are the following variants:
+
+lib_single_pe.a: Single CPU version (PE: "processing element").
+
+lib_mpi.a: MPI version.
+
+lib_shmempi.a: shmem for nearest neighbour communication, MPI for the rest.
+
+
+*** Caveat
+
+Not all combinations of LIBD and LIBCOMM have been implemented.
+
+The following combinations should work (lib_mpi.a always works):
+
+LIBD LIBCOMM
+--------------------------------------------------
+libd.a lib_single_pe.a lib_mpi.a
+libd2.a lib_single_pe.a lib_mpi.a lib_schmempi.a
+libd3.a lib_mpi.a
+libd21.a lib_single_pe.a lib_mpi.a lib_schmempi.a
+
+
+
+Rules for time measurements
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In "Makefile.var" "-DTIMING" must always be set:
+
+ MYFLAGS = -DTIMING ...
+
+All time measurements (TIMING_START() ... TIMING_STOP()) must be kept.
+There is one exception: If you restructure routines d() and d_dag()
+it might occur that the current regions of time measurements (which
+are per direction) do not make sense. (For example, this would occur
+when combining loops from more than one direction.)
+
+In that case, please report in addition the best measurement obtained
+with the existing code.
+
+
+
+######################################################################
+kernel_B README
+
+This is the README file for the SU3_AHiggs application benchmark,
+distributed with the DEISA Benchmark Suite:
+http://www.deisa.eu/science/benchmarking/
+
+Last modified by the DEISA Benchmark Team on 2008-08-22.
+
+
+
+-----------------
+SU3_AHiggs readme
+-----------------
+
+
+
+Contents
+--------
+
+1 General description
+2 Code structure
+3 Parallelisation
+4 Building
+5 Execution
+6 Verification
+7 Input data
+8 Output data
+
+
+1 General description
+=====================
+
+SU3_AHiggs is a lattice quantum chromodynamics (QCD) code intended for
+computing the conditions of the Early Universe. Instead of the "full QCD", the
+code applies an effective field theory, which is valid at high
+temperatures. In the effective theory, the lattice is 3D. For this reason,
+SU3_AHiggs stresses different parts of the architecture than the conventional
+QCD applications using 4D lattices.
+
+SU3_AHiggs has roots in the MILC code, but it is heavily rewritten by
+Prof. Kari Rummukainen (University of Oulu, Finland). The code is written
+solely in C and it uses MPI communications. No external libraries are needed
+to run the program.
+
+The directory SU3/src contains several closely related QCD programs:
+
+ * SU3_4D
+ * SU3_AHiggs
+ * SU3_Gauge
+
+In the DEISA benchmarks, only the code SU3_AHiggs is used. If you find errors
+in any of the files in the SU3 package, please contact benchmarking@deisa.eu.
+
+
+2 Code structure
+================
+
+In SU3_AHiggs, the spacetime is discretised and replaced with a 3D cubic
+lattice. Every lattice vertex contains a 3 x 3 traceless Hermitian
+matrix. From each vertex, in turn, there are six edges to nearest-neighbour
+vertices. Edges are 3 x 3 unitary matrices.
+
+The aim of the SU3_AHiggs computation is to generate lattice configurations
+from the microcanonical distribution, which is the statistical equilibrium
+state of the system. The program uses heat-bath and over-relaxation algorithms
+to update lattice vertices and links. The computation starts from a random
+initial configuration.
+
+The main function of SU3_AHiggs is in the file su3h_n/control.c. After the
+initial setup, main calls the function runthis, which in turn calls other
+functions in the SU3 package. If the dataset is sufficiently large, most of
+the computing time is spent on lattice updates (functions updategauge and
+updatehiggs in files su3h_n/updategauge.c and su3h_n/updatehiggs.c). If the
+dataset is too small, in turn, the computation becomes communication
+bound. MPI routines are not called directly but with customised communication
+functions defined in generic/comdefs.h and generic/com_mpi.c.
+
+
+3 Parallelisation
+=================
+
+SU3_AHiggs uses a 3D domain decomposition method for parallelisation. Each MPI
+task communicates with six neighbouring tasks only. The communication routines
+are defined in the files generic/comdefs.h and generic/com_mpi.c. The most
+important routines are:
+
+ * start_get()
+
+ This function starts asynchronous sends and receives required to gather
+ neighbouring lattice vertices and links. The call graph looks like this:
+
+ start_get() --> start_gather() --> MPI_Irecv(), MPI_Isend()
+
+ * wait_get()
+
+ This function waits for receives to finish, ensuring that the data has
+ actually arrived. The call graph looks like this:
+
+ wait_get() --> wait_gather() --> MPI_Wait()
+
+With a 32^3 lattice, the program performs well up to 256 processes. With a
+256^3 lattice, the speedup is almost linear with the number of processes. The
+highest processor number tested so far is 2048. The lattice size and the
+number of iterations are controlled by four user-adjustable parameters.
+
+
+4 Building
+==========
+
+To build SU3_AHiggs with the JUBE tool on a new architecture (NEWARCH), do the
+following steps:
+
+ 1) Create a new top-level XML file for the new architecture
+ (bench-NEWARCH.xml). In this task, you can use the already existing
+ files as a starting point: bench-Cray-XT4-Louhi.xml,
+ bench-IBM-SP4-Jump.xml, and bench-SGI-Altix-HLRB2.xml. Normally you have
+ to change the values of $nodes and $taskspernode only.
+
+ 2) Edit compile.xml: Create a new section , where
+ NEWARCH is the same as in the file
+ DEISA_BENCH/platform/platform.xml. Substitute values in the new compile
+ section with those proper for the new architecture. Normally you need to
+ change #CFLAGS# and #LDFLAGS#. Possibly you want to change #CC# and
+ #MPI_CC# also.
+
+ 3) Run the compile step within the benchmark "test": Edit bench-NEWARCH.xml
+ and make sure that you have:
+
+
+
+ ...
+
+
+ Then run: perl ../../bench/jube -debug bench-NEWARCH.xml
+
+If the compile step fails, go to the directory where JUBE has run the compile
+step:
+
+ tmp/SU3_NEWARCH_test_i000001/.../src
+
+Then try to run the command make manually. Analyze the error and try to fix it
+modifying the file Makefile.defs. After the problem is solved, edit the file
+compile.xml accordingly. If you cannot solve the problem just by editing
+compile.xml, please contact benchmarking@deisa.eu.
+
+
+5 Execution
+===========
+
+To run SU3_AHiggs with the JUBE tool, do the following steps:
+
+ 1) Before running the benchmarks you need an execute script template, such
+ as:
+
+ DEISA_BENCH/platform/Cray-XT4-Louhi/cray_qsub.job.in
+
+ 2) Edit execute.xml: Create a new section , and
+ match the values in the new section with the execute script template.
+
+ 3) Run a benchmark: Select a benchmark by setting active="1" in the file
+ bench-NEWARCH.xml. Then run: perl ../../bench/jube -submit
+ bench-NEWARCH.xml
+
+To run SU3_AHiggs manually (without JUBE), do the following steps:
+
+ 1) Copy the SU3_AHiggs executable to a directory that is accessible from
+ compute nodes. The name of the SU3_AHiggs executable is:
+
+ src/su3h_n/su3_ahiggs
+
+ 2) Copy the input files beta, parameter, and status to the same
+ directory. In the directory input, there are several sets of input files
+ available:
+
+ input/lat_256/* (256^3 lattice, 100 iterations)
+ input/lat_32/* (32^3 lattice, 10000 iterations)
+ input/test/* (32^3 lattice, 100 iterations)
+
+ 3) Start the program with a MPI launcher available in your system, for
+ example:
+
+ aprun -n 8 ./su3_ahiggs
+
+ The test benchmark takes approximately 10 seconds with 8 processor
+ cores. Other benchmarks run longer: approximately 1 minute with 1024
+ cores.
+
+Important: The number of tasks in su3_ahiggs must be a power of 2. Otherwise
+the program cannot layout the lattice, and the execution stops.
+
+
+6 Verification
+==============
+
+JUBE verifies benchmark results automatically as part of the result analysis
+step. In SU3_AHiggs, the verification cannot be done directly by comparing
+benchmark results with some reference results. The reason to this is that the
+results are very sensitive to compiler optimizations and the number of MPI
+tasks as well. This can make results to appear very different if compared with
+the reference results. Everything can still be all right, as long as the
+results are statistically the same.
+
+Therefore SU3_AHiggs uses a statistical comparison test to verify benchmark
+results (Student's t-test). Significance level is chosen to be 1e-4 (correct
+results are rejected once every 10000 times). First 50 iterations are not
+included in the comparison. The reference results are found at:
+
+ reference/lat_256/higgs.out (256^3 lattice, 100 iterations)
+ reference/lat_32/higgs.out (32^3 lattice, 10000 iterations)
+ reference/test/higgs.out (32^3 lattice, 100 iterations)
+
+These files contain the Higgs field at each iteration for a given lattice
+size.
+
+To verify benchmark results manually (without JUBE), do the following steps:
+
+ 1) Copy the executable src/aa/aa to the directory SU3/run.
+
+ 2) Run the following command in the directory SU3:
+
+ perl run/check_results_su3.pl output.xml stdout.log stderr.log \
+ $RUNDIR reference/lat_256
+
+ The environment variable $RUNDIR should point to the directory where
+ SU3_AHiggs has been executed.
+
+ 3) If the benchmark results are correct, the file output.xml includes the
+ following lines:
+
+
+
+
+ If not, the same lines look like this:
+
+
+
+
+
+7 Input data
+============
+
+Input data for SU3_AHiggs consist of three short ASCII files containing
+simulation parameters related to temperature, lattice size, iterations, etc.
+
+For example, the files related to the test benchmark look like this:
+
+input/test/beta:
+
+ betag 12
+ x 0.06
+ y 0.69025056
+
+input/test/parameters:
+
+ nx 32
+ ny 32
+ nz 32
+ micro steps 4
+ n_measurement 1
+ n_correlation 10000
+ w_correlation 100000
+ n_save -1000
+ blocking levels 1
+ level 0 1
+ level 1 1
+
+input/test/status:
+
+ restart 0
+ n_iteration 100
+ n_thermal 0
+ seed 479817384
+ run status
+ iteration
+ time: gauge
+ time: higgs
+ time: rest
+
+It is easy to create new datasets by changing the lattice size (variables nx,
+ny, and nz), number of iterations (n_iteration), and seed number for the
+random number generator (seed). The duration of a simulation is roughly
+proportional to:
+
+ nx * ny * nz * n_iteration
+
+SU3_AHiggs has currently three datasets:
+
+ test 32^3 lattice, 100 iterations
+ small 32^3 lattice, 10000 iterations (artificial dataset)
+ large 256^3 lattice, 100 iterations (real research dataset)
+
+The test dataset is designed to help porting to new architectures. The small
+dataset, in turn, is designed for benchmarking purposes. With it, benchmark
+timings depend strongly on the interconnect speed.
+
+
+8 Output data
+=============
+
+During the benchmarks, SU3_AHiggs writes out its result to the following files:
+
+ correl
+ measure
+ status
+
+Note that the file named status is both input and output file; SU3_AHiggs
+modifies it during the computation. The file measure is a binary file that
+contains simulation results at each iteration. Its contents can be read with
+the tool named aa available in the directory src/aa.
+
+The benchmark timings are written to the standard output. JUBE reads them
+automatically as part of the analysis step. To get benchmark timings manually,
+grep for "total time in seconds" in the standard output.
+
+
+
+######################################################################
+kernel_C README
+
+
+This document is short guide to get started and run the speed tests. For
+more detailed information see the README.extended.
+
+
+PROGRAMS
+
+The benchmark programs are provided in source form and must be
+compiled by the user on the machine that is to be tested.
+
+In addition the openQCD-1.4 package is needed. A tar-file of the
+source code can be obtained from
+
+http://luscher.web.cern.ch/luscher/openQCD/
+
+and should be extracted in the same directory level as this package.
+
+PROGRAM FEATURES
+
+All programs parallelize in 0,1,2,3 or 4 dimensions, depending on what is
+specified at compilation time. They are highly optimized for machines with
+current Intel or AMD processors, but will run correctly on any system that
+complies with the ISO C89 (formerly ANSI C) and the MPI 1.2 standards.
+
+For the purpose of testing and code development, the programs can also
+be run on a desktop or laptop computer. All what is needed for this is
+a compliant C compiler and a local MPI installation such as Open MPI.
+
+
+DOCUMENTATION
+
+The simulation program has a modular form, with strict prototyping and a
+minimal use of external variables. Each program file contains a small number
+of externally accessible functions whose functionality is described at the top
+of the file.
+
+The data layout is explained in various README files and detailed instructions
+are given on how to run the main programs. A set of further documentation
+files are included in the doc directory, where the normalization conventions,
+the chosen algorithms and other important program elements are described.
+
+
+COMPILATION
+
+The compilation of the programs requires an ISO C89 compliant compiler and a
+compatible MPI installation that complies with the MPI standard 1.2 (or later).
+
+In the main and devel directories, a GNU-style Makefile is included which
+compiles and links the programs (just type "make" to compile everything; "make
+clean" removes the files generated by "make"). The compiler options can be set
+by editing the CFLAGS line in the Makefiles.
+
+The Makefiles assume that the following environment variables are set:
+
+ GCC GNU C compiler command [Example: /usr/bin/gcc].
+
+ MPI_HOME MPI home directory [Example: /usr/lib64/mpi/gcc/openmpi].
+ The mpicc command used is the one in $MPI_HOME/mpicc and
+ the MPI libraries are expected in $MPI_HOME/lib.
+
+ MPI_INCLUDE Directory where the mpi.h file is to be found.
+
+All programs are then compiled using the $MPI_HOME/bin/mpicc command. The
+compiler options that can be set in the CFLAGS line depend on which C compiler
+the mpicc command invokes (the GCC compiler command is only used to resolve
+the dependencies on the include files).
+
+
+SSE/AVX ACCELERATION
+
+Current Intel and AMD processors are able to perform arithmetic operations on
+short vectors of floating-point numbers in just one or two machine cycles,
+using SSE and/or AVX instructions. The arithmetic performed by these
+instructions fully complies with the IEEE 754 standard.
+
+Many programs in the module directories include SSE and AVX inline-assembly
+code. On 64bit systems, and if the GNU or Intel C compiler is used, the code
+can be activated by setting the compiler flags -Dx64 and -DAVX, respectively.
+In addition, SSE prefetch instructions will be used if one of the following
+options is specified:
+
+ -DP4 Assume that prefetch instructions fetch 128 bytes at a time
+ (Pentium 4 and related Xeons).
+
+ -DPM Assume that prefetch instructions fetch 64 bytes at a time
+ (Athlon, Opteron, Pentium M, Core, Core 2 and related Xeons).
+
+ -DP3 Assume that prefetch instructions fetch 32 bytes at a time
+ (Pentium III).
+
+These options have an effect only if -Dx64 or -DAVX is set. The option
+-DAVX implies -Dx64.
+
+On recent x86-64 machines with AMD Opteron or Intel Xeon processors, for
+example, the recommended compiler flags are
+
+ -std=c89 -O -mno-avx -DAVX -DPM
+
+For older machines that do not support the AVX instruction set, the
+recommended flags are
+
+ -std=c89 -O -mno-avx -Dx64 -DPM
+
+More aggressive optimization levels such as -O2 and -O3 tend to have little
+effect on the execution speed of the programs, but the risk of generating
+wrong code is higher.
+
+AVX instructions and the option -mno-avx may not be known to old versions
+of the compilers, in which case one is limited to SSE accelerations with
+option string -std=c89 -O -Dx64 -DPM.
+
+
+DEBUGGING FLAGS
+
+For troubleshooting and parameter tuning, it may helpful to switch on some
+debugging flags at compilation time. The simulation program then prints a
+detailed report to the log file on the progress made in specified subprogram.
+
+The available flags are:
+
+-DCGNE_DBG CGNE solver.
+
+-DFGCR_DBG GCR solver.
+
+-FGCR4VD_DBG GCR solver for the little Dirac equation.
+
+-DMSCG_DBG MSCG solver.
+
+-DDFL_MODES_DBG Deflation subspace generation.
+
+-DMDINT_DBG Integration of the molecular-dynamics equations.
+
+-DRWRAT_DBG Computation of the rational function reweighting
+ factor.
+
+
+RUNNING A SIMULATION
+
+The simulation programs reside in the directory "main". For each program,
+there is a README file in this directory which describes the program
+functionality and its parameters.
+
+Running a simulation for the first time requires its parameters to be chosen,
+which tends to be a non-trivial task. The syntax of the input parameter files
+and the meaning of the various parameters is described in some detail in
+main/README.infiles and doc/parms.pdf. Examples of valid parameter files are
+contained in the directory main/examples.
+
+
+EXPORTED FIELD FORMAT
+
+The field configurations generated in the course of a simulation are written
+to disk in a machine-independent format (see modules/misc/archive.c).
+Independently of the machine endianness, the fields are written in little
+endian format. A byte-reordering is therefore not required when machines with
+different endianness are used for the simulation and the physics analysis.
+
+
+AUTHORS
+
+The initial release of the openQCD package was written by Martin Lüscher and
+Stefan Schaefer. Support for Schrödinger functional boundary conditions was
+added by John Bulava. Several modules were taken over from the DD-HMC program
+tree, which includes contributions from Luigi Del Debbio, Leonardo Giusti,
+Björn Leder and Filippo Palombi.
+
+
+ACKNOWLEDGEMENTS
+
+In the course of the development of the openQCD code, many people suggested
+corrections and improvements or tested preliminary versions of the programs.
+The authors are particularly grateful to Isabel Campos, Dalibor Djukanovic,
+Georg Engel, Leonardo Giusti, Björn Leder, Carlos Pena and Hubert Simma for
+their communications and help.
+
+
+LICENSE
+
+The software may be used under the terms of the GNU General Public Licence
+(GPL).
+
+
+BUG REPORTS
+
+If a bug is discovered, please send a report to .
+
+
+ALTERNATIVE PACKAGES AND COMPLEMENTARY PROGRAMS
+
+There is a publicly available BG/Q version of openQCD that takes advantage of
+the machine-specific features of IBM BlueGene/Q computers. The version is
+available at .
+
+The openQCD programs currently do not support reweighting in the quark
+masses, but a module providing this functionality can be downloaded from
+.
+
+Previously generated gauge-field configurations are often used as initial
+configuration for a new run. If the old and new lattices or boundary
+conditions are not the same, the old configuration may however need to be
+adapted, using a field conversion tool such as the one available at
+, before the new run is started.
+
+######################################################################
+kernel_D README
+
+Important compiler defines XXX are (-DXXX)
+MPI -> switch on parallelisation
+PARALLELXYZT -> 4-dimensional parallelisation
+PARALLELXYT -> 3-dim
+PARALLELXT -> 2-dim
+PARALLELT -> 1-dim
+SSE2 -> SSE2 inline assembly (to be used with one of the two following)
+P4 -> pentium 4
+OPTERON -> opteron
+_GAUGE_COPY -> non-strided memory access for gauge fields, but more memory required
+BGL -> Blue Gene /L
+BGP -> Blue Gene /P, to be used on top of BGL
+
+If none of them are used, you will get a serial version of the program.
+
+The local lattice size in the case of the one dimensional
+prallelisation is controlled by the parameters in the file
+benchmark.input:
+
+T = 32
+L = 16
+
+which will give a 32 x 16^3 global lattice.
+
+NrXProcs = 2
+
+needs only to be set in case of a parallel compilation and sets
+the number of processes in x-direction. The same holds for NrYProcs and NrZProcs.
+The number of processes in
+t-direction is computed from NrX|Y|ZProcs and the total number of processes.
+You should only take care that all this fits with the lattice size.
+
+the package size of the data that are send and recieved is
+192 * (1/2) * L^3 Byte in case of the one dimensional parallelisation.
+In case of the two dimensional parallelisation it is
+192 * (1/2) ((L*L*L/N_PROC_X)+(T*L*L)) Byte.
+
+A run of the benchmark takes about one minute.
+
+The out-put of the program is something like this: (T=2,L=16)
+
+The number of processes is 12
+The local lattice size is 2 x 16 ^3
+total time 4.681349e+00 sec, Variance of the time 6.314982e-03 sec
+
+ (297 Mflops [64 bit arithmetic])
+
+communication switched off
+ (577 Mflops [64 bit arithmetic])
+
+The size of the package is 393216 Byte
+The bandwidth is 84.49 + 84.49 MB/sec
+
+
+If you use the serial version of course the part depending on the
+parallel setup will be missing.
+
+
+Compilation examples (you need a c-compiler with c99 standard, otherwise you may need to define inline, restrict etc. to nothing):
+
+in general (gcc)
+gcc -std=c99 -I. -I./ -I.. -o benchmark -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm
+
+gcc and OPTERON (64 Bit architecture):
+gcc -std=c99 -I. -I./ -I.. -o benchmark -DOPTERON -DSSE2 -mfpmath=387 -fomit-frame-pointer -ffloat-store -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm
+
+gcc and pentium4:
+gcc -std=c99 -I. -I./ -I.. -o benchmark -DSSE2 -DP4 -march=pentium4 -malign-double -fomit-frame-pointer -ffloat-store -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm
+
+mpicc (gcc) general, four dimensional parallelisation:
+mpicc -std=c99 -I. -I./ -I.. -o benchmark -O3 -DMPI -DPARALLELXYZT -D_GAUGE_COPY -O Hopping_Matrix.c Hopping_Matrix_nocom.c xchange_deri.c xchange_field.c xchange_gauge.c xchange_halffield.c xchange_lexicfield.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c init_dirac_halfspinor.c -lm
+
+
+######################################################################
+kernel_E README
+
+none
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/analyse.xml b/qcd/part_cpu/applications/QCD/analyse.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e3d204ce271e43f7ca6b105360161edfffcf5c6b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/analyse.xml
@@ -0,0 +1,97 @@
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+
+
+ (cd $outdir; bash collectData.sh)
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/compile.xml b/qcd/part_cpu/applications/QCD/compile.xml
new file mode 100644
index 0000000000000000000000000000000000000000..045ca1891c60d323ca71994f1e12c4fd02f92506
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/compile.xml
@@ -0,0 +1,2071 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ( cd kernel_A; sh fixLouhiPP.sh; cd ..; source /opt/modules/default/init/sh; `index('$CRAYPAT','on')==0 ? 'module load xt-craypat;' : ' '` make `index('$CRAYPAT','on')==0 ? '; pat_build -v -u -g $CRAYPAT_GROUP -o ${execname}+pat $execname; cp ${execname}+pat $execname' : ' '`)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ( cd kernel_A; sh fixLouhiPP.sh; cd ..; source /opt/modules/default/init/sh; `index('$CRAYPAT','on')==0 ? 'module load xt-craypat;' : ' '` make `index('$CRAYPAT','on')==0 ? '; pat_build -v -u -g $CRAYPAT_GROUP -o ${execname}+pat $execname; cp ${execname}+pat $execname' : ' '`)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ( cd kernel_A; sh fixLouhiPP.sh; cd ..; source /opt/modules/default/init/sh; `index('$CRAYPAT','on')==0 ? 'module load xt-craypat;' : ' '` make `index('$CRAYPAT','on')==0 ? '; pat_build -v -u -g $CRAYPAT_GROUP -o ${execname}+pat $execname; cp ${execname}+pat $execname' : ' '`)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (`index('$PAPI','on')==0 ? 'module load papi;' : ' '` `index('$IHPCT_HWC','on')==0 || index('$IHPCT_MPITR','on')==0 ? 'module load hpct;' : ' '` make)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (make)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (make)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (cd kernel_A; sh fixLouhiPP.sh; cd ..; make)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (`index('$IHPCT_HWC','on')==0 || index('$IHPCT_MPITR','on')==0 ? 'module load ihpct;' : ' '` make)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (cd kernel_A; sh fixLouhiPP.sh; cd ..; make)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (cd kernel_A; sh fixLouhiPP.sh; cd ..; make)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (cd kernel_A; sh fixLouhiPP.sh; cd ..; make)
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/execute.xml b/qcd/part_cpu/applications/QCD/execute.xml
new file mode 100644
index 0000000000000000000000000000000000000000..deae4afe9d42603614e9d4e928d3fa823ac46d9e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/execute.xml
@@ -0,0 +1,450 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ llsubmit ibm_llsubmit.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ qsub -q prace cray_qsub.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ qsub cray_PBSsubmit.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ qsub intel_PBSsubmit.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ msub intel_PBSsubmit.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ llsubmit ibm_llsubmit.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ llsubmit ibm_llsubmit.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ llsubmit ibm_llsubmit.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ llsubmit ibm_llsubmit.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ sbatch intel_SLURMsubmit.job
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ qsub intel_PBSsubmit.job
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/input/kernel_A.input b/qcd/part_cpu/applications/QCD/input/kernel_A.input
new file mode 100644
index 0000000000000000000000000000000000000000..9a8e24d12eae37c64bc6b35e6e1cb316a54ca851
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/input/kernel_A.input
@@ -0,0 +1,29 @@
+run 0
+
+lattice 2 2 2 2
+processes 1 1 1 1
+boundary_conditions_fermions 1 1 1 -1
+
+beta 5
+kappa 0.13
+csw 2.3327
+h 0
+
+hmc_test 0
+hmc_model C
+hmc_rho 0.1
+hmc_trajectory_length 0.2
+hmc_steps 10
+hmc_accept_first 1
+hmc_m_scale 3
+
+start_configuration cold
+start_random default
+
+mc_steps 1
+mc_total_steps 100
+
+solver_rest 1e-99
+solver_maxiter 50
+solver_ignore_no_convergence 2
+solver_mre_vectors 7
diff --git a/qcd/part_cpu/applications/QCD/input/kernel_A.input.in b/qcd/part_cpu/applications/QCD/input/kernel_A.input.in
new file mode 100644
index 0000000000000000000000000000000000000000..b5ef4d8645bb9e63ef1cf08e31fe14dab6476069
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/input/kernel_A.input.in
@@ -0,0 +1,29 @@
+run 0
+
+lattice #KA_LATTICE#
+processes #KA_PROCESSES#
+boundary_conditions_fermions 1 1 1 -1
+
+beta 5
+kappa 0.13
+csw 2.3327
+h 0
+
+hmc_test 0
+hmc_model C
+hmc_rho 0.1
+hmc_trajectory_length 0.2
+hmc_steps 10
+hmc_accept_first 1
+hmc_m_scale 3
+
+start_configuration cold
+start_random default
+
+mc_steps 1
+mc_total_steps 100
+
+solver_rest 1e-99
+solver_maxiter #KA_MAXITER#
+solver_ignore_no_convergence 2
+solver_mre_vectors 7
diff --git a/qcd/part_cpu/applications/QCD/input/kernel_B.input.beta.in b/qcd/part_cpu/applications/QCD/input/kernel_B.input.beta.in
new file mode 100644
index 0000000000000000000000000000000000000000..084fefd6335861e219b2a507ebfb35672efc9fe1
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/input/kernel_B.input.beta.in
@@ -0,0 +1,3 @@
+betag 12
+x 0.06
+y 0.69025056
diff --git a/qcd/part_cpu/applications/QCD/input/kernel_B.input.parameters.in b/qcd/part_cpu/applications/QCD/input/kernel_B.input.parameters.in
new file mode 100644
index 0000000000000000000000000000000000000000..9e892b7d77d7a02a143c855e464b52ace48c2350
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/input/kernel_B.input.parameters.in
@@ -0,0 +1,11 @@
+nx #KB_NX#
+ny #KB_NY#
+nz #KB_NZ#
+micro steps 4
+n_measurement 1
+n_correlation 10000
+w_correlation 100000
+n_save -1000
+blocking levels 1
+level 0 1
+level 1 1
diff --git a/qcd/part_cpu/applications/QCD/input/kernel_B.input.status.in b/qcd/part_cpu/applications/QCD/input/kernel_B.input.status.in
new file mode 100644
index 0000000000000000000000000000000000000000..9234430ec16df17bf40e6eb2c9646d2075b70c7f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/input/kernel_B.input.status.in
@@ -0,0 +1,9 @@
+restart 0
+n_iteration #KB_MAXITER#
+n_thermal 0
+seed 989357013
+run status
+iteration
+time: gauge
+time: higgs
+time: rest
diff --git a/qcd/part_cpu/applications/QCD/input/kernel_D.input.in b/qcd/part_cpu/applications/QCD/input/kernel_D.input.in
new file mode 100644
index 0000000000000000000000000000000000000000..585b4875301ed2e9f00d8e232ea2b39e451a006b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/input/kernel_D.input.in
@@ -0,0 +1,7 @@
+L=#KD_L#
+T=#KD_T#
+
+# no of processors per direction, time direction chosen automatically
+NrXProcs = #KD_NP_X#
+NrYProcs = #KD_NP_Y#
+NrZProcs = #KD_NP_Z#
diff --git a/qcd/part_cpu/applications/QCD/input/kernel_E.input.in b/qcd/part_cpu/applications/QCD/input/kernel_E.input.in
new file mode 100644
index 0000000000000000000000000000000000000000..461ca9e2b921991e17fd4cbf83026041d917c179
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/input/kernel_E.input.in
@@ -0,0 +1,15 @@
+#lattice
+nx #KE_NX#
+ny #KE_NY#
+nz #KE_NZ#
+nt #KE_NT#
+totnodes #KE_PROCS#
+
+#wilson
+mass_wilson #KE_WILSON_MASS#
+
+#max iterations
+max_cg_iters #KE_MAXITER#
+
+#etc
+verbose 1
diff --git a/qcd/part_cpu/applications/QCD/patterns-gprof-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-gprof-qcd.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e4f83d1f3925fb72b10ac27384b05832ca771b8e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/patterns-gprof-qcd.xml
@@ -0,0 +1,20 @@
+
+
+ JuBE: gprof: proc 1:\s*$patwrd\s*$patnfp
+ JuBE: gprof: proc 1:\s*$patnwrd\s*$patfp
+
+ JuBE: gprof: proc 2:\s*$patwrd\s*$patnfp
+ JuBE: gprof: proc 2:\s*$patnwrd\s*$patfp
+
+ JuBE: gprof: proc 3:\s*$patwrd\s*$patnfp
+ JuBE: gprof: proc 3:\s*$patnwrd\s*$patfp
+
+ JuBE: gprof: proc 4:\s*$patwrd\s*$patnfp
+ JuBE: gprof: proc 4:\s*$patnwrd\s*$patfp
+
+ JuBE: gprof: proc 5:\s*$patwrd\s*$patnfp
+ JuBE: gprof: proc 5:\s*$patnwrd\s*$patfp
+
+ JuBE: gprof: proc $patint:\s+$patwrd\s+$patfp
+
+
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/patterns-ihpct-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-ihpct-qcd.xml
new file mode 100644
index 0000000000000000000000000000000000000000..9b0d9be02118c3150871797ecba74dead0efa088
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/patterns-ihpct-qcd.xml
@@ -0,0 +1,31 @@
+
+
+ IHPCT: libHPM: in section kernel_A: PM_FPU_FLOP:\s*$patint
+ IHPCT: libHPM: in section kernel_B: PM_FPU_FLOP:\s*$patint
+ IHPCT: libHPM: in section kernel_C: PM_FPU_FLOP:\s*$patint
+ IHPCT: libHPM: in section kernel_D: PM_FPU_FLOP:\s*$patint
+ IHPCT: libHPM: in section kernel_E: PM_FPU_FLOP:\s*$patint
+
+ IHPCT: libHPM: in section kernel_A: \% of peak performance:\s*$patint
+ IHPCT: libHPM: in section kernel_B: \% of peak performance:\s*$patint
+ IHPCT: libHPM: in section kernel_C: \% of peak performance:\s*$patint
+ IHPCT: libHPM: in section kernel_D: \% of peak performance:\s*$patint
+ IHPCT: libHPM: in section kernel_E: \% of peak performance:\s*$patint
+
+ IHPCT: libHPM: in section kernel_A: number of load/stores per L1 miss:\s*$patint
+ IHPCT: libHPM: in section kernel_B: number of load/stores per L1 miss:\s*$patint
+ IHPCT: libHPM: in section kernel_C: number of load/stores per L1 miss:\s*$patint
+ IHPCT: libHPM: in section kernel_D: number of load/stores per L1 miss:\s*$patint
+ IHPCT: libHPM: in section kernel_E: number of load/stores per L1 miss:\s*$patint
+
+ IHPCT: libHPM: in section kernel_A: $patwrd:\s+$patint
+ IHPCT: libHPM: in section kernel_B: $patwrd:\s+$patint
+ IHPCT: libHPM: in section kernel_C: $patwrd:\s+$patint
+ IHPCT: libHPM: in section kernel_D: $patwrd:\s+$patint
+ IHPCT: libHPM: in section kernel_E: $patwrd:\s+$patint
+
+ IHPCT: libHPM: in section QCD: $patwrd:\s+$patint
+
+ IHPCT: MPITracer: median communication time =\s*$patfp\s*sec
+
+
diff --git a/qcd/part_cpu/applications/QCD/patterns-jube-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-jube-qcd.xml
new file mode 100644
index 0000000000000000000000000000000000000000..80081f279e56ed788d6fedb72ebcb21021f214d1
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/patterns-jube-qcd.xml
@@ -0,0 +1,45 @@
+
+
+ JuBE: total mean run time: $patfp
+ JuBE: total mean run time: $patfp
+
+ JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp
+ JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp
+ JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp
+ JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp
+ JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp
+
+ JuBE global mean timing statistics:\s*kernel_A\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_B\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_C\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_D\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_E\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp
+
+ JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+
+ JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp
+
+ JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp
+ JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp
+
+
+ JuBE: total max mem:\s*$patint
+
+ JuBE: max mem for kernel_A:\s*$patint
+ JuBE: max mem for kernel_B:\s*$patint
+ JuBE: max mem for kernel_C:\s*$patint
+ JuBE: max mem for kernel_D:\s*$patint
+ JuBE: max mem for kernel_E:\s*$patint
+
+
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/patterns-papi-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-papi-qcd.xml
new file mode 100644
index 0000000000000000000000000000000000000000..caf68d1e3ac62256b676feb1501593a6bfbdb8a6
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/patterns-papi-qcd.xml
@@ -0,0 +1,17 @@
+
+
+ JuBE: PAPI counter for kernel_A: PAPI_TOT_CYC:\s*$patint
+ JuBE: PAPI counter for kernel_B: PAPI_TOT_CYC:\s*$patint
+ JuBE: PAPI counter for kernel_C: PAPI_TOT_CYC:\s*$patint
+ JuBE: PAPI counter for kernel_D: PAPI_TOT_CYC:\s*$patint
+ JuBE: PAPI counter for kernel_E: PAPI_TOT_CYC:\s*$patint
+
+ JuBE: PAPI counter for kernel_A: PAPI_FP_OPS:\s*$patint
+ JuBE: PAPI counter for kernel_B: PAPI_FP_OPS:\s*$patint
+ JuBE: PAPI counter for kernel_C: PAPI_FP_OPS:\s*$patint
+ JuBE: PAPI counter for kernel_D: PAPI_FP_OPS:\s*$patint
+ JuBE: PAPI counter for kernel_E: PAPI_FP_OPS:\s*$patint
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-cartesius.xml b/qcd/part_cpu/applications/QCD/prace-functional-cartesius.xml
new file mode 100644
index 0000000000000000000000000000000000000000..bdd7b2cafe16fa5351e7f62f010f744948b84e10
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-cartesius.xml
@@ -0,0 +1,188 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-cartesius_24.xml b/qcd/part_cpu/applications/QCD/prace-functional-cartesius_24.xml
new file mode 100644
index 0000000000000000000000000000000000000000..f7546843fd11e6de8d12fcfabc9214a01301037c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-cartesius_24.xml
@@ -0,0 +1,188 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-hector.xml b/qcd/part_cpu/applications/QCD/prace-functional-hector.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e600a4b7adf4db00a4366d4adf9f69a550778f0a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-hector.xml
@@ -0,0 +1,182 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-hermit.xml b/qcd/part_cpu/applications/QCD/prace-functional-hermit.xml
new file mode 100644
index 0000000000000000000000000000000000000000..80fc4c95f3310dc60b8e409f0bd9f0a2b326d375
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-hermit.xml
@@ -0,0 +1,186 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-huygens.xml b/qcd/part_cpu/applications/QCD/prace-functional-huygens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..b9228ca77c72197f8317b3a22f6ecbdba831e3e4
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-huygens.xml
@@ -0,0 +1,197 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-jugene.xml b/qcd/part_cpu/applications/QCD/prace-functional-jugene.xml
new file mode 100644
index 0000000000000000000000000000000000000000..489fe4fb1ba0c007dfaf509918051739af4ea8db
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-jugene.xml
@@ -0,0 +1,194 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-juqueen.xml b/qcd/part_cpu/applications/QCD/prace-functional-juqueen.xml
new file mode 100644
index 0000000000000000000000000000000000000000..68af37a2c7eacd6d68adab42e56870cdeaa1406b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-juqueen.xml
@@ -0,0 +1,198 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-juropa.xml b/qcd/part_cpu/applications/QCD/prace-functional-juropa.xml
new file mode 100644
index 0000000000000000000000000000000000000000..0b83b7abb4a504f6da2eb7cbe08e53d32bfb9085
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-juropa.xml
@@ -0,0 +1,188 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-louhi.xml b/qcd/part_cpu/applications/QCD/prace-functional-louhi.xml
new file mode 100644
index 0000000000000000000000000000000000000000..8544acda0cc3ad228d6546257290205998c82ec7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-louhi.xml
@@ -0,0 +1,184 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-marconi.xml b/qcd/part_cpu/applications/QCD/prace-functional-marconi.xml
new file mode 100644
index 0000000000000000000000000000000000000000..0557883ef7a1f527fcb10b0f9eefc9dd67222771
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-marconi.xml
@@ -0,0 +1,188 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-marconi2.xml b/qcd/part_cpu/applications/QCD/prace-functional-marconi2.xml
new file mode 100644
index 0000000000000000000000000000000000000000..19b2d1f8b82bf1585eca082c89626c31c52d8664
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-marconi2.xml
@@ -0,0 +1,188 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-functional-supermuc.xml b/qcd/part_cpu/applications/QCD/prace-functional-supermuc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..b3aedd187111f9acf057d0fe77e951fff67cc656
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-functional-supermuc.xml
@@ -0,0 +1,189 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-cartesius.xml b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1a12d10dfcb2a9c6b59660711767e9d8d2b2543a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius.xml
@@ -0,0 +1,223 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-cartesius_24.xml b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius_24.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a6f7c403a116a204d4a4968b01ce0c6b78df6d1e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius_24.xml
@@ -0,0 +1,192 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-curie.xml b/qcd/part_cpu/applications/QCD/prace-scaling-curie.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ff6563a561f25773b5a8a3e4c113bd518c9699d2
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-curie.xml
@@ -0,0 +1,193 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-hector-medium.xml b/qcd/part_cpu/applications/QCD/prace-scaling-hector-medium.xml
new file mode 100644
index 0000000000000000000000000000000000000000..57ae9c7365f668fcf76eb308be554a183de1fae0
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-hector-medium.xml
@@ -0,0 +1,192 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-hector-small.xml b/qcd/part_cpu/applications/QCD/prace-scaling-hector-small.xml
new file mode 100644
index 0000000000000000000000000000000000000000..8d90296be638349041bcbadbaeecf422d8346cd8
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-hector-small.xml
@@ -0,0 +1,192 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-hermit.xml b/qcd/part_cpu/applications/QCD/prace-scaling-hermit.xml
new file mode 100644
index 0000000000000000000000000000000000000000..94908820eb67a5172cabf59e860daa3bf309b734
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-hermit.xml
@@ -0,0 +1,192 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-huygens.xml b/qcd/part_cpu/applications/QCD/prace-scaling-huygens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..adcf6b83f2bc6269f13acd8b1e211dc8427fa317
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-huygens.xml
@@ -0,0 +1,202 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-jugene.xml b/qcd/part_cpu/applications/QCD/prace-scaling-jugene.xml
new file mode 100644
index 0000000000000000000000000000000000000000..09ad25ce3c09ad9ec355b017df99d0e294f3a546
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-jugene.xml
@@ -0,0 +1,204 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-juqueen.xml b/qcd/part_cpu/applications/QCD/prace-scaling-juqueen.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e09e5907dc13f70738410f9e83376ad04eae2a16
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-juqueen.xml
@@ -0,0 +1,204 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-juropa.xml b/qcd/part_cpu/applications/QCD/prace-scaling-juropa.xml
new file mode 100644
index 0000000000000000000000000000000000000000..9a64dd3625e4b706d7eae32bc0ed07115a9a6830
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-juropa.xml
@@ -0,0 +1,193 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-louhi.xml b/qcd/part_cpu/applications/QCD/prace-scaling-louhi.xml
new file mode 100644
index 0000000000000000000000000000000000000000..519be3d8fd83344f08f2d54c4d9586b73d3131e8
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-louhi.xml
@@ -0,0 +1,185 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-marconi.xml b/qcd/part_cpu/applications/QCD/prace-scaling-marconi.xml
new file mode 100644
index 0000000000000000000000000000000000000000..b4bdd614eda85b7501d5e1435a094257bf5b18ce
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-marconi.xml
@@ -0,0 +1,194 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-supermuc.xml b/qcd/part_cpu/applications/QCD/prace-scaling-supermuc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..908f78c65c1c224dd9093f4184fd8d43b5432555
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prace-scaling-supermuc.xml
@@ -0,0 +1,194 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/prepare.xml b/qcd/part_cpu/applications/QCD/prepare.xml
new file mode 100644
index 0000000000000000000000000000000000000000..04c33977e48bc8455a0097425df15d4bacc0af8e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/prepare.xml
@@ -0,0 +1,85 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/result.gprof.xml b/qcd/part_cpu/applications/QCD/result.gprof.xml
new file mode 100644
index 0000000000000000000000000000000000000000..b1486a3fd9a10889a58be28b26eddebf9c34205b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/result.gprof.xml
@@ -0,0 +1,9 @@
+
+
+ KERNELS, GPROF_01_NAME, GPROF_01_PART, GPROF_02_NAME, GPROF_02_PART, GPROF_03_NAME, GPROF_03_PART
+
+
+
+ name
+
+
diff --git a/qcd/part_cpu/applications/QCD/result.hwc.xml b/qcd/part_cpu/applications/QCD/result.hwc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..b965eae3c00291053b4bdc0e3a506e17728c0170
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/result.hwc.xml
@@ -0,0 +1,13 @@
+
+
+ KERNELS, walltime, HWC_FLOP_KA_avg, HWC_FLOP_KB_avg, HWC_FLOP_KC_avg
+
+
+
+ name
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/result.mpi.xml b/qcd/part_cpu/applications/QCD/result.mpi.xml
new file mode 100644
index 0000000000000000000000000000000000000000..085cb5e36095059f84d6a558de5b6c66ddc5258e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/result.mpi.xml
@@ -0,0 +1,9 @@
+
+
+ KERNELS, walltime, MPI_COMM_TIME_avg, MPI_COMM_TIME_std
+
+
+
+ name
+
+
diff --git a/qcd/part_cpu/applications/QCD/result.wct.xml b/qcd/part_cpu/applications/QCD/result.wct.xml
new file mode 100644
index 0000000000000000000000000000000000000000..faf08f81146ee3bd87fb054851fb30d3352d7015
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/result.wct.xml
@@ -0,0 +1,9 @@
+
+
+ KERNELS, walltime, KA_wct, KB_wct, KC_wct, KD_wct
+
+
+
+ name
+
+
diff --git a/qcd/part_cpu/applications/QCD/result.xml b/qcd/part_cpu/applications/QCD/result.xml
new file mode 100644
index 0000000000000000000000000000000000000000..429be125b68e6e3317ef42d14de491715737cd24
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/result.xml
@@ -0,0 +1,64 @@
+
+
+
+ ncpus, time, time_KA, time_KB, time_KC, time_KD, time_KE
+
+
+
+
+
+ ncpus, KERNELS, optflags, walltime, WCT_KA, WCT_KB, WCT_KC, WCT_KD, WCT_KE, COMMENT
+
+
+
+ ncpus, KERNELS, walltime, TIME_INIT_KA, TIME_RUN_KA, TIME_FINALIZE_KA, KA_NX, KA_NY, KA_NZ, KA_NT
+
+
+ ncpus, KERNELS, walltime, TIME_INIT_KB, TIME_RUN_KB, TIME_FINALIZE_KB, KB_NX, KB_NY, KB_NZ
+
+
+ ncpus, KERNELS, walltime, TIME_INIT_KC, TIME_RUN_KC, TIME_FINALIZE_KC, KC_NX, KC_NY, KC_NZ, KC_NT
+
+
+ ncpus, KERNELS, walltime, TIME_INIT_KD, TIME_RUN_KD, TIME_FINALIZE_KD, KD_L, KD_T
+
+
+ ncpus, KERNELS, walltime, TIME_INIT_KE, TIME_RUN_KE, TIME_FINALIZE_KE, KE_NX, KE_NY, KE_NZ, KE_NT
+
+
+
+
+ KERNELS, walltime, MEM_MAX, MEM_KA, MEM_KB, MEM_KC, MEM_KD, MEM_KE, COMMENT
+
+
+
+ KERNELS, GPROF_01_NAME, GPROF_01_PART, GPROF_02_NAME, GPROF_02_PART, GPROF_03_NAME, GPROF_03_PART
+
+
+
+ KERNELS, walltime, HWC_FLOP_KA_avg, HWC_FLOP_KB_avg, HWC_FLOP_KC_avg, HWC_FLOP_KD_avg, HWC_FLOP_KE_avg
+
+
+
+ KERNELS, walltime, MPI_COMM_TIME
+
+
+
+
+ ncpus, bgconn, optflags, KA_NPROC0, KA_NPROC1, KA_NPROC2, KA_NPROC3
+
+
+
+ ncpus, bgconn, walltime, KA_LATTICE0, KA_LATTICE1, KA_LATTICE2, KA_LATTICE3, KA_LIBCOMM, KA_LIBCLOVER, KA_LIBD
+
+
+
+ jobenddate
+
+
+
+
+ KERNELS, ncpus, subid
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/run/collectData.sh.in b/qcd/part_cpu/applications/QCD/run/collectData.sh.in
new file mode 100644
index 0000000000000000000000000000000000000000..eb4483efcef2739e4964915f0b7bb394ff1687dd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/run/collectData.sh.in
@@ -0,0 +1,24 @@
+rm -f IHPCT.log GPROF.log CRAYPAT.log
+
+##COLLECT_IHPCT_HWC# #JUGENE# sed '$d' < QCD.viz > tmp.viz; mv tmp.viz QCD.viz
+ #COLLECT_IHPCT_HWC# #PERL# #BENCHHOME#/../../utils/ihpct/parseHWC.pl QCD*.viz >> IHPCT.log
+
+#COLLECT_IHPCT_MPITR# #PERL# #BENCHHOME#/../../utils/ihpct/parseMPITR.pl mpi_profile.* >> IHPCT.log
+
+#COLLECT_GPROF# #HUYGENS# gprof #EXECUTABLE# profdir*/gmon.out > GPROF.dat; #PERL# #BENCHHOME#/../../utils/gprof/parseGPROF.pl -1 1 GPROF.dat > GPROF.log
+#COLLECT_GPROF# #JUGENE# gprof #EXECUTABLE# gmon.out.* > GPROF.dat; #PERL# #BENCHHOME#/../../utils/gprof/parseGPROF.pl -1 1 GPROF.dat > GPROF.log
+#COLLECT_GPROF# #JUMP# gprof #EXECUTABLE# gmon.*.out > GPROF.dat; #PERL# #BENCHHOME#/../../utils/gprof/parseGPROF.pl -2 1 GPROF.dat > GPROF.log
+
+#COLLECT_CRAYPAT# source /opt/modules/default/init/sh; module load xt-craypat
+
+#COLLECT_CRAYPAT# pat_report -d P -b totals *.xf > CRAYPAT.HWC.dat; pat_report -d P -b totals ./*/*.xf > CRAYPAT.HWC.dat;
+#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl HWC CRAYPAT.HWC.dat >> CRAYPAT.log
+
+#COLLECT_CRAYPAT# pat_report -d flops *.xf > CRAYPAT.FLOPS.dat; pat_report -d flops ./*/*.xf > CRAYPAT.FLOPS.dat;
+#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl FLOPS CRAYPAT.FLOPS.dat >> CRAYPAT.log
+
+#COLLECT_CRAYPAT# pat_report -d am *.xf > CRAYPAT.HEAP.dat; pat_report -d am ./*/*.xf > CRAYPAT.HEAP.dat;
+#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl HEAP CRAYPAT.HEAP.dat >> CRAYPAT.log
+
+#COLLECT_CRAYPAT# pat_report -d time@%0.1 *.xf > CRAYPAT.TIME.dat; pat_report -d time@%0.1 ./*/*.xf > CRAYPAT.TIME.dat;
+#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl TIME CRAYPAT.TIME.dat >> CRAYPAT.log
diff --git a/qcd/part_cpu/applications/QCD/run/verify_qcd.pl b/qcd/part_cpu/applications/QCD/run/verify_qcd.pl
new file mode 100644
index 0000000000000000000000000000000000000000..4626ce67b54506c73fed26614c559401d85c55f0
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/run/verify_qcd.pl
@@ -0,0 +1,32 @@
+#!/usr/local/bin/perl -w
+
+use strict;
+use Carp;
+
+my $patint="([\\+\\-\\d]+)"; # Pattern for Integer number
+my $patfp ="([\\+\\-\\d.Ee]+)"; # Pattern for Floating Point number
+my $patwrd="([\^\\s]+)"; # Pattern for Work (all noblank characters)
+my $patnint="[\\+\\-\\d]+"; # Pattern for Integer number, no ()
+my $patnfp ="[\\+\\-\\d.Ee]+"; # Pattern for Floating Point number, no ()
+my $patnwrd="[\^\\s]+"; # Pattern for Work (all noblank characters), no ()
+my $patbl ="\\s+"; # Pattern for blank space (variable length)
+
+if((scalar @ARGV) != 1) {
+ printf(STDERR "incorrect number of parameter (%d) of $0 (6 required)\n",scalar @ARGV);
+ exit(-1);
+}
+
+my $xmloutfile = $ARGV[0];
+my $vcheck=0;
+my $vcomment="not implemented";
+
+open(XMLOUT,"> $xmloutfile") || die "cannot open file $xmloutfile";
+print XMLOUT "\n";
+print XMLOUT " \n";
+print XMLOUT " \n";
+print XMLOUT "\n";
+print XMLOUT "\n";
+close(XMLOUT);
+
+
+exit(0);
diff --git a/qcd/part_cpu/applications/QCD/sizes.pp b/qcd/part_cpu/applications/QCD/sizes.pp
new file mode 100644
index 0000000000000000000000000000000000000000..d0e69102317db6ed0c92dc78234647091a8fbe81
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/sizes.pp
@@ -0,0 +1,7 @@
+medium problem size
+
+ * KA_NX=KA_NY=32, KA_NZ=KA_NT=64
+ * KB_NX=KB_NY=KB_NC=256
+ * KC_NX=KC_NY=KC_NZ=KC_NT=8
+ * KD_L=KD_T=64
+ * KE_NX=KE_NY=KE_NZ=64, KE_NT=32
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/src/Makefile.in b/qcd/part_cpu/applications/QCD/src/Makefile.in
new file mode 100644
index 0000000000000000000000000000000000000000..f846472788e0196aee61ca5fad45b646474647d0
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/Makefile.in
@@ -0,0 +1,35 @@
+KERNEL_ARCHS = #KERNEL_ARCHS#
+
+qcd-bench: qcd-bench.o $(KERNEL_ARCHS)
+ #LD# #LDFLAGS# -o #EXECNAME# qcd-bench.o $(KERNEL_ARCHS) #LDLIBS#
+
+kernel_A.a:
+ cd kernel_A && gmake prep-#KA_PLATFORM# && gmake kernel
+
+kernel_B.a:
+ cd kernel_B && gmake kernel
+
+kernel_C.a:
+ cd kernel_C && make kernel
+
+kernel_D.a:
+ cd kernel_D && make kernel
+
+kernel_E.a:
+ cd kernel_E && gmake kernel
+
+qcd-bench.o: qcd-bench.c
+ #MPI_CC# #CFLAGS# -c -o qcd-bench.o qcd-bench.c
+
+clean:
+ cd kernel_A && gmake clobber
+ cd kernel_B && make clean
+ cd kernel_C && make clean
+ cd kernel_D && make clean
+ cd kernel_E && gmake clean
+ #RM# -f qcd-bench.o qcd-bench
+ #RM# -f $(KERNEL_ARCHS)
+
+#nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort > all.sort.dat
+#nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort | sort -u > all.sort.unique.dat
+#diff all.sort.dat all.sort.unique.dat | grep '<' > doublicates.dat
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/src/exchangeNames.sh b/qcd/part_cpu/applications/QCD/src/exchangeNames.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b1ab0f9efbf8d47b6e9ffd8d90d7856a31cb71c7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/exchangeNames.sh
@@ -0,0 +1,44 @@
+replaceInFile ()
+{
+ sed "s/${1}(/${2}(/g" $3 > tmp.dat; cp tmp.dat $3
+}
+
+case $1 in
+ "doublicates") echo "generate doublicate list";
+ echo "parse files:\n`ls *.a`";
+ nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort > all.sort.dat
+ nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort | sort -u > all.sort.unique.dat
+ diff all.sort.dat all.sort.unique.dat | grep '<' | cut -f 2- -d '.'> doublicates.dat
+ echo "found `wc -l doublicates.dat` doublicates" ;;
+
+ "find") echo "looking for $2";
+ grep -r $2 `find . -name '*.[c|h|f|f90|F90]'` | cut -f 2- -d '<' > find.dat;
+ cat find.dat;;
+
+ "replace") echo "replace $2 by $3 in $4";
+ replaceInFile $2 $3 $4;;
+
+ "replaceAllFiles") echo "replace all $2 by $3";
+ grep -r $2 `find . -name '*.[c|h|f|f90|F90]'` | cut -f 2- -d '<' | cut -f 1 -d ':' | sort -u > find.dat;
+ for i in `cat find.dat`
+ do
+ echo "replacing $2 by $3 in $i";
+ replaceInFile $2 $3 $i
+ done
+ ;;
+
+ "replaceAll")
+ echo "replace all doublicates in this directory, using doublicate list $2 and postfix $3";
+ for dn in `cat $2`
+ do
+ echo "replace $dn"
+ grep -r $dn `find . -name '*.[c|h|f|f90|F90]'` | cut -f 2- -d '<' | cut -f 1 -d ':' | sort -u > find.dat;
+ for fn in `cat find.dat`
+ do
+ echo "replacing $dn by ${dn}_$3 in $fn";
+ replaceInFile ${dn} ${dn}_${3} ${fn}
+ done
+ done
+ ;;
+
+esac
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7aab461bd39a9afb5243cf59d7563fbf804fe00e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile
@@ -0,0 +1,185 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile
+#
+#===============================================================================
+
+include Makefile.defs
+
+MODULES_DIR = modules
+
+.SUFFIXES:
+.SUFFIXES: .o .F90 .c
+
+
+.F90.f90:
+ $(FPP) $(FPPFLAGS) $< > $@
+
+.F90.o:
+ $(FPP) $(FPPFLAGS) $< > $*.f90
+ $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $*.f90
+
+
+MODULES = $(MODULES_DIR)/*.o
+
+OBJS = \
+ action.o \
+ cg.o \
+ checks.o \
+ $(CKSUM_O) \
+ conf.o \
+ conf_info.o \
+ cooling.o \
+ dsd.o \
+ dsg.o \
+ dsf.o \
+ dsf1.o \
+ dsf2.o \
+ files.o \
+ flip_bc.o \
+ hmc.o \
+ hmc_init_p.o \
+ hmc_init_phi.o \
+ hmc_integrator.o \
+ hmc_forces.o \
+ hmc_leap_frog.o \
+ hmc_test.o \
+ hmc_u.o \
+ h_mult.o \
+ index.o \
+ index2.o \
+ init_common.o \
+ init_modules.o \
+ iteration_count.o \
+ mc.o \
+ misc.o \
+ mre.o \
+ mtdagmt.o \
+ m_tilde.o \
+ polyakov_loop.o \
+ $(RANDOM_O) \
+ sc.o \
+ service.o \
+ staple.o \
+ su3.o \
+ swap.o \
+ timing.o \
+ traces.o \
+ $(UUU_O) \
+ w_mult.o \
+ xyzt2i.o
+
+LIBS = d/$(LIBD) comm/$(LIBCOMM) clover/$(LIBCLOVER)
+
+kernel:
+ cd modules && $(MAKE)
+ cd d && $(MAKE) fast
+ cd comm && $(MAKE) fast
+ cd clover && $(MAKE) fast
+ $(MAKE) ../kernel_A.a
+
+../kernel_A.a: bqcd.o $(MODULES) $(OBJS) $(LIBS)
+ $(AR) $(ARFLAGS) ../kernel_A.a *.o d/*.o modules/*.o comm/*.o clover/*.o
+
+bqcd: bqcd.o $(MODULES) $(OBJS) $(LIBS)
+ $(F90) -o $@ $(LDFLAGS) bqcd.o $(MODULES) $(OBJS) $(LIBS) $(SYSLIBS)
+
+
+fast:
+ cd modules && $(MAKE)
+ cd d && $(MAKE) fast
+ cd comm && $(MAKE) fast
+ cd clover && $(MAKE) fast
+ $(FAST_MAKE) bqcd
+
+clean:
+ rm -f bqcd.[0-9][0-9][0-9].* diag.[0-9][0-9] core app.rif
+ rm -f random_test random_test.dump random_test.out
+ rm -f test_echo
+ rm -f a.out out out1 out2
+ rm -f ../kernel-bqcd.a
+
+tidy: clean
+ rm -f *.[Toid] *.f90 *.mod work.pc work.pcl
+
+clobber: tidy
+ rm -f bqcd
+ $(MAKE) clobber_libs
+ cd modules && $(MAKE) clean
+
+Modules:
+ cd modules && $(MAKE)
+
+libd:
+ cd d && $(MAKE)
+
+libclover: $(MODULES)
+ cd clover && $(MAKE)
+
+libs:
+ cd d && $(MAKE)
+ cd comm && $(MAKE)
+ cd clover && $(MAKE)
+
+clean_libs:
+ cd d && $(MAKE) clean
+ cd comm && $(MAKE) clean
+ cd clover && $(MAKE) clean
+
+clobber_libs:
+ cd d && $(MAKE) clobber
+ cd comm && $(MAKE) clobber
+ cd clover && $(MAKE) clobber
+
+the_ranf_test: ranf.o
+ $(FPP) $(FPPFLAGS) ranf_test.F90 ranf_test.f90
+ $(F90) ranf_test.f90 ranf.o
+ ./a.out | diff - ranf_test.reference
+
+test_echo: test_echo.o service.o
+ $(F90) -o $@ $(LDFLAGS) test_echo.o service.o
+
+prep:
+ rm -f Makefile.var service.F90
+ ln -s platform/Makefile-$(PLATFORM).var Makefile.var
+ ln -s platform/service-$(PLATFORM).F90 service.F90
+
+prep-altix:
+ $(MAKE) prep PLATFORM=altix
+
+prep-bgl:
+ $(MAKE) prep PLATFORM=bgl
+
+prep-cray:
+ $(MAKE) prep PLATFORM=cray
+
+prep-hitachi:
+ $(MAKE) prep PLATFORM=hitachi
+
+prep-hitachi-omp:
+ $(MAKE) prep PLATFORM=hitachi-omp
+
+prep-ibm:
+ $(MAKE) prep PLATFORM=ibm
+
+prep-hp:
+ $(MAKE) prep PLATFORM=hp
+
+prep-intel:
+ $(MAKE) prep PLATFORM=intel
+
+prep-nec:
+ $(MAKE) prep PLATFORM=nec
+
+prep-sun:
+ $(MAKE) prep PLATFORM=sun
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.defs.in
new file mode 100644
index 0000000000000000000000000000000000000000..574fc5d616c56d9a63511edd4fdc39102cead3f5
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.defs.in
@@ -0,0 +1,32 @@
+d3_buffer_vol = #d3_buffer_vol#
+
+SHELL = #SHELL#
+
+FPP = #FPP#
+FPPFLAGS = #FPPFLAGS#
+
+F90 = #MPI_F90#
+FFLAGS = #F90FLAGS#
+
+CC = #MPI_CC#
+CFLAGS = #CFLAGS#
+
+AR = #AR#
+ARFLAGS = #ARFLAGS#
+
+RANLIB = #RANLIB#
+
+LDFLAGS = #LDFLAGS#
+SYSLIBS = #SYSLIBS#
+
+FAST_MAKE = #FAST_MAKE#
+
+RM = #RM#
+
+CKSUM_O = #CKSUM_O#
+RANDOM_O = #RANDOM_O#
+UUU_O = #UUU_O#
+
+LIBD = #KA_LIBD#
+LIBCOMM = #KA_LIBCOMM#
+LIBCLOVER = #KA_LIBCLOVER#
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.in
new file mode 100644
index 0000000000000000000000000000000000000000..b1ee85d3623d75eb8c53b99f22757e8de8c6e937
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.in
@@ -0,0 +1,176 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile
+#
+#===============================================================================
+
+#include Makefile.var
+include Makefile.defs
+
+.SUFFIXES:
+.SUFFIXES: .o .F90 .c
+
+
+.F90.f90:
+ $(FPP) $(FPPFLAGS) $< > $@
+
+.F90.o:
+ $(FPP) $(FPPFLAGS) $< > $*.f90
+ $(F90) -c $(FFLAGS) $*.f90
+
+MODULES_DIR = modules
+
+MODULES = modules/*.o
+
+OBJS = \
+ action.o \
+ cg.o \
+ checks.o \
+ $(CKSUM_O) \
+ conf.o \
+ conf_info.o \
+ cooling.o \
+ dsd.o \
+ dsg.o \
+ dsf.o \
+ dsf1.o \
+ dsf2.o \
+ files.o \
+ flip_bc.o \
+ hmc.o \
+ hmc_init_p.o \
+ hmc_init_phi.o \
+ hmc_integrator.o \
+ hmc_forces.o \
+ hmc_leap_frog.o \
+ hmc_test.o \
+ hmc_u.o \
+ h_mult.o \
+ index.o \
+ index2.o \
+ init_common.o \
+ init_modules.o \
+ iteration_count.o \
+ mc.o \
+ misc.o \
+ mre.o \
+ mtdagmt.o \
+ m_tilde.o \
+ polyakov_loop.o \
+ $(RANDOM_O) \
+ sc.o \
+ service.o \
+ staple.o \
+ su3.o \
+ swap.o \
+ timing.o \
+ traces.o \
+ $(UUU_O) \
+ w_mult.o \
+ xyzt2i.o
+
+LIBS = d/$(LIBD) comm/$(LIBCOMM) clover/$(LIBCLOVER)
+
+bqcd: bqcd.o $(MODULES) $(OBJS) $(LIBS)
+ $(F90) -o $@ $(LDFLAGS) bqcd.o $(MODULES) $(OBJS) $(LIBS) $(SYSLIBS)
+
+
+fast:
+ cd modules && $(MAKE)
+ cd d && $(MAKE) fast
+ cd comm && $(MAKE) fast
+ cd clover && $(MAKE) fast
+ $(FAST_MAKE) bqcd
+ mv bqcd #EXECNAME#
+# mv bqcd ../BQCD_SGI-ALTIX_cname_SGI-ALTIX.exe
+
+clean:
+ rm -f bqcd.[0-9][0-9][0-9].* diag.[0-9][0-9] core app.rif
+ rm -f random_test random_test.dump random_test.out
+ rm -f test_echo
+ rm -f a.out out out1 out2
+
+tidy: clean
+ rm -f *.[Toid] *.f90 *.mod work.pc work.pcl
+
+clobber: tidy
+ rm -f bqcd
+ $(MAKE) clobber_libs
+ cd modules && $(MAKE) clean
+
+Modules:
+ cd modules && $(MAKE)
+
+libd:
+ cd d && $(MAKE)
+
+libclover: $(MODULES)
+ cd clover && $(MAKE)
+
+libs:
+ cd d && $(MAKE)
+ cd comm && $(MAKE)
+ cd clover && $(MAKE)
+
+clean_libs:
+ cd d && $(MAKE) clean
+ cd comm && $(MAKE) clean
+ cd clover && $(MAKE) clean
+
+clobber_libs:
+ cd d && $(MAKE) clobber
+ cd comm && $(MAKE) clobber
+ cd clover && $(MAKE) clobber
+
+the_ranf_test: ranf.o
+ $(FPP) $(FPPFLAGS) ranf_test.F90 ranf_test.f90
+ $(F90) ranf_test.f90 ranf.o
+ ./a.out | diff - ranf_test.reference
+
+test_echo: test_echo.o service.o
+ $(F90) -o $@ $(LDFLAGS) test_echo.o service.o
+
+prep:
+ rm -f Makefile.var service.F90
+ ln -s platform/Makefile-$(PLATFORM).var Makefile.var
+ ln -s platform/service-$(PLATFORM).F90 service.F90
+
+prep-altix:
+ $(MAKE) prep PLATFORM=altix
+
+prep-bgl:
+ $(MAKE) prep PLATFORM=bgl
+
+prep-cray:
+ $(MAKE) prep PLATFORM=cray
+
+prep-hitachi:
+ $(MAKE) prep PLATFORM=hitachi
+
+prep-hitachi-omp:
+ $(MAKE) prep PLATFORM=hitachi-omp
+
+prep-ibm:
+ $(MAKE) prep PLATFORM=ibm
+
+prep-hp:
+ $(MAKE) prep PLATFORM=hp
+
+prep-intel:
+ $(MAKE) prep PLATFORM=intel
+
+prep-nec:
+ $(MAKE) prep PLATFORM=nec
+
+prep-sun:
+ $(MAKE) prep PLATFORM=sun
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.var b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.var
new file mode 100644
index 0000000000000000000000000000000000000000..4058c3904be9f1b63ec9cc2a5939e6a3ac34f158
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.var
@@ -0,0 +1,108 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-altix.var - settings on SGI-Altix
+#
+#-------------------------------------------------------------------------------
+
+timing = 1
+mpi = 1
+omp = 1
+shmem =
+shmempi =
+debug =
+libd = 2
+d3_buffer_vol = 32*32*16*16
+
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+FPP = mpif90 -g -E
+FPP2 = icc -E -C -P
+F90 = mpif90
+CC = mpicc
+AR = ar
+RANLIB = echo
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+MYFLAGS = -DINTEL -DALTIX
+FFLAGS_STD= $(MODULES_FLAG)
+CFLAGS_STD= -DNamesToLower_
+ARFLAGS = rv
+
+LDFLAGS = -Vaxlib
+SYSLIBS =
+
+FAST_MAKE = gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD =
+LIBCOMM = lib_single_pe.a
+LIBCLOVER = libclover.a
+
+#-------------------------------------------------------------------------------
+
+ifdef timing
+ MYFLAGS += -DTIMING
+endif
+
+ifdef mpi
+ LIBCOMM = lib_mpi.a
+endif
+
+ifdef omp
+ F90 += -openmp
+ MYFLAGS += -D_OPENMP
+endif
+
+ifdef shmem
+ LDFLAGS += -lsma
+ LIBCOMM = lib_shmem.a
+endif
+
+ifdef shmempi
+ LDFLAGS += -lsma
+ LIBCOMM = lib_shmempi.a
+endif
+
+ifdef debug
+ FFLAGS = -g -O0 $(FFLAGS_STD)
+ CFLAGS = -g -O0 $(CFLAGS_STD)
+else
+ FFLAGS = -O2 $(FFLAGS_STD)
+ CFLAGS = -O2 $(CFLAGS_STD)
+endif
+
+ifeq ($(libd),1)
+ LIBD = libd.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),2)
+ LIBD = libd2.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),21)
+ LIBD = libd21.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),3)
+ LIBD = libd3.a
+ MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)'
+endif
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/action.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/action.F90
new file mode 100644
index 0000000000000000000000000000000000000000..24ad7a1c5268a8a0adc7042d49432513986f4769
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/action.F90
@@ -0,0 +1,130 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! action.F90 - calculation of actions
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+REAL function sf(para, conf) ! returns S_f
+
+ use typedef_hmc
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+
+ P_SPINCOL_FIELD, save :: a, b
+
+ REAL, external :: dotprod, clover_action
+ integer :: iterations
+ external :: mtdagmt
+
+ if (para%kappa == ZERO) then
+ sf = ZERO
+ else
+ ALLOCATE_SC_FIELD(a)
+ ALLOCATE_SC_FIELD(b)
+
+ call flip_bc(conf%u)
+
+ call sc_copy(a, conf%phi) ! A = phi
+
+ call cg(mtdagmt, a, conf%phi, para, conf, iterations) ! A = inv(M~+ M~) Phi
+ call mtil(b, a, para, conf) ! B = M~ A
+
+ sf = dotprod(b, b, SIZE_SC_FIELD)
+
+ call flip_bc(conf%u)
+ endif
+
+ if (para%csw_kappa /= ZERO) sf = sf + clover_action(conf%b(1,1,ODD))
+end
+
+!-------------------------------------------------------------------------------
+REAL function sg(u) ! returns S_g
+
+ use module_nn
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD :: u
+ REAL :: plaq, global_sum, p
+ SU3 :: uuu
+ integer :: i, e, o, mu, nu, j1, j2
+ REAL, external :: Re_Tr_uu
+
+ TIMING_START(timing_bin_plaq)
+
+ plaq = 0
+
+ do mu = 1, DIM
+ do e = EVEN, ODD
+ o = EVEN + ODD - e
+ do nu = mu + 1, DIM
+ p = ZERO
+ !$omp parallel do reduction(+: p) private(j1, j2, uuu)
+ do i = 1, VOLH
+
+ ! (j2,o) --<-- x nu
+ ! | |
+ ! v ^ ^
+ ! | | |
+ ! (i,e) -->-- (j1,o) x--> mu
+
+
+ j1 = nn(i, e, mu, FWD)
+ j2 = nn(i, e, nu, FWD)
+
+ uuu = 0
+ call uuu_fwd(uuu, u(1, 1, j1, o, nu), &
+ u(1, 1, j2, o, mu), &
+ u(1, 1, i, e, nu))
+
+ p = p + Re_Tr_uu(uuu, u(1, 1, i, e, mu))
+ enddo
+ !$omp end parallel do
+ plaq = plaq + p
+ enddo
+ enddo
+ enddo
+
+ plaq = global_sum(plaq)
+
+ sg = (6 * volume) - plaq / THREE
+
+ TIMING_STOP(timing_bin_plaq)
+
+end
+
+!-------------------------------------------------------------------------------
+REAL function sp(p) ! returns action of momenta p
+
+ use module_vol
+ implicit none
+ REAL, external :: dotprod
+ GENERATOR_FIELD :: p
+
+ integer :: mu, eo
+
+ sp = ZERO
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ sp = sp + dotprod(p(1, 1, eo, mu), p(1, 1, eo, mu), NGEN * volh)
+ enddo
+ enddo
+ sp = sp * HALF
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.F90
new file mode 100644
index 0000000000000000000000000000000000000000..9141c4c2f7c6ad61a5a654ffd8f896df441d13aa
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.F90
@@ -0,0 +1,534 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! bqcd.F90 - main program and read/write of parameters
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+
+! JuBE
+! use kernel_a as a subroutine in the qcd-bench, this was the main function
+! in the original code
+subroutine kernel_a()
+
+ use typedef_flags
+ use typedef_para
+ use module_input
+ use module_function_decl
+ implicit none
+
+ type(type_para) :: para
+ type(hmc_conf), dimension(MAX_TEMPER) :: conf
+ type(type_flags) :: flags
+ SECONDS :: time0, sekunden
+ integer :: kernel_number
+
+ kernel_number = 0
+
+! JuBE
+! call jube initial function
+ call jube_kernel_init(kernel_number)
+
+! JuBE
+! set the flags%input to the inputfile name: bqcd-input
+ flags%input = "kernel_A.input"
+
+ time0 = sekunden() ! start/initialize timer
+
+ TIMING_START(timing_bin_total)
+
+ call comm_init()
+
+! JuBE
+! there is no need for the following call, we ignore all cmd line arguments, non
+! of them but the input file name (set above) is relevant for the benchmark
+! call get_flags(flags)
+
+ call begin(UREC, "Job")
+ call input_read(flags%input)
+ call init_para(para, flags)
+ call init_counter(para, flags)
+ call init_ran(para, flags)
+ call init_cooling(input%measure_cooling_list)
+
+ call set_fmt_ensemble(para%n_temper)
+ call check_fmt(para%run, para%n_temper, para%maxtraj, para%L(4) - 1)
+
+ call init_common(para)
+ call init_modules()
+
+ call write_header(para)
+
+ call init_flip_bc()
+ call init_cg_para(para%cg_rest, para%cg_maxiter, para%cg_log)
+ call init_cg_stat()
+ call init_xbound()
+ call init_confs(para, conf)
+
+ call check_former(para%n_temper, conf)
+
+
+! JuBE
+! call jube kernel run function
+ call jube_kernel_run()
+
+ call mc(para, conf)
+ !!call xbound_test()
+
+! JuBE
+! call jube kernel finalize function
+ call jube_kernel_finalize()
+
+
+ call conf_write(.true., para, conf)
+
+ call write_counter(para%maxtraj)
+ call write_ran()
+
+ TIMING_STOP(timing_bin_total)
+
+ call write_footer(time0)
+ call end_A(UREC, "Job")
+
+ call comm_finalize()
+
+! JuBE
+! call jube kernel end function
+ call jube_kernel_end()
+
+end subroutine kernel_a
+
+!-------------------------------------------------------------------------------
+subroutine init_para(para, flags)
+
+ ! initialises module_para, module_switches and module_mre
+
+ use typedef_flags
+ use typedef_para
+ use module_bqcd
+ use module_input
+ use module_mre
+ use module_switches
+ implicit none
+
+ type(type_para) :: para
+ type(type_flags) :: flags
+ integer :: i
+ logical :: quenched, dynamical, clover, h_ext
+
+ quenched = .false.
+ dynamical = .false.
+ clover = .false.
+ h_ext = .false.
+
+ para%run = input%run
+ para%L = input%lattice
+ para%NPE = input%processes
+ para%bc_fermions = input%boundary_conditions_fermions
+ para%gamma_index = input%gamma_index
+ para%n_temper = input%ensembles
+ para%nstd = input%tempering_steps_without
+ para%nforce = input%hmc_accept_first
+ para%ntraj = input%mc_steps
+ para%maxtraj = input%mc_total_steps
+ para%nsave = input%mc_save_frequency
+ para%c_cg_rest = input%solver_rest
+ para%cg_maxiter = input%solver_maxiter
+ para%cg_log = input%solver_ignore_no_convergence
+ mre_n_vec = input%solver_mre_vectors
+
+ call check_bc_fermions(para%bc_fermions, para%gamma_index)
+
+ read(para%c_cg_rest, *) para%cg_rest
+
+ if (para%n_temper <= 0) call die("init_para(): n_temper <= 0")
+ if (para%n_temper > MAX_TEMPER) call die("init_para(): n_temper > max_temper")
+
+ do i = 1, para%n_temper
+ para%c_hmc(i)%beta = input%beta(i)
+ para%c_hmc(i)%kappa = input%kappa(i)
+ para%c_hmc(i)%csw = input%csw(i)
+ para%c_hmc(i)%h = input%h(i)
+ para%c_hmc(i)%traj_length = input%hmc_trajectory_length(i)
+ para%c_hmc(i)%ntau = input%hmc_steps(i)
+ para%c_hmc(i)%rho = input%hmc_rho(i)
+ para%c_hmc(i)%m_scale = input%hmc_m_scale(i)
+ para%info_file(i) = input%start_info_file(i)
+
+ read(para%c_hmc(i)%beta, *) para%hmc(i)%beta
+ read(para%c_hmc(i)%kappa, *) para%hmc(i)%kappa
+ read(para%c_hmc(i)%csw, *) para%hmc(i)%csw
+ read(para%c_hmc(i)%h, *) para%hmc(i)%h
+ read(para%c_hmc(i)%traj_length,*) para%hmc(i)%traj_length
+ read(para%c_hmc(i)%ntau, *) para%hmc(i)%ntau
+ read(para%c_hmc(i)%rho, *) para%hmc(i)%rho
+ read(para%c_hmc(i)%m_scale, *) para%hmc(i)%m_scale
+
+ if (para%hmc(i)%kappa == ZERO .and. para%hmc(i)%csw /= ZERO) then
+ para%hmc(i)%csw_kappa = para%hmc(i)%csw
+ para%c_hmc(i)%csw = "-1 (infinity)"
+ para%hmc(i)%csw = -1
+ else
+ para%hmc(i)%csw_kappa = para%hmc(i)%csw * para%hmc(i)%kappa
+ call check_csw(para%hmc(i)%beta, para%hmc(i)%csw)
+ endif
+
+ para%hmc(i)%tau = para%hmc(i)%traj_length / para%hmc(i)%ntau
+
+ write(para%c_hmc(i)%csw_kappa, "(f20.15)") para%hmc(i)%csw_kappa
+ write(para%c_hmc(i)%tau, "(f20.15)") para%hmc(i)%tau
+
+ if (para%hmc(i)%kappa == ZERO .and. para%hmc(i)%csw == ZERO) then
+ quenched = .true.
+ else
+ dynamical = .true.
+ endif
+
+ if (para%hmc(i)%csw /= ZERO) clover = .true.
+ if (para%hmc(i)%h /= ZERO) h_ext = .true.
+
+ para%hmc(i)%model = input%hmc_model
+
+ if (para%hmc(i)%model == "A" .and. para%hmc(i)%rho /= ZERO) then
+ call warn("init_para(): model == A but rho /= 0")
+ endif
+
+ if (para%hmc(i)%model /= "A" .and. para%hmc(i)%rho == ZERO) then
+ call warn("init_para(): model /= A but rho == 0")
+ endif
+ enddo
+
+ select case (input%start_configuration)
+ case ("hot"); para%start = START_HOT
+ case ("cold"); para%start = START_COLD
+ case ("file"); para%start = START_FILE
+ case default
+ call die("init_para(): start_configuration must be {hot|cold|file}")
+ end select
+
+ select case (input%start_random)
+ case ("random"); para%seed = -1
+ case ("default"); para%seed = 0
+ case default; read(input%start_random, *) para%seed
+ end select
+
+ select case (input%tempering_swap_sequence)
+ case ("up"); para%swap_seq = SWAP_UP
+ case ("down"); para%swap_seq = SWAP_DOWN
+ case ("random"); para%swap_seq = SWAP_RANDOM
+ case default
+ call die("init_para(): tempering_swap_sequence must be {up|down|random}")
+ end select
+
+ if (quenched .and. dynamical) call die("init_para(): quenched/dynamical mixed")
+
+ if (para%nforce < 0) call die("init_para(): nforce < 0")
+
+ if (flags%continuation_job) para%start = START_CONT
+
+
+ switches%quenched = quenched
+ switches%dynamical = dynamical
+ switches%clover = clover
+ switches%h_ext = h_ext
+ switches%hasenbusch = (input%hmc_model /= "A")
+
+ if (quenched) switches%hasenbusch = .false.
+
+ switches%tempering = .false.
+ switches%measure_polyakov_loop = .false.
+ switches%measure_traces = .false.
+
+ if (input%ensembles > 1) switches%tempering = .true.
+ if (input%measure_polyakov_loop /= 0) switches%measure_polyakov_loop = .true.
+ if (input%measure_traces /= 0) switches%measure_traces = .true.
+
+ if (input%hmc_test == 0) then
+ switches%hmc_test = .false.
+ else
+ switches%hmc_test = .true.
+ endif
+
+end subroutine init_para
+
+!-------------------------------------------------------------------------------
+subroutine init_counter(para, flags)
+
+ use typedef_flags
+ use typedef_para
+ use module_counter
+ use module_function_decl
+ implicit none
+
+ type(type_para) :: para
+ type(type_flags) :: flags
+ FILENAME, external :: count_file, stop_file
+
+ if (f_exist(stop_file())) then
+ call die("init_counter(): found stop file " // stop_file())
+ endif
+
+ counter%run = para%run
+ counter%j_traj = 0
+
+ if (flags%continuation_job) then
+ open(UCOUNT, file = count_file(), action = "read", status = "old")
+ read(UCOUNT, *) counter%run
+ read(UCOUNT, *) counter%job
+ read(UCOUNT, *) counter%traj
+ close(UCOUNT)
+
+ if (counter%run /= para%run) call die("init_counter(): RUN inconsistent")
+ counter%job = counter%job + 1
+ else
+ counter%run = para%run
+ counter%job = 1
+ counter%traj = -para%nforce
+ endif
+
+end subroutine init_counter
+
+!-------------------------------------------------------------------------------
+subroutine write_counter(maxtraj)
+
+ use module_counter
+ use module_function_decl
+ implicit none
+
+ integer :: maxtraj
+ FILENAME, external :: count_file, stop_file
+
+ if (my_pe() /= 0) return
+
+ open(UCOUNT, file = count_file(), action = "write")
+ write(UCOUNT, *) counter%run, " run"
+ write(UCOUNT, *) counter%job, " job"
+ write(UCOUNT, *) counter%traj, " traj"
+ close(UCOUNT)
+
+ if (counter%traj >= maxtraj) then
+ open(UCOUNT, file = stop_file(), status = "unknown")
+ close(UCOUNT)
+ endif
+
+end subroutine write_counter
+
+!-------------------------------------------------------------------------------
+subroutine write_header(para)
+
+ use typedef_para
+ use module_bqcd
+ use module_counter
+ use module_function_decl
+ use module_input
+ use module_mre
+ use module_thread
+ implicit none
+
+ type(type_para) :: para
+ integer :: i
+ character(len = 50) :: fmt
+ character(len = 4), external :: format_ensemble
+
+ if (my_pe() == 0) then
+
+ fmt = "(1x,a," // format_ensemble() // ",2a)"
+
+ call begin(UREC, "Header")
+
+ if (input%comment /= "") then
+ write(UREC, 405) "Comment", trim(input%comment)
+ endif
+
+ write(UREC, 400) "Program", prog_name, prog_version
+ write(UREC, *) "Version_of_D ", version_of_d()
+ write(UREC, *) "Communication ", trim(comm_method())
+ write(UREC, *) "Run ", para%run
+ write(UREC, *) "Job ", counter%job
+ write(UREC, 405) "Host", rechner()
+ write(UREC, 400) "Date", datum(), uhrzeit()
+ write(UREC, 410) "L ", para%L
+ write(UREC, 410) "NPE ", para%NPE
+ write(UREC, 410) "bc_fermions", para%bc_fermions
+ write(UREC, 410) "gamma_index", para%gamma_index
+
+
+ write(UREC, *) "Threads ", n_thread
+ write(UREC, *) "Start ", para%start
+
+ if (para%start == START_FILE) then
+ do i = 1, para%n_temper
+ write(UREC, fmt) "StartConf_", i, " ", trim(para%info_file(i))
+ enddo
+ endif
+
+ write(UREC, *) "Seed ", para%seed
+ write(UREC, *) "Swap_seq", para%swap_seq
+ write(UREC, *) "N_force ", para%nforce
+ write(UREC, *) "N_traj ", para%ntraj
+ write(UREC, *) "N_save ", para%nsave
+ write(UREC, *) "N_temper", para%n_temper
+
+ do i = 1, para%n_temper
+ write(UREC, fmt) "beta_", i, " ", trim(para%c_hmc(i)%beta)
+ write(UREC, fmt) "kappa_", i, " ", trim(para%c_hmc(i)%kappa)
+ write(UREC, fmt) "csw_", i, " ", trim(para%c_hmc(i)%csw)
+ write(UREC, fmt) "csw_kappa_", i, " ", trim(para%c_hmc(i)%csw_kappa)
+ write(UREC, fmt) "h_", i, " ", trim(para%c_hmc(i)%h)
+ write(UREC, fmt) "tau_", i, " ", trim(para%c_hmc(i)%tau)
+ write(UREC, fmt) "N_tau_", i, " ", trim(para%c_hmc(i)%ntau)
+ write(UREC, fmt) "traj_length_", i, " ", trim(para%c_hmc(i)%traj_length)
+ write(UREC, fmt) "rho_", i, " ", trim(para%c_hmc(i)%rho)
+ write(UREC, fmt) "m_scale_", i, " ", trim(para%c_hmc(i)%m_scale)
+ enddo
+
+ write(UREC, *) "HMC_model ", para%hmc(1)%model
+ write(UREC, *) "REAL_kind ", RKIND
+ write(UREC, 405) "CG_rest ", trim(para%c_cg_rest)
+ write(UREC, *) "MRE_vectors ", mre_n_vec
+
+ call end_A(UREC, "Header")
+
+400 format (3(1x,a))
+405 format (2(1x,a))
+410 format (1x,a,4i3)
+
+ endif
+
+end subroutine write_header
+
+!-------------------------------------------------------------------------------
+subroutine write_footer(time0)
+
+ use module_function_decl
+ use module_thread
+ implicit none
+
+ SEED :: seed
+ SECONDS :: time0, sekunden
+
+ call ranget(seed)
+
+ call begin(UREC, "Footer")
+
+ if (my_pe() == 0) then
+ write(UREC, 400) "Date", datum(), uhrzeit()
+ write(UREC, *) "Seed", seed
+ write(UREC, 410) "CPU-Time", &
+ sekunden() - time0, "s on", num_pes() * n_thread, "CPUs"
+ endif
+
+400 format (3(1x,a))
+410 format (1x,a,1x,f8.1,1x,a,1x,i5,1x,a)
+
+ TIMING_WRITE(UREC)
+
+ call end_A(UREC, "Footer")
+
+end subroutine write_footer
+
+!-------------------------------------------------------------------------------
+subroutine get_flags(flags)
+
+ use typedef_cksum
+ use typedef_flags
+ use module_bqcd
+ use module_function_decl
+ use module_input
+ implicit none
+
+ type(type_flags), intent(out) :: flags
+
+ integer :: iarg, length, stat, narg
+ integer, external :: ipxfargc
+ character(len = 2) :: opt
+
+ flags%continuation_job = .false.
+ flags%show_version = .false.
+
+ narg = ipxfargc()
+
+ iarg = 1
+ do while (iarg <= narg)
+ call pxfgetarg(iarg, opt, length, stat)
+
+ if (opt(1:1) == "-") then
+ if (length > 2) call usage()
+
+ select case (opt(2:2))
+ case ("c")
+ flags%continuation_job = .true.
+ iarg = iarg + 1
+ case ("I")
+ call input_dump(6)
+ call comm_finalize()
+ stop
+ case ("V")
+ flags%show_version = .true.
+ iarg = iarg + 1
+ case default
+ call usage
+ end select
+ else
+ exit
+ endif
+ enddo
+
+ if (flags%show_version) then
+ call version()
+ call comm_finalize()
+ stop
+ endif
+
+ call take_arg(iarg, flags%input, narg)
+ if (narg >= iarg) call usage
+
+CONTAINS
+
+ subroutine usage()
+ implicit none
+ call die("Usage: " // prog_name // " [-c] [-I] [-V] input")
+ end subroutine usage
+
+ subroutine version()
+ implicit none
+
+ if (my_pe() == 0) then
+ write(6,*) "This is ", prog_name, " ", prog_version
+ write(6,*) " input format: ", input_version
+ write(6,*) " conf info format:", conf_info_version
+ write(6,*) " MAX_TEMPER: ", MAX_TEMPER
+ write(6,*) " real kind: ", RKIND
+ write(6,*) " version of D: ", version_of_d()
+ write(6,*) " D3: buffer vol: ", get_d3_buffer_vol()
+ write(6,*) " communication: ", trim(comm_method())
+ endif
+ end subroutine version
+
+ subroutine take_arg(iarg, arg, narg)
+ implicit none
+ integer, intent(inout) :: iarg
+ character(len = *), intent(out) :: arg
+ integer, intent(in) :: narg
+ integer :: length, stat
+
+ if (iarg > narg) call usage
+ call pxfgetarg(iarg, arg, length, stat)
+ if (length > len(arg)) then
+ call die("get_flags(): " // arg // ": argument too long")
+ endif
+ iarg = iarg + 1
+ end subroutine take_arg
+
+end subroutine get_flags
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.pcl
new file mode 100644
index 0000000000000000000000000000000000000000..10905e9f70bd1b3bd9c17624efe4251d574c8400
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.pcl
@@ -0,0 +1,2 @@
+work.pc
+modules/work.pc
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cg.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/cg.F90
new file mode 100644
index 0000000000000000000000000000000000000000..8ab5d17ce98c328687c12ea6631da68d1e2e4837
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cg.F90
@@ -0,0 +1,167 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! cg.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_cg
+
+ type type_cg_para
+ real :: rest
+ integer :: maxiter
+ integer :: log
+ end type type_cg_para
+
+ type(type_cg_para), save :: cg_para
+
+ type type_cg_stat
+ integer :: niter
+ integer :: niter_max
+ integer :: niter_tot
+ integer :: ncall
+ end type type_cg_stat
+
+ type(type_cg_stat), save :: cg_stat
+
+ integer, save :: cg_iterations_total = 0 ! used in timing.F90
+end
+
+!-------------------------------------------------------------------------------
+subroutine cg(matrix_mult, x, b, para, conf, iterations)
+
+ ! solves "matrix_mult * x = b" and returns number of iterations
+
+ use module_cg
+ use module_function_decl
+ use module_p_interface
+ use module_vol
+ use typedef_hmc
+ implicit none
+
+ external :: matrix_mult
+ SPINCOL_OVERINDEXED, intent(out) :: x
+ SPINCOL_OVERINDEXED, intent(in) :: b
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+ integer, intent(out) :: iterations
+
+ P_SPINCOL_OVERINDEXED, save :: r, p, aap
+
+ REAL :: ak, bk, rtr, rtrold, paap
+ integer :: i, niter
+ character(72) :: msg
+
+ TIMING_START(timing_bin_cg)
+
+ ALLOCATE_SC_OVERINDEXED(r)
+ ALLOCATE_SC_OVERINDEXED(p)
+ ALLOCATE_SC_OVERINDEXED(aap)
+
+ call matrix_mult(r, x, para, conf)
+
+ rtrold = ZERO
+ !$omp parallel do reduction(+: rtrold)
+ do i = 1, size_sc_field
+ r(i) = b(i) - r(i)
+ p(i) = r(i)
+ rtrold = rtrold + r(i)**2
+ enddo
+
+ rtrold = global_sum(rtrold)
+
+ do niter = 1, cg_para%maxiter
+ call matrix_mult(aap, p, para, conf)
+
+ paap = sc_dot(p, aap)
+ paap = global_sum(paap)
+
+ ak = rtrold / paap
+
+ rtr = ZERO
+ !$omp parallel do reduction(+: rtr)
+ do i = 1, size_sc_field
+ x(i) = x(i) + ak * p(i)
+ r(i) = r(i) - ak * aap(i)
+ rtr = rtr + r(i)**2
+ enddo
+
+ rtr = global_sum(rtr)
+
+ if (rtr <= cg_para%rest) goto 9999
+
+ bk = rtr / rtrold
+ rtrold = rtr
+
+ call sc_xpby(p, r, bk) ! p = r + bk * p
+ enddo
+
+ niter = niter - 1
+
+ if (cg_para%log /= 2) then
+ write(msg, *) "cg(): no convergence; rtr = ", rtr
+ call die(msg)
+ endif
+
+9999 continue
+
+ cg_stat%ncall = cg_stat%ncall + 1
+ cg_stat%niter = niter
+ cg_stat%niter_max = max(cg_stat%niter_max, niter)
+ cg_stat%niter_tot = cg_stat%niter_tot + niter
+ cg_iterations_total = cg_iterations_total + niter
+
+ iterations = niter
+
+ TIMING_STOP(timing_bin_cg)
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_cg_para(rest, maxiter, log)
+
+ use module_cg
+ implicit none
+ real rest
+ integer maxiter, log
+
+ cg_para%rest = rest
+ cg_para%maxiter = maxiter
+ cg_para%log = log
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_cg_stat()
+
+ use module_cg
+ implicit none
+
+ cg_stat%ncall = 0
+ cg_stat%niter_max = 0
+ cg_stat%niter_tot = 0
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine get_cg_stat(ncall, niter_max, niter_tot)
+
+ use module_cg
+ implicit none
+ integer ncall, niter_max, niter_tot
+
+ ncall = cg_stat%ncall
+ niter_max = cg_stat%niter_max
+ niter_tot = cg_stat%niter_tot
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/checks.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/checks.F90
new file mode 100644
index 0000000000000000000000000000000000000000..8c8cd815dd6e1089ba95ef3aa3d80833755bc086
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/checks.F90
@@ -0,0 +1,60 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! checks.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine check_csw(beta, csw)
+
+ implicit none
+ REAL, intent(in) :: beta, csw
+ REAL :: g, c
+
+ if (beta == ZERO) return
+ if (csw == ZERO) return
+
+ g = SIX / beta
+ c = ONE - 0.454 * g - 0.175 * g**2 + 0.012 * g**3 + 0.045 * g**4
+ c = c / (ONE - 0.720 * g)
+
+ if (abs(c - csw) > 0.00005) then
+ call warn("check_csw(): c_sw differs more than 0.00005 from ALPHA value")
+ endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine check_bc_fermions(bc_fermions, gamma_index)
+
+ ! warns if the number of anti-periodic fermionic b.c. is 1 and
+ ! the anti-periodic direction is not the gamma_4 direction
+
+ implicit none
+ integer, dimension(DIM), intent(in) :: bc_fermions, gamma_index
+
+ integer :: i, i_anti, count
+
+ count = 0
+ do i = 1, DIM
+ if (bc_fermions(i) < 0) then
+ count = count + 1
+ i_anti = i
+ endif
+ enddo
+
+ if (count == 1 .and. gamma_index(i_anti) /= 4) then
+ call warn("check_bc_fermions(): anti-periodic b.c. not in gamma_4 direction")
+ endif
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cksum.c b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum.c
new file mode 100644
index 0000000000000000000000000000000000000000..0b3c77470db5537b5b750e9a784a0474568b16ea
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum.c
@@ -0,0 +1,154 @@
+/*
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! Adopted from:
+!
+! ==================
+! QCD SF/T3E PROGRAM
+! ==================
+!
+! Calculate a modified cyclic redundancy check (CRC is specified by the
+! POSIX.2 standard). Modification is necessary since I do not know how
+! to handle a unsigned long in FORTRAN. Solution: CKSUM returns negative
+! number if result is > LONG_MAX.
+!
+! CKSUM_GET() returns always positive numbers < LONG_MAX. (H.S.)
+!
+!
+! Parts of the source come from cksum.c of the GNU text utilities (version
+! 1.19) written by Q. Frank Xia.
+!
+! $Log: cksum.c,v $
+! Revision 1.1 2007/11/14 13:10:15 mallalen
+! *** empty log message ***
+!
+! Revision 1.1 1997/12/04 10:31:09 pleiter
+! Initial writing attempt
+!
+!-----------------------------------------------------------------------------
+*/
+
+#ifdef NamesToLower_
+# define CKSUM_INIT cksum_init_
+# define CKSUM_ADD cksum_add_
+# define CKSUM_GET cksum_get_
+#endif
+
+#ifdef NamesToLower
+# define CKSUM_INIT cksum_init
+# define CKSUM_ADD cksum_add
+# define CKSUM_GET cksum_get
+#endif
+
+#ifdef LongLong
+# define INT8 long long
+#else
+# define INT8 long
+#endif
+
+void CKSUM_INIT(void);
+void CKSUM_ADD(void *, INT8 *);
+void CKSUM_GET(INT8 *, INT8 *);
+
+static unsigned INT8 the_crc = 0;
+static INT8 the_bytes = 0;
+
+static unsigned INT8 const crctab[256] =
+{
+ 0x0,
+ 0x04C11DB7, 0x09823B6E, 0x0D4326D9, 0x130476DC, 0x17C56B6B,
+ 0x1A864DB2, 0x1E475005, 0x2608EDB8, 0x22C9F00F, 0x2F8AD6D6,
+ 0x2B4BCB61, 0x350C9B64, 0x31CD86D3, 0x3C8EA00A, 0x384FBDBD,
+ 0x4C11DB70, 0x48D0C6C7, 0x4593E01E, 0x4152FDA9, 0x5F15ADAC,
+ 0x5BD4B01B, 0x569796C2, 0x52568B75, 0x6A1936C8, 0x6ED82B7F,
+ 0x639B0DA6, 0x675A1011, 0x791D4014, 0x7DDC5DA3, 0x709F7B7A,
+ 0x745E66CD, 0x9823B6E0, 0x9CE2AB57, 0x91A18D8E, 0x95609039,
+ 0x8B27C03C, 0x8FE6DD8B, 0x82A5FB52, 0x8664E6E5, 0xBE2B5B58,
+ 0xBAEA46EF, 0xB7A96036, 0xB3687D81, 0xAD2F2D84, 0xA9EE3033,
+ 0xA4AD16EA, 0xA06C0B5D, 0xD4326D90, 0xD0F37027, 0xDDB056FE,
+ 0xD9714B49, 0xC7361B4C, 0xC3F706FB, 0xCEB42022, 0xCA753D95,
+ 0xF23A8028, 0xF6FB9D9F, 0xFBB8BB46, 0xFF79A6F1, 0xE13EF6F4,
+ 0xE5FFEB43, 0xE8BCCD9A, 0xEC7DD02D, 0x34867077, 0x30476DC0,
+ 0x3D044B19, 0x39C556AE, 0x278206AB, 0x23431B1C, 0x2E003DC5,
+ 0x2AC12072, 0x128E9DCF, 0x164F8078, 0x1B0CA6A1, 0x1FCDBB16,
+ 0x018AEB13, 0x054BF6A4, 0x0808D07D, 0x0CC9CDCA, 0x7897AB07,
+ 0x7C56B6B0, 0x71159069, 0x75D48DDE, 0x6B93DDDB, 0x6F52C06C,
+ 0x6211E6B5, 0x66D0FB02, 0x5E9F46BF, 0x5A5E5B08, 0x571D7DD1,
+ 0x53DC6066, 0x4D9B3063, 0x495A2DD4, 0x44190B0D, 0x40D816BA,
+ 0xACA5C697, 0xA864DB20, 0xA527FDF9, 0xA1E6E04E, 0xBFA1B04B,
+ 0xBB60ADFC, 0xB6238B25, 0xB2E29692, 0x8AAD2B2F, 0x8E6C3698,
+ 0x832F1041, 0x87EE0DF6, 0x99A95DF3, 0x9D684044, 0x902B669D,
+ 0x94EA7B2A, 0xE0B41DE7, 0xE4750050, 0xE9362689, 0xEDF73B3E,
+ 0xF3B06B3B, 0xF771768C, 0xFA325055, 0xFEF34DE2, 0xC6BCF05F,
+ 0xC27DEDE8, 0xCF3ECB31, 0xCBFFD686, 0xD5B88683, 0xD1799B34,
+ 0xDC3ABDED, 0xD8FBA05A, 0x690CE0EE, 0x6DCDFD59, 0x608EDB80,
+ 0x644FC637, 0x7A089632, 0x7EC98B85, 0x738AAD5C, 0x774BB0EB,
+ 0x4F040D56, 0x4BC510E1, 0x46863638, 0x42472B8F, 0x5C007B8A,
+ 0x58C1663D, 0x558240E4, 0x51435D53, 0x251D3B9E, 0x21DC2629,
+ 0x2C9F00F0, 0x285E1D47, 0x36194D42, 0x32D850F5, 0x3F9B762C,
+ 0x3B5A6B9B, 0x0315D626, 0x07D4CB91, 0x0A97ED48, 0x0E56F0FF,
+ 0x1011A0FA, 0x14D0BD4D, 0x19939B94, 0x1D528623, 0xF12F560E,
+ 0xF5EE4BB9, 0xF8AD6D60, 0xFC6C70D7, 0xE22B20D2, 0xE6EA3D65,
+ 0xEBA91BBC, 0xEF68060B, 0xD727BBB6, 0xD3E6A601, 0xDEA580D8,
+ 0xDA649D6F, 0xC423CD6A, 0xC0E2D0DD, 0xCDA1F604, 0xC960EBB3,
+ 0xBD3E8D7E, 0xB9FF90C9, 0xB4BCB610, 0xB07DABA7, 0xAE3AFBA2,
+ 0xAAFBE615, 0xA7B8C0CC, 0xA379DD7B, 0x9B3660C6, 0x9FF77D71,
+ 0x92B45BA8, 0x9675461F, 0x8832161A, 0x8CF30BAD, 0x81B02D74,
+ 0x857130C3, 0x5D8A9099, 0x594B8D2E, 0x5408ABF7, 0x50C9B640,
+ 0x4E8EE645, 0x4A4FFBF2, 0x470CDD2B, 0x43CDC09C, 0x7B827D21,
+ 0x7F436096, 0x7200464F, 0x76C15BF8, 0x68860BFD, 0x6C47164A,
+ 0x61043093, 0x65C52D24, 0x119B4BE9, 0x155A565E, 0x18197087,
+ 0x1CD86D30, 0x029F3D35, 0x065E2082, 0x0B1D065B, 0x0FDC1BEC,
+ 0x3793A651, 0x3352BBE6, 0x3E119D3F, 0x3AD08088, 0x2497D08D,
+ 0x2056CD3A, 0x2D15EBE3, 0x29D4F654, 0xC5A92679, 0xC1683BCE,
+ 0xCC2B1D17, 0xC8EA00A0, 0xD6AD50A5, 0xD26C4D12, 0xDF2F6BCB,
+ 0xDBEE767C, 0xE3A1CBC1, 0xE760D676, 0xEA23F0AF, 0xEEE2ED18,
+ 0xF0A5BD1D, 0xF464A0AA, 0xF9278673, 0xFDE69BC4, 0x89B8FD09,
+ 0x8D79E0BE, 0x803AC667, 0x84FBDBD0, 0x9ABC8BD5, 0x9E7D9662,
+ 0x933EB0BB, 0x97FFAD0C, 0xAFB010B1, 0xAB710D06, 0xA6322BDF,
+ 0xA2F33668, 0xBCB4666D, 0xB8757BDA, 0xB5365D03, 0xB1F740B4
+};
+
+void CKSUM_INIT(void)
+{
+ the_crc = 0;
+ the_bytes = 0;
+}
+
+void CKSUM_ADD(void *memptr, INT8 *nbytes)
+{
+ register unsigned INT8 crc;
+ register INT8 i;
+ register unsigned char *cp;
+
+ crc = the_crc;
+ cp = (unsigned char *) memptr;
+ for (i=0; i < *nbytes; i++)
+ crc = (crc << 8) ^ crctab[((crc >> 24) ^ *(cp++)) & 0xFF];
+
+ the_crc = crc;
+ the_bytes += *nbytes;
+}
+
+void CKSUM_GET(INT8 *total_crc, INT8 *total_bytes)
+{
+ register unsigned INT8 crc;
+ register INT8 i;
+
+ crc = the_crc;
+
+ for (i = the_bytes; i > 0; i >>= 8)
+ crc = (crc << 8) ^ crctab[((crc >> 24) ^ i) & 0xFF];
+ crc = (~crc & 0xFFFFFFFF);
+
+ *total_crc = (INT8) crc;
+ *total_bytes = the_bytes;
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cksum_dummy.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum_dummy.F90
new file mode 100644
index 0000000000000000000000000000000000000000..97fb8fda1b92d63dae060068a61611037a64fb39
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum_dummy.F90
@@ -0,0 +1,35 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! cksum_dummy.F90 - dummy routines that can replace the real C routines
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine cksum_init()
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine cksum_add(i, j)
+ integer i(*)
+ CHECK_SUM j
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine cksum_get(sum, bytes)
+ CHECK_SUM sum, bytes
+ sum = 0
+ bytes = 0
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..445b413cf94079e033efb4c609177f59e8c69ea8
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/Makefile
@@ -0,0 +1,67 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# clover/Makefile
+#
+#===============================================================================
+
+include ../Makefile.defs
+
+fpp = $(FPP) -I.. $(FPPFLAGS)
+
+MODULES_DIR = ../modules
+
+.SUFFIXES:
+.SUFFIXES: .a .o .F90
+
+.F90.o:
+ $(fpp) $< > $*.f90
+ $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $*.f90
+
+OBJS = \
+ clover_action.o \
+ clover_allocate.o \
+ clover_bsa.o \
+ clover_d.o \
+ clover_f_mu_nu.o \
+ clover_init.o \
+ clover_inv.o \
+ clover_mult_a.o \
+ clover_mult_ao.o \
+ clover_mult_b.o \
+ clover_t_init.o \
+ clover_ts.o \
+ clover_uuu.o \
+ clover_uuuu.o
+
+OBJS_CTEST = \
+ ctest.o \
+ clover_inv.o \
+ clover_mult_a.o \
+ clover_mult_ao.o \
+ clover_mult_b.o
+
+$(LIBCLOVER):
+
+libclover.a: $(OBJS)
+ $(AR) $(ARFLAGS) $@ $(OBJS)
+ $(RANLIB) $@
+
+fast:
+ $(FAST_MAKE)
+
+ctest: $(OBJS_CTEST)
+ f90 -o $@ $(OBJS_CTEST)
+
+clean:
+ rm -f *.[Tiod] *.f90 *.mod core work.pc work.pcl
+
+clobber: clean
+ rm -f libclover.a ctest
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/bqcd.pcl
new file mode 100644
index 0000000000000000000000000000000000000000..906244500b31700684482c3dcfd32f6cec4279db
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/bqcd.pcl
@@ -0,0 +1,2 @@
+work.pc
+../modules/work.pc
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover.h b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f22a2453f2aaceb37ffb33d9d8f03b4e08f0ac5
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover.h
@@ -0,0 +1,166 @@
+#ifdef CLOVER_AS_COMPLEX_ARRAY
+
+# define A11 Re(a(1,J,i))
+# define A22 Im(a(1,J,i))
+# define A33 Re(a(11,J,i))
+# define A44 Im(a(11,J,i))
+# define A55 Re(a(17,J,i))
+# define A66 Im(a(17,J,i))
+
+# define A12 a(2,J,i)
+# define A13 a(3,J,i)
+# define A14 a(4,J,i)
+# define A15 a(5,J,i)
+# define A16 a(6,J,i)
+
+# define A23 a(7,J,i)
+# define A24 a(8,J,i)
+# define A25 a(9,J,i)
+# define A26 a(10,J,i)
+
+# define A34 a(12,J,i)
+# define A35 a(13,J,i)
+# define A36 a(14,J,i)
+
+# define A45 a(15,J,i)
+# define A46 a(16,J,i)
+
+# define A56 a(18,J,i)
+
+# define B11 Re(b(16,J,i))
+# define B22 Im(b(16,J,i))
+# define B33 Re(b(17,J,i))
+# define B44 Im(b(17,J,i))
+# define B55 Re(b(18,J,i))
+# define B66 Im(b(18,J,i))
+
+# define B21 b(1,J,i)
+
+# define B31 b(2,J,i)
+# define B32 b(3,J,i)
+
+# define B41 b(4,J,i)
+# define B42 b(5,J,i)
+# define B43 b(6,J,i)
+
+# define B51 b(7,J,i)
+# define B52 b(8,J,i)
+# define B53 b(9,J,i)
+# define B54 b(10,J,i)
+
+# define B61 b(11,J,i)
+# define B62 b(12,J,i)
+# define B63 b(13,J,i)
+# define B64 b(14,J,i)
+# define B65 b(15,J,i)
+
+#else
+
+# define A11 a%i11
+# define A22 a%i22
+# define A33 a%i33
+# define A44 a%i44
+# define A55 a%i55
+# define A66 a%i66
+
+# define A12 a%i12
+# define A13 a%i13
+# define A14 a%i14
+# define A15 a%i15
+# define A16 a%i16
+
+# define A23 a%i23
+# define A24 a%i24
+# define A25 a%i25
+# define A26 a%i26
+
+# define A34 a%i34
+# define A35 a%i35
+# define A36 a%i36
+
+# define A45 a%i45
+# define A46 a%i46
+
+# define A56 a%i56
+
+# define B11 b%i11
+# define B22 b%i22
+# define B33 b%i33
+# define B44 b%i44
+# define B55 b%i55
+# define B66 b%i66
+
+# define B21 b%i21
+
+# define B31 b%i31
+# define B32 b%i32
+
+# define B41 b%i41
+# define B42 b%i42
+# define B43 b%i43
+
+# define B51 b%i51
+# define B52 b%i52
+# define B53 b%i53
+# define B54 b%i54
+
+# define B61 b%i61
+# define B62 b%i62
+# define B63 b%i63
+# define B64 b%i64
+# define B65 b%i65
+
+#endif
+
+# define A21 conjg(A12)
+# define A31 conjg(A13)
+# define A41 conjg(A14)
+# define A51 conjg(A15)
+# define A61 conjg(A16)
+
+# define A32 conjg(A23)
+# define A42 conjg(A24)
+# define A52 conjg(A25)
+# define A62 conjg(A26)
+
+# define A43 conjg(A34)
+# define A53 conjg(A35)
+# define A63 conjg(A36)
+
+# define A54 conjg(A45)
+# define A64 conjg(A46)
+
+# define A65 conjg(A56)
+
+# define B12 conjg(B21)
+
+# define B13 conjg(B31)
+# define B23 conjg(B32)
+
+# define B14 conjg(B41)
+# define B24 conjg(B42)
+# define B34 conjg(B43)
+
+# define B15 conjg(B51)
+# define B25 conjg(B52)
+# define B35 conjg(B53)
+# define B45 conjg(B54)
+
+# define B16 conjg(B61)
+# define B26 conjg(B62)
+# define B36 conjg(B63)
+# define B46 conjg(B64)
+# define B56 conjg(B65)
+
+# define SC1 1, 1
+# define SC2 1, 2
+# define SC3 1, 3
+# define SC4 2, 1
+# define SC5 2, 2
+# define SC6 2, 3
+# define SC7 3, 1
+# define SC8 3, 2
+# define SC9 3, 3
+# define SC10 4, 1
+# define SC11 4, 2
+# define SC12 4, 3
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_action.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_action.F90
new file mode 100644
index 0000000000000000000000000000000000000000..a9fbd0450cbba703fd61728ca1a91fd003fead18
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_action.F90
@@ -0,0 +1,51 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_action.F90 - calculates: -2 Tr(log(T_oo))
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "clover.h"
+
+!-------------------------------------------------------------------------------
+REAL function clover_action(b)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+
+ type(type_clover_b) :: b(2, volh)
+ integer :: i
+ REAL :: s, global_sum
+
+
+ s = ZERO
+
+ !$omp parallel do reduction(+: s)
+ do i = 1, volh
+ s = s + log(det(b(1, i)) * det(b(2, i)))
+ enddo
+
+ clover_action = TWO * global_sum(s)
+
+
+CONTAINS
+
+ REAL function det(b) ! returns (1 / det)
+
+ type(type_clover_b) :: b
+
+ det = B11 * B22 * B33 * B44 * B55 * B66
+
+ end function det
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_allocate.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_allocate.F90
new file mode 100644
index 0000000000000000000000000000000000000000..60475de54ce8caf5937248dac933d897ec09de9a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_allocate.F90
@@ -0,0 +1,40 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_allocate.F90 - allocation of clover arrays
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine allocate_clover_field_a(a)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+ P_CLOVER_FIELD_A :: a
+
+ allocate(a(2, volh, EVEN:ODD))
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_clover_field_b(b)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+ P_CLOVER_FIELD_B :: b
+
+ allocate(b(2, volh, EVEN:ODD))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa.F90
new file mode 100644
index 0000000000000000000000000000000000000000..499abb10c889fd6d8c6b673a8fdd676a0e6a5f6c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa.F90
@@ -0,0 +1,165 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_bsa.F90 - calculates "B sigma A"
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa(mu, nu, w, b, a) ! w = transposed(conjg(b) sigma_mu_nu a)
+
+ use module_vol
+ implicit none
+
+ integer :: mu, nu
+ SU3_FIELD :: w
+ SPINCOL_FIELD :: b, a
+
+ if (mu == 1) then
+ if (nu == 2) then ; call clover_bsa_12(w, b, a)
+ elseif (nu == 3) then ; call clover_bsa_13(w, b, a)
+ elseif (nu == 4) then ; call clover_bsa_14(w, b, a) ; endif
+ elseif (mu == 2) then
+ if (nu == 1) then ; call clover_bsa_21(w, b, a)
+ elseif (nu == 3) then ; call clover_bsa_23(w, b, a)
+ elseif (nu == 4) then ; call clover_bsa_24(w, b, a) ; endif
+ elseif (mu == 3) then
+ if (nu == 1) then ; call clover_bsa_31(w, b, a)
+ elseif (nu == 2) then ; call clover_bsa_32(w, b, a)
+ elseif (nu == 4) then ; call clover_bsa_34(w, b, a) ; endif
+ elseif (mu == 4) then
+ if (nu == 1) then ; call clover_bsa_41(w, b, a)
+ elseif (nu == 2) then ; call clover_bsa_42(w, b, a)
+ elseif (nu == 3) then ; call clover_bsa_43(w, b, a) ; endif
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_12(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = -a(1, ca, i)
+ a2 = a(2, ca, i)
+ a3 = -a(3, ca, i)
+ a4 = a(4, ca, i)
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_21(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = a(1, ca, i)
+ a2 = -a(2, ca, i)
+ a3 = a(3, ca, i)
+ a4 = -a(4, ca, i)
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_13(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = -i_times(a(2, ca, i))
+ a2 = i_times(a(1, ca, i))
+ a3 = -i_times(a(4, ca, i))
+ a4 = i_times(a(3, ca, i))
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_31(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = i_times(a(2, ca, i))
+ a2 = -i_times(a(1, ca, i))
+ a3 = i_times(a(4, ca, i))
+ a4 = -i_times(a(3, ca, i))
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_14(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = a(4, ca, i)
+ a2 = a(3, ca, i)
+ a3 = a(2, ca, i)
+ a4 = a(1, ca, i)
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_41(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = -a(4, ca, i)
+ a2 = -a(3, ca, i)
+ a3 = -a(2, ca, i)
+ a4 = -a(1, ca, i)
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_23(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = -a(2, ca, i)
+ a2 = -a(1, ca, i)
+ a3 = -a(4, ca, i)
+ a4 = -a(3, ca, i)
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_32(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = a(2, ca, i)
+ a2 = a(1, ca, i)
+ a3 = a(4, ca, i)
+ a4 = a(3, ca, i)
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_24(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = -i_times(a(4, ca, i))
+ a2 = i_times(a(3, ca, i))
+ a3 = -i_times(a(2, ca, i))
+ a4 = i_times(a(1, ca, i))
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_42(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = i_times(a(4, ca, i))
+ a2 = -i_times(a(3, ca, i))
+ a3 = i_times(a(2, ca, i))
+ a4 = -i_times(a(1, ca, i))
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_34(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = a(3, ca, i)
+ a2 = -a(4, ca, i)
+ a3 = a(1, ca, i)
+ a4 = -a(2, ca, i)
+# include "clover_bsa_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_bsa_43(w, b, a)
+
+# include "clover_bsa_head.h90"
+ a1 = -a(3, ca, i)
+ a2 = a(4, ca, i)
+ a3 = -a(1, ca, i)
+ a4 = a(2, ca, i)
+# include "clover_bsa_tail.h90"
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_head.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_head.h90
new file mode 100644
index 0000000000000000000000000000000000000000..b2dae802a86eb8c4376fa6ac835510fd28849878
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_head.h90
@@ -0,0 +1,16 @@
+ use module_vol
+ implicit none
+
+ SU3_FIELD :: w
+ SPINCOL_FIELD :: b, a
+ COMPLEX :: a1, a2, a3, a4
+ integer :: i, ca, cb
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ !$omp parallel do private(ca, cb, a1, a2, a3, a4)
+ do i = 1, volh
+ do ca = 1, NCOL
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_tail.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_tail.h90
new file mode 100644
index 0000000000000000000000000000000000000000..66092341dae7a1266365740e4405af306c5c8abc
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_tail.h90
@@ -0,0 +1,10 @@
+ do cb = 1, NCOL
+ w(ca, cb, i) = a1 * conjg(b(1, cb, i)) &
+ + a2 * conjg(b(2, cb, i)) &
+ + a3 * conjg(b(3, cb, i)) &
+ + a4 * conjg(b(4, cb, i))
+ enddo
+ enddo
+ enddo
+
+end
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_d.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_d.F90
new file mode 100644
index 0000000000000000000000000000000000000000..71611a1c43b24aebdaa31bc1668f3b33ecaf1587
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_d.F90
@@ -0,0 +1,255 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_d.F90 - derivative of clover term
+!
+!-------------------------------------------------------------------------------
+!
+! E -- 2 -- B
+! | | A = x
+! 3 1 ^ nu B = x + mu^ + nu^
+! | | | C = x + mu^ - nu^
+! A -- 0 -- D x --> mu
+! | | D = x + mu^
+! 4 6 E = x + nu^
+! | | F = x - nu^
+! F -- 5 -- C
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_dsd(eo, p, b, s, u)
+
+ use typedef_clover
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ integer :: eo ! EVEN/ODD property of "b"
+ GENERATOR_FIELD :: p
+ CLOVER_FIELD_B :: b
+ REAL :: s
+ GAUGE_FIELD :: u
+
+ P_GAUGE_FIELD, save :: w ! use existing data structure
+
+ CLOVER_FIELD_C :: t
+!dir$ cache_align t
+
+
+ TIMING_START(timing_bin_clover_dsd)
+
+ ALLOCATE_G_FIELD(w)
+
+ call clover_t_init(t, b(1, 1, eo))
+
+ call clover_ts(1, 2, w(1, 1, 1, EVEN, 1), t) ; call xbound_g(w, EVEN, 1)
+ call clover_ts(1, 3, w(1, 1, 1, ODD, 1), t) ; call xbound_g(w, ODD, 1)
+ call clover_ts(1, 4, w(1, 1, 1, EVEN, 2), t) ; call xbound_g(w, EVEN, 2)
+ call clover_ts(2, 3, w(1, 1, 1, ODD, 2), t) ; call xbound_g(w, ODD, 2)
+ call clover_ts(2, 4, w(1, 1, 1, EVEN, 3), t) ; call xbound_g(w, EVEN, 3)
+ call clover_ts(3, 4, w(1, 1, 1, ODD, 3), t) ; call xbound_g(w, ODD, 3)
+
+ call clover_d(eo, p, s, u, w)
+
+ TIMING_STOP(timing_bin_clover_dsd)
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_dsf(eo, p, b, a, s, u)
+
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ integer :: eo ! EVEN/ODD property of "b" and "a"
+ GENERATOR_FIELD :: p
+ SPINCOL_FIELD :: b, a
+ REAL :: s
+ GAUGE_FIELD :: u
+
+ P_GAUGE_FIELD, save :: w ! use existing data structure
+
+
+ TIMING_START(timing_bin_clover_dsf)
+
+ ALLOCATE_G_FIELD(w)
+
+ call clover_bsa(1, 2, w(1, 1, 1, EVEN, 1), b, a) ; call xbound_g(w, EVEN, 1)
+ call clover_bsa(1, 3, w(1, 1, 1, ODD, 1), b, a) ; call xbound_g(w, ODD, 1)
+ call clover_bsa(1, 4, w(1, 1, 1, EVEN, 2), b, a) ; call xbound_g(w, EVEN, 2)
+ call clover_bsa(2, 3, w(1, 1, 1, ODD, 2), b, a) ; call xbound_g(w, ODD, 2)
+ call clover_bsa(2, 4, w(1, 1, 1, EVEN, 3), b, a) ; call xbound_g(w, EVEN, 3)
+ call clover_bsa(3, 4, w(1, 1, 1, ODD, 3), b, a) ; call xbound_g(w, ODD, 3)
+
+ call clover_d(eo, p, s, u, w)
+
+ TIMING_STOP(timing_bin_clover_dsf)
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_d(eo, p, s, u, w)
+
+ use module_vol
+ implicit none
+
+ integer :: eo
+ GENERATOR_FIELD :: p
+ REAL :: s
+ GAUGE_FIELD :: u, w
+
+ call clover_d_mu_nu(eo, 1, 2, p, s, u, w(1, 1, 1, EVEN, 1))
+ call clover_d_mu_nu(eo, 1, 3, p, s, u, w(1, 1, 1, ODD, 1))
+ call clover_d_mu_nu(eo, 1, 4, p, s, u, w(1, 1, 1, EVEN, 2))
+ call clover_d_mu_nu(eo, 2, 3, p, s, u, w(1, 1, 1, ODD, 2))
+ call clover_d_mu_nu(eo, 2, 4, p, s, u, w(1, 1, 1, EVEN, 3))
+ call clover_d_mu_nu(eo, 3, 4, p, s, u, w(1, 1, 1, ODD, 3))
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_d_mu_nu(e, mu, nu, p, s, u, w)
+
+ use module_vol
+ implicit none
+
+ integer :: e, o, mu, nu
+ GENERATOR_FIELD :: p
+ REAL :: s
+ GAUGE_FIELD :: u
+ SU3_FIELD :: w
+ external clover_d_same_eo, clover_d_diff_eo
+
+ o = EVEN + ODD - e
+
+ call clover_d_loop(e, mu, nu, p, s, u, w, clover_d_same_eo)
+ call clover_d_loop(e, nu, mu, p, -s, u, w, clover_d_same_eo)
+ call clover_d_loop(o, mu, nu, p, s, u, w, clover_d_diff_eo)
+ call clover_d_loop(o, nu, mu, p, -s, u, w, clover_d_diff_eo)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_d_loop(e, mu, nu, p, s, u, w, clover_dd)
+
+ use module_vol
+ use module_nn
+ implicit none
+
+ integer :: e, mu, nu
+ GENERATOR_FIELD :: p
+ REAL :: s
+ GAUGE_FIELD :: u
+ SU3_FIELD :: w
+ external clover_dd
+
+ integer :: o, i, ia, ib, ic, id, ie, if, j
+ GENERATOR :: q
+
+ o = EVEN + ODD - e
+
+ !$omp parallel do private(ia, ib, ic, id, ie, if, j, q)
+ do i = 1, volh
+
+ id = nn(i, e, mu, FWD)
+ ie = nn(i, e, nu, FWD)
+ if = nn(i, e, nu, BWD)
+
+ ia = i
+ ib = nn(id, o, nu, FWD)
+ ic = nn(id, o, nu, BWD)
+
+ call clover_dd(q, s, u(1, 1, ia, e, mu), &
+ u(1, 1, id, o, nu), &
+ u(1, 1, ie, o, mu), &
+ u(1, 1, ia, e, nu), &
+ u(1, 1, if, o, nu), &
+ u(1, 1, if, o, mu), &
+ u(1, 1, ic, e, nu), &
+ w(1, 1, ia), &
+ w(1, 1, ib), &
+ w(1, 1, ic), &
+ w(1, 1, id), &
+ w(1, 1, ie), &
+ w(1, 1, if))
+
+ do j = 1, NGEN
+ p(j, i, e, mu) = p(j, i, e, mu) + q(j)
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_d_same_eo(p, s, u0,u1,u2,u3,u4,u5,u6, wa,wb,wc, wd,we,wf)
+
+ implicit none
+
+ GENERATOR :: p
+ REAL :: s
+ SU3 :: u0, u1, u2, u3, u4, u5, u6, wa, wb, wc, wd, we, wf
+ SU3 :: r, u, v
+
+ p = ZERO
+
+ call uu(u, u0, u1)
+ call uu(v, u3, u2)
+
+ call clover_uuu_udu(r, u, v, wa) ; call re_tr_j(p, r, s)
+ call clover_uuu_uud(r, wa, v, u) ; call re_tr_j(p, r, s)
+ call clover_uuu_uud(r, u, wb, v) ; call re_tr_j(p, r, s)
+ call clover_uuu_uud(r, v, wb, u) ; call re_tr_j(p, r, s)
+
+ call uud(u, u0, u6)
+ call udu(v, u4, u5)
+
+ call clover_uuu_uud(r, wa, v, u) ; call re_tr_j(p, r, -s)
+ call clover_uuu_udu(r, u, v, wa) ; call re_tr_j(p, r, -s)
+ call clover_uuu_uud(r, v, wc, u) ; call re_tr_j(p, r, -s)
+ call clover_uuu_uud(r, u, wc, v) ; call re_tr_j(p, r, -s)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_d_diff_eo(p, s, u0,u1,u2,u3,u4,u5,u6, wa,wb,wc, wd,we,wf)
+
+ implicit none
+
+ GENERATOR :: p
+ REAL :: s
+ SU3 :: u0, u1, u2, u3, u4, u5, u6, wa, wb, wc, wd, we, wf
+ SU3 :: r, u, v
+
+ p = ZERO
+
+ u = ZERO
+ v = ZERO
+ call uuu_fwd(u, u1, u2, u3)
+ call uuu_fwd(v, u2, u1, u0)
+
+ call clover_uuu_uuu(r, u0, wd, u) ; call re_tr_j(p, r, s)
+ call clover_uuu_dud(r, u, wd, u0) ; call re_tr_j(p, r, s)
+ call clover_uuu_dud(r, v, we, u3) ; call re_tr_j(p, r, s)
+ call clover_uuu_uuu(r, u3, we, v) ; call re_tr_j(p, r, s)
+
+ u = ZERO
+ v = ZERO
+ call uuu_bwd(u, u6, u5, u4)
+ call uuu_fwd(v, u0, u6, u5)
+
+ call clover_uuu_dud(r, u, wd, u0) ; call re_tr_j(p, r, -s)
+ call clover_uuu_uuu(r, u0, wd, u) ; call re_tr_j(p, r, -s)
+ call clover_uuu_dud(r, u4, wf, v) ; call re_tr_j(p, r, -s)
+ call clover_uuu_uuu(r, v, wf, u4) ; call re_tr_j(p, r, -s)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_dummy.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_dummy.F90
new file mode 100644
index 0000000000000000000000000000000000000000..bcbe8b7de98fe3e0c379bef7bfaf82cfdfc86fa8
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_dummy.F90
@@ -0,0 +1,136 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2000-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_dummy.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine allocate_clover_field_a(a)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+ P_CLOVER_FIELD_A :: a
+
+ call die("allocate_clover_field_a(): must not be called.")
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_clover_field_b(b)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+ P_CLOVER_FIELD_B :: b
+
+ call die("allocate_clover_field_b(): must not be called.")
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_init(a, b, u, csw_kappa)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+
+ CLOVER_FIELD_A, intent(out) :: a
+ CLOVER_FIELD_B, intent(out) :: b
+ GAUGE_FIELD, intent(in) :: u
+ REAL, intent(in) :: csw_kappa
+
+ call die("clover_init(): must not be called.")
+end
+
+!-------------------------------------------------------------------------------
+REAL function clover_action(b)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+
+ type(type_clover_b) :: b(2, volh)
+ integer :: i
+ REAL :: s, global_sum
+
+
+ call die("clover_action(): must not be called.")
+ clover_action = ZERO
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_mult_a(out, a, in, volh)
+
+ implicit none
+
+ COMPLEX, dimension(18, 2, *) :: a
+ COMPLEX, dimension(NDIRAC, NCOL, *) :: out, in
+ integer :: volh
+
+ call die("clover_mult_a(): must not be called.")
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_mult_ao(a, x, volh) ! x := A x
+
+ implicit none
+
+ COMPLEX, dimension(18, 2, *) :: a
+ COMPLEX, dimension(NDIRAC, NCOL, *) :: x
+ integer :: volh
+
+ call die("clover_mult_ao(): must not be called.")
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_mult_b(b, x, volh)
+
+ implicit none
+
+ COMPLEX, dimension(18, 2, *) :: b
+ COMPLEX, dimension(NDIRAC, NCOL, *) :: x
+ integer :: volh
+
+ call die("clover_mult_b(): must not be called.")
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_dsd(eo, p, b, s, u)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+
+ integer :: eo
+ GENERATOR_FIELD :: p
+ CLOVER_FIELD_B :: b
+ REAL :: s
+ GAUGE_FIELD :: u
+
+ call die("clover_dsd(): must not be called.")
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_dsf(eo, p, b, a, s, u)
+
+ use module_vol
+ implicit none
+
+ integer :: eo
+ GENERATOR_FIELD :: p
+ SPINCOL_FIELD :: b, a
+ REAL :: s
+ GAUGE_FIELD :: u
+
+ call die("clover_dsf(): must not be called.")
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_f_mu_nu.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_f_mu_nu.F90
new file mode 100644
index 0000000000000000000000000000000000000000..12a0fef130e5e1986f8b65647689f7ad38702b26
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_f_mu_nu.F90
@@ -0,0 +1,94 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_f_mu_nu.F90 - F_mu_nu = (Q_mu_nu - h.c.) / i (missing factor 1/8)
+!
+!-------------------------------------------------------------------------------
+!
+! ^ nu
+! xmp x_p (xpp) |
+! |
+! xm_ x xp_ x --> mu
+!
+! xmm x_m xpm
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_f_mu_nu(f, mu, nu, x, e, u)
+
+ use module_vol
+ use module_nn
+ implicit none
+
+ SU3, intent(out) :: f
+ integer, intent(in) :: mu, nu, x, e
+ GAUGE_FIELD, intent(in) :: u
+
+ integer :: xmp, x_p, xm_, xp_, xmm, x_m, xpm, o
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ o = EVEN + ODD - e
+
+ xp_ = nn(x, e, mu, FWD)
+ xm_ = nn(x, e, mu, BWD)
+ x_p = nn(x, e, nu, FWD)
+ x_m = nn(x, e, nu, BWD)
+
+ xmp = nn(xm_, o, nu, FWD)
+ xmm = nn(xm_, o, nu, BWD)
+ xpm = nn(xp_, o, nu, BWD)
+
+ if (xmp /= nn(x_p, o, mu, BWD)) call die("colver_f_mu_nu(): xmp")
+ if (xmm /= nn(x_m, o, mu, BWD)) call die("colver_f_mu_nu(): xmm")
+ if (xpm /= nn(x_m, o, mu, FWD)) call die("colver_f_mu_nu(): xpm")
+
+ f = ZERO
+
+ call clover_uuuu1(f, u(1, 1, x, e, mu), &
+ u(1, 1, xp_, o, nu), &
+ u(1, 1, x_p, o, mu), &
+ u(1, 1, x, e, nu))
+
+ call clover_uuuu2(f, u(1, 1, x, e, nu), &
+ u(1, 1, xmp, e, mu), &
+ u(1, 1, xm_, o, nu), &
+ u(1, 1, xm_, o, mu))
+
+ call clover_uuuu3(f, u(1, 1, xm_, o, mu), &
+ u(1, 1, xmm, e, nu), &
+ u(1, 1, xmm, e, mu), &
+ u(1, 1, x_m, o, nu))
+
+ call clover_uuuu4(f, u(1, 1, x_m, o, nu), &
+ u(1, 1, x_m, o, mu), &
+ u(1, 1, xpm, e, nu), &
+ u(1, 1, x, e, mu))
+
+ f(1, 1) = cmplx(TWO * Im(f(1, 1)), ZERO)
+ f(2, 2) = cmplx(TWO * Im(f(2, 2)), ZERO)
+ f(3, 3) = cmplx(TWO * Im(f(3, 3)), ZERO)
+
+ f(1, 2) = i_times(conjg(f(2, 1)) - f(1, 2))
+ f(1, 3) = i_times(conjg(f(3, 1)) - f(1, 3))
+ f(2, 3) = i_times(conjg(f(3, 2)) - f(2, 3))
+
+ f(2, 1) = conjg(f(1, 2))
+ f(3, 1) = conjg(f(1, 3))
+ f(3, 2) = conjg(f(2, 3))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_init.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_init.F90
new file mode 100644
index 0000000000000000000000000000000000000000..8a7b62b1d3853d44f728db2c383a5ba92bf248e5
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_init.F90
@@ -0,0 +1,180 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_init.F90 - calculates clover matrix and its inverse
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_init(a, ainv, b, u, csw_kappa)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+
+ CLOVER_FIELD_A, intent(out) :: a, ainv
+ CLOVER_FIELD_B, intent(out) :: b
+ GAUGE_FIELD, intent(in) :: u
+ REAL, intent(in) :: csw_kappa
+
+ integer :: i, eo
+ SU3 :: f, g
+ type(type_clover_a) :: p, q
+ REAL :: factor
+
+ TIMING_START(timing_bin_clover_init)
+
+ factor = -csw_kappa / EIGHT
+
+ do eo = EVEN, ODD
+ !$omp parallel do private(f, g, p, q)
+ do i = 1, VOLH
+ call clover_f_mu_nu(f, 2, 1, i, eo, u)
+
+ call clover_init1(p, f)
+
+ call clover_f_mu_nu(f, 3, 2, i, eo, u)
+ call clover_f_mu_nu(g, 3, 1, i, eo, u)
+
+ call clover_init2(p, f, g)
+
+ call clover_f_mu_nu(f, 3, 4, i, eo, u)
+
+ call clover_init1(q, f)
+
+ call clover_f_mu_nu(f, 1, 4, i, eo, u)
+ call clover_f_mu_nu(g, 4, 2, i, eo, u)
+
+ call clover_init2(q, f, g)
+
+ call clover_init3(a(1, i, eo), a(2, i, eo), p, q, factor)
+
+ call clover_inv(b(1, i, eo), ainv(1, i, eo), a(1, i, eo))
+ call clover_inv(b(2, i, eo), ainv(2, i, eo), a(2, i, eo))
+ enddo
+ enddo
+
+ TIMING_STOP(timing_bin_clover_init)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_init1(a, f)
+
+ use typedef_clover
+ implicit none
+ type(type_clover_a) :: a
+ SU3 :: f
+
+ a%i11 = Re(f(1, 1))
+ a%i22 = Re(f(2, 2))
+ a%i33 = Re(f(3, 3))
+
+ a%i44 = -a%i11
+ a%i55 = -a%i22
+ a%i66 = -a%i33
+
+ a%i12 = f(1, 2)
+ a%i13 = f(1, 3)
+ a%i23 = f(2, 3)
+
+ a%i45 = -a%i12
+ a%i46 = -a%i13
+ a%i56 = -a%i23
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_init2(a, f, g)
+
+ use typedef_clover
+ implicit none
+ type(type_clover_a) :: a
+ SU3 :: f, g
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ a%i14 = f(1, 1) + i_times(g(1, 1))
+ a%i15 = f(1, 2) + i_times(g(1, 2))
+ a%i16 = f(1, 3) + i_times(g(1, 3))
+
+ a%i24 = f(2, 1) + i_times(g(2, 1))
+ a%i25 = f(2, 2) + i_times(g(2, 2))
+ a%i26 = f(2, 3) + i_times(g(2, 3))
+
+ a%i34 = f(3, 1) + i_times(g(3, 1))
+ a%i35 = f(3, 2) + i_times(g(3, 2))
+ a%i36 = f(3, 3) + i_times(g(3, 3))
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_init3(a1, a2, p, q, s)
+
+ use typedef_clover
+ implicit none
+ type(type_clover_a) :: a1, a2, p, q
+ REAL :: s
+
+# define CLOVER_INIT_3(I, J) \
+a1%i ## I ## J = s * (p%i ## I ## J + q%i ## I ## J ## ) ; \
+a2%i ## I ## J = s * (p%i ## I ## J - q%i ## I ## J ## )
+
+! define =>
+! a1%iIJ = s * (p%iIJ + q%iIJ) ; a2%iIJ = s * (p%iIJ - q%iIJ)
+
+ CLOVER_INIT_3(1, 1)
+ CLOVER_INIT_3(1, 2)
+ CLOVER_INIT_3(1, 3)
+ CLOVER_INIT_3(1, 4)
+ CLOVER_INIT_3(1, 5)
+ CLOVER_INIT_3(1, 6)
+
+ CLOVER_INIT_3(2, 2)
+ CLOVER_INIT_3(2, 3)
+ CLOVER_INIT_3(2, 4)
+ CLOVER_INIT_3(2, 5)
+ CLOVER_INIT_3(2, 6)
+
+ CLOVER_INIT_3(3, 3)
+ CLOVER_INIT_3(3, 4)
+ CLOVER_INIT_3(3, 5)
+ CLOVER_INIT_3(3, 6)
+
+ CLOVER_INIT_3(4, 4)
+ CLOVER_INIT_3(4, 5)
+ CLOVER_INIT_3(4, 6)
+
+ CLOVER_INIT_3(5, 5)
+ CLOVER_INIT_3(5, 6)
+
+ CLOVER_INIT_3(6, 6)
+
+ a1%i11 = a1%i11 + ONE
+ a1%i22 = a1%i22 + ONE
+ a1%i33 = a1%i33 + ONE
+ a1%i44 = a1%i44 + ONE
+ a1%i55 = a1%i55 + ONE
+ a1%i66 = a1%i66 + ONE
+
+ a2%i11 = a2%i11 + ONE
+ a2%i22 = a2%i22 + ONE
+ a2%i33 = a2%i33 + ONE
+ a2%i44 = a2%i44 + ONE
+ a2%i55 = a2%i55 + ONE
+ a2%i66 = a2%i66 + ONE
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_inv.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_inv.F90
new file mode 100644
index 0000000000000000000000000000000000000000..6ec62f61115dd60d37fbdca26171f2a2bcba9888
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_inv.F90
@@ -0,0 +1,211 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_inv.F90 - calculates inverse of clover matrix
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "clover.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_inv(b, ainv, a)
+
+ use typedef_clover
+ implicit none
+ type(type_clover_a), intent(inout) :: a
+ type(type_clover_a), intent(out) :: ainv
+ type(type_clover_b), intent(out) :: b
+
+ REAL :: d1, d2, d3, d4, d5
+
+ ! statement function:
+
+ COMPLEX :: z
+ REAL :: sq
+ sq(z) = (Re(z)**2 + Im(z)**2)
+
+ d1 = A11 ! D1
+ B11 = ONE / d1 ! 1 / D1
+
+ B21 = A21 * B11 ! L21
+
+ d2 = A22 - d1 * sq(B21) ! D2
+ B22 = ONE / d2 ! 1 / D2
+
+ B31 = A31 ! L31 D1
+ B32 = A32 - B31 * B12 ! L32 D2
+
+ B31 = B31 * B11 ! L31
+ B32 = B32 * B22 ! L32
+
+ d3 = A33 - d1 * sq(B31) - d2 * sq(B32) ! D3
+ B33 = ONE / d3 ! 1 / D3
+
+ B41 = A41 ! L41 D1
+ B42 = A42 - B41 * B12 ! L42 D2
+ B43 = A43 - B41 * B13 - B42 * B23 ! L43 D3
+
+ B41 = B41 * B11 ! L41
+ B42 = B42 * B22 ! L42
+ B43 = B43 * B33 ! L43
+
+ d4 = A44 - d1 * sq(B41) - d2 * sq(B42) - d3 * sq(B43) ! D4
+ B44 = ONE / d4 ! 1 / D4
+
+ B51 = A51
+ B52 = A52 - B51 * B12
+ B53 = A53 - B51 * B13 - B52 * B23
+ B54 = A54 - B51 * B14 - B52 * B24 - B53 * B34
+
+ B51 = B51 * B11
+ B52 = B52 * B22
+ B53 = B53 * B33
+ B54 = B54 * B44
+
+ d5 = A55 - d1 * sq(B51) - d2 * sq(B52) - d3 * sq(B53) - d4 * sq(B54)
+ B55 = ONE / d5
+
+ B61 = A61
+ B62 = A62 - B61 * B12
+ B63 = A63 - B61 * B13 - B62 * B23
+ B64 = A64 - B61 * B14 - B62 * B24 - B63 * B34
+ B65 = A65 - B61 * B15 - B62 * B25 - B63 * B35 - B64 * B45
+
+ B61 = B61 * B11
+ B62 = B62 * B22
+ B63 = B63 * B33
+ B64 = B64 * B44
+ B65 = B65 * B55
+
+ B66 = A66 - d1 * sq(B61) - d2 * sq(B62) - d3 * sq(B63) &
+ - d4 * sq(B64) - d5 * sq(B65)
+
+ B66 = ONE / B66
+
+ call clover_inv2(ainv, b)
+
+ B11 = HALF * B11
+ B22 = HALF * B22
+ B33 = HALF * B33
+ B44 = HALF * B44
+ B55 = HALF * B55
+ B66 = HALF * B66
+
+ A11 = HALF * A11
+ A12 = HALF * A12
+ A13 = HALF * A13
+ A14 = HALF * A14
+ A15 = HALF * A15
+ A16 = HALF * A16
+
+ A22 = HALF * A22
+ A23 = HALF * A23
+ A24 = HALF * A24
+ A25 = HALF * A25
+ A26 = HALF * A26
+
+ A33 = HALF * A33
+ A34 = HALF * A34
+ A35 = HALF * A35
+ A36 = HALF * A36
+
+ A44 = HALF * A44
+ A45 = HALF * A45
+ A46 = HALF * A46
+
+ A55 = HALF * A55
+ A56 = HALF * A56
+
+ A66 = HALF * A66
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_inv2(a, b)
+
+ use typedef_clover
+ implicit none
+ type(type_clover_a), intent(out) :: a
+ type(type_clover_b), intent(in) :: b
+
+ COMPLEX, dimension(6) :: u, x, y
+
+ call inv(1)
+ A11 = Re(x(1))
+ A12 = x(2)
+ A13 = x(3)
+ A14 = x(4)
+ A15 = x(5)
+ A16 = x(6)
+
+ call inv(2)
+ A22 = Re(x(2))
+ A23 = x(3)
+ A24 = x(4)
+ A25 = x(5)
+ A26 = x(6)
+
+ call inv(3)
+ A33 = Re(x(3))
+ A34 = x(4)
+ A35 = x(5)
+ A36 = x(6)
+
+ call inv(4)
+ A44 = Re(x(4))
+ A45 = x(5)
+ A46 = x(6)
+
+ call inv(5)
+ A55 = Re(x(5))
+ A56 = x(6)
+
+ call inv(6)
+ A66 = Re(x(6))
+
+
+CONTAINS
+
+ subroutine inv(i)
+
+ integer :: i
+
+ u = ZERO
+ u(i) = ONE
+
+ y(1) = u(1)
+ y(2) = u(2) - B21 * y(1)
+ y(3) = u(3) - B31 * y(1) - B32 * y(2)
+ y(4) = u(4) - B41 * y(1) - B42 * y(2) - B43 * y(3)
+ y(5) = u(5) - B51 * y(1) - B52 * y(2) - B53 * y(3) - B54 * y(4)
+ y(6) = u(6) - B61 * y(1) - B62 * y(2) - B63 * y(3) - B64 * y(4) - B65 * y(5)
+
+ x(6) = y(6) * B66
+ x(5) = y(5) * B55 - B56 * x(6)
+ x(4) = y(4) * B44 - B45 * x(5) - B46 * x(6)
+ x(3) = y(3) * B33 - B34 * x(4) - B35 * x(5) &
+ - B36 * x(6)
+ x(2) = y(2) * B22 - B23 * x(3) - B24 * x(4) &
+ - B25 * x(5) - B26 * x(6)
+ x(1) = y(1) * B11 - B12 * x(2) - B13 * x(3) &
+ - B14 * x(4) - B15 * x(5) - B16 * x(6)
+
+ x(1) = HALF * conjg(x(1))
+ x(2) = HALF * conjg(x(2))
+ x(3) = HALF * conjg(x(3))
+ x(4) = HALF * conjg(x(4))
+ x(5) = HALF * conjg(x(5))
+ x(6) = HALF * conjg(x(6))
+
+ end subroutine inv
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.F90
new file mode 100644
index 0000000000000000000000000000000000000000..2c03c4ad17b4c5a09bfd9f8684856e868364d596
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.F90
@@ -0,0 +1,87 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_mult_a.F90
+!
+!-------------------------------------------------------------------------------
+# define CLOVER_AS_COMPLEX_ARRAY
+# include "defs.h"
+# include "clover.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_mult_a(out, a, in, volh) ! out := A in
+
+ implicit none
+
+ COMPLEX, dimension(18, 2, *) :: a
+ COMPLEX, dimension(NDIRAC, NCOL, *) :: out, in
+ integer :: volh
+
+ integer :: i
+ COMPLEX :: x1, x2, x3, x4, x5, x6
+ COMPLEX :: y1, y2, y3, y4, y5, y6
+
+ TIMING_START(timing_bin_clover_mult_a)
+
+ !$omp parallel do private(x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6)
+ do i = 1, volh
+ x1 = in(SC1, i) + in(SC7, i)
+ x2 = in(SC2, i) + in(SC8, i)
+ x3 = in(SC3, i) + in(SC9, i)
+ x4 = in(SC4, i) + in(SC10, i)
+ x5 = in(SC5, i) + in(SC11, i)
+ x6 = in(SC6, i) + in(SC12, i)
+
+# define J 1
+# include "clover_mult_a.h90"
+
+ out(SC1, i) = y1
+ out(SC2, i) = y2
+ out(SC3, i) = y3
+ out(SC4, i) = y4
+ out(SC5, i) = y5
+ out(SC6, i) = y6
+ out(SC7, i) = y1
+ out(SC8, i) = y2
+ out(SC9, i) = y3
+ out(SC10, i) = y4
+ out(SC11, i) = y5
+ out(SC12, i) = y6
+
+ x1 = in(SC1, i) - in(SC7, i)
+ x2 = in(SC2, i) - in(SC8, i)
+ x3 = in(SC3, i) - in(SC9, i)
+ x4 = in(SC4, i) - in(SC10, i)
+ x5 = in(SC5, i) - in(SC11, i)
+ x6 = in(SC6, i) - in(SC12, i)
+
+# undef J
+# define J 2
+# include "clover_mult_a.h90"
+
+ out(SC1, i) = out(SC1, i) + y1
+ out(SC2, i) = out(SC2, i) + y2
+ out(SC3, i) = out(SC3, i) + y3
+ out(SC4, i) = out(SC4, i) + y4
+ out(SC5, i) = out(SC5, i) + y5
+ out(SC6, i) = out(SC6, i) + y6
+ out(SC7, i) = out(SC7, i) - y1
+ out(SC8, i) = out(SC8, i) - y2
+ out(SC9, i) = out(SC9, i) - y3
+ out(SC10, i) = out(SC10, i) - y4
+ out(SC11, i) = out(SC11, i) - y5
+ out(SC12, i) = out(SC12, i) - y6
+
+ enddo
+
+ TIMING_STOP(timing_bin_clover_mult_a)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.h90
new file mode 100644
index 0000000000000000000000000000000000000000..49a8b2aaa3d48d29aefc6233d5379d6101708785
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_a.h90
@@ -0,0 +1,12 @@
+ y1 = A11 * x1 + A12 * x2 + A13 * x3 &
+ + A14 * x4 + A15 * x5 + A16 * x6
+ y2 = A21 * x1 + A22 * x2 + A23 * x3 &
+ + A24 * x4 + A25 * x5 + A26 * x6
+ y3 = A31 * x1 + A32 * x2 + A33 * x3 &
+ + A34 * x4 + A35 * x5 + A36 * x6
+ y4 = A41 * x1 + A42 * x2 + A43 * x3 &
+ + A44 * x4 + A45 * x5 + A46 * x6
+ y5 = A51 * x1 + A52 * x2 + A53 * x3 &
+ + A54 * x4 + A55 * x5 + A56 * x6
+ y6 = A61 * x1 + A62 * x2 + A63 * x3 &
+ + A64 * x4 + A65 * x5 + A66 * x6
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_ao.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_ao.F90
new file mode 100644
index 0000000000000000000000000000000000000000..c35d1656cbb4dcbfe1568bd8e2b59c6361d5ab4a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_ao.F90
@@ -0,0 +1,87 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_mult_ao.F90 - ao: "A overwrite"
+!
+!-------------------------------------------------------------------------------
+# define CLOVER_AS_COMPLEX_ARRAY
+# include "defs.h"
+# include "clover.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_mult_ao(a, x, volh) ! x := A x
+
+ implicit none
+
+ COMPLEX, dimension(18, 2, *) :: a
+ COMPLEX, dimension(NDIRAC, NCOL, *) :: x
+ integer :: volh
+
+ integer :: i
+ COMPLEX :: x1, x2, x3, x4, x5, x6
+ COMPLEX :: y1, y2, y3, y4, y5, y6
+
+ TIMING_START(timing_bin_clover_mult_ao)
+
+ !$omp parallel do private(x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6)
+ do i = 1, volh
+ x1 = x(SC1, i) + x(SC7, i)
+ x2 = x(SC2, i) + x(SC8, i)
+ x3 = x(SC3, i) + x(SC9, i)
+ x4 = x(SC4, i) + x(SC10, i)
+ x5 = x(SC5, i) + x(SC11, i)
+ x6 = x(SC6, i) + x(SC12, i)
+
+# define J 1
+# include "clover_mult_a.h90"
+
+ x1 = x(SC1, i) - x(SC7, i)
+ x2 = x(SC2, i) - x(SC8, i)
+ x3 = x(SC3, i) - x(SC9, i)
+ x4 = x(SC4, i) - x(SC10, i)
+ x5 = x(SC5, i) - x(SC11, i)
+ x6 = x(SC6, i) - x(SC12, i)
+
+ x(SC1, i) = y1
+ x(SC2, i) = y2
+ x(SC3, i) = y3
+ x(SC4, i) = y4
+ x(SC5, i) = y5
+ x(SC6, i) = y6
+ x(SC7, i) = y1
+ x(SC8, i) = y2
+ x(SC9, i) = y3
+ x(SC10, i) = y4
+ x(SC11, i) = y5
+ x(SC12, i) = y6
+
+# undef J
+# define J 2
+# include "clover_mult_a.h90"
+
+ x(SC1, i) = x(SC1, i) + y1
+ x(SC2, i) = x(SC2, i) + y2
+ x(SC3, i) = x(SC3, i) + y3
+ x(SC4, i) = x(SC4, i) + y4
+ x(SC5, i) = x(SC5, i) + y5
+ x(SC6, i) = x(SC6, i) + y6
+ x(SC7, i) = x(SC7, i) - y1
+ x(SC8, i) = x(SC8, i) - y2
+ x(SC9, i) = x(SC9, i) - y3
+ x(SC10, i) = x(SC10, i) - y4
+ x(SC11, i) = x(SC11, i) - y5
+ x(SC12, i) = x(SC12, i) - y6
+
+ enddo
+
+ TIMING_STOP(timing_bin_clover_mult_ao)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.F90
new file mode 100644
index 0000000000000000000000000000000000000000..b0d30d053b882b1cdbc68c9ab3b09d30956e096f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.F90
@@ -0,0 +1,88 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_mult_b.F90
+!
+!-------------------------------------------------------------------------------
+# define CLOVER_AS_COMPLEX_ARRAY
+# include "defs.h"
+# include "clover.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_mult_b(b, x, volh) ! x := B x
+
+ implicit none
+
+ COMPLEX, dimension(18, 2, *) :: b
+ COMPLEX, dimension(NDIRAC, NCOL, *) :: x
+ integer :: volh
+
+ integer :: i
+ COMPLEX :: x1, x2, x3, x4, x5, x6
+ COMPLEX :: y1, y2, y3, y4, y5, y6
+
+ TIMING_START(timing_bin_clover_mult_b)
+
+ !$omp parallel do private(x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6)
+ do i = 1, volh
+
+ y1 = x(SC1, i) + x(SC7, i)
+ y2 = x(SC2, i) + x(SC8, i)
+ y3 = x(SC3, i) + x(SC9, i)
+ y4 = x(SC4, i) + x(SC10, i)
+ y5 = x(SC5, i) + x(SC11, i)
+ y6 = x(SC6, i) + x(SC12, i)
+
+# define J 1
+# include "clover_mult_b.h90"
+
+ y1 = x(SC1, i) - x(SC7, i)
+ y2 = x(SC2, i) - x(SC8, i)
+ y3 = x(SC3, i) - x(SC9, i)
+ y4 = x(SC4, i) - x(SC10, i)
+ y5 = x(SC5, i) - x(SC11, i)
+ y6 = x(SC6, i) - x(SC12, i)
+
+ x(SC1, i) = x1
+ x(SC2, i) = x2
+ x(SC3, i) = x3
+ x(SC4, i) = x4
+ x(SC5, i) = x5
+ x(SC6, i) = x6
+ x(SC7, i) = x1
+ x(SC8, i) = x2
+ x(SC9, i) = x3
+ x(SC10, i) = x4
+ x(SC11, i) = x5
+ x(SC12, i) = x6
+
+# undef J
+# define J 2
+# include "clover_mult_b.h90"
+
+ x(SC1, i) = x(SC1, i) + x1
+ x(SC2, i) = x(SC2, i) + x2
+ x(SC3, i) = x(SC3, i) + x3
+ x(SC4, i) = x(SC4, i) + x4
+ x(SC5, i) = x(SC5, i) + x5
+ x(SC6, i) = x(SC6, i) + x6
+ x(SC7, i) = x(SC7, i) - x1
+ x(SC8, i) = x(SC8, i) - x2
+ x(SC9, i) = x(SC9, i) - x3
+ x(SC10, i) = x(SC10, i) - x4
+ x(SC11, i) = x(SC11, i) - x5
+ x(SC12, i) = x(SC12, i) - x6
+
+ enddo
+
+ TIMING_STOP(timing_bin_clover_mult_b)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.h90
new file mode 100644
index 0000000000000000000000000000000000000000..f38943b10b35aaf30abdd18030e26f65e67a6930
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.h90
@@ -0,0 +1,15 @@
+ y2 = y2 - B21 * y1
+ y3 = y3 - B31 * y1 - B32 * y2
+ y4 = y4 - B41 * y1 - B42 * y2 - B43 * y3
+ y5 = y5 - B51 * y1 - B52 * y2 - B53 * y3 - B54 * y4
+ y6 = y6 - B61 * y1 - B62 * y2 - B63 * y3 - B64 * y4 - B65 * y5
+
+ x6 = y6 * B66
+ x5 = y5 * B55 - B56 * x6
+ x4 = y4 * B44 - B45 * x5 - B46 * x6
+ x3 = y3 * B33 - B34 * x4 - B35 * x5 &
+ - B36 * x6
+ x2 = y2 * B22 - B23 * x3 - B24 * x4 &
+ - B25 * x5 - B26 * x6
+ x1 = y1 * B11 - B12 * x2 - B13 * x3 &
+ - B14 * x4 - B15 * x5 - B16 * x6
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_t_init.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_t_init.F90
new file mode 100644
index 0000000000000000000000000000000000000000..1c98847df50f08a5779ce46a72390f5d3b48243d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_t_init.F90
@@ -0,0 +1,56 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_t_init.F90 - calculates T
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_t_init(t, b)
+
+ use typedef_clover
+ use module_vol
+ implicit none
+
+ CLOVER_FIELD_C :: t
+ type(type_clover_b) :: b(2, volh)
+
+ SPINCOL_FIELD :: x
+!dir$ cache_align x
+
+ integer :: c1, c2, s1, s2, i
+
+ do c2 = 1, NCOL
+ do s2 = 1, NDIRAC
+
+ x = ZERO
+ !$omp parallel do
+ do i = 1, volh
+ x(s2, c2, i) = ONE
+ enddo
+
+ call clover_mult_b(b, x, volh)
+
+ !$omp parallel do private(c1, s1)
+ do i = 1, volh
+ do c1 = 1, NCOL
+ do s1 = 1, NDIRAC
+ t(s1, c1, s2, c2, i) = x(s1, c1, i)
+ enddo
+ enddo
+ enddo
+
+ enddo
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts.F90
new file mode 100644
index 0000000000000000000000000000000000000000..c0fb1a8f81f1bf8612951f5bb451f6d157736db8
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts.F90
@@ -0,0 +1,165 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_ts.F90 - calculates T * sigma
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts(mu, nu, w, t) ! w = t sigma_mu_nu
+
+ use module_vol
+ implicit none
+
+ integer :: mu, nu
+ SU3_FIELD :: w
+ CLOVER_FIELD_C :: t
+
+ if (mu == 1) then
+ if (nu == 2) then ; call clover_ts_12(w, t)
+ elseif (nu == 3) then ; call clover_ts_13(w, t)
+ elseif (nu == 4) then ; call clover_ts_14(w, t) ; endif
+ elseif (mu == 2) then
+ if (nu == 1) then ; call clover_ts_21(w, t)
+ elseif (nu == 3) then ; call clover_ts_23(w, t)
+ elseif (nu == 4) then ; call clover_ts_24(w, t) ; endif
+ elseif (mu == 3) then
+ if (nu == 1) then ; call clover_ts_31(w, t)
+ elseif (nu == 2) then ; call clover_ts_32(w, t)
+ elseif (nu == 4) then ; call clover_ts_34(w, t) ; endif
+ elseif (mu == 4) then
+ if (nu == 1) then ; call clover_ts_41(w, t)
+ elseif (nu == 2) then ; call clover_ts_42(w, t)
+ elseif (nu == 3) then ; call clover_ts_43(w, t) ; endif
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_12(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = -t(1, c1, 1, c2, i) &
+ + t(2, c1, 2, c2, i) &
+ - t(3, c1, 3, c2, i) &
+ + t(4, c1, 4, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_21(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = t(1, c1, 1, c2, i) &
+ - t(2, c1, 2, c2, i) &
+ + t(3, c1, 3, c2, i) &
+ - t(4, c1, 4, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_13(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = i_times(t(1, c1, 2, c2, i)) &
+ - i_times(t(2, c1, 1, c2, i)) &
+ + i_times(t(3, c1, 4, c2, i)) &
+ - i_times(t(4, c1, 3, c2, i))
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_31(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = -i_times(t(1, c1, 2, c2, i)) &
+ + i_times(t(2, c1, 1, c2, i)) &
+ - i_times(t(3, c1, 4, c2, i)) &
+ + i_times(t(4, c1, 3, c2, i))
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_14(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = t(1, c1, 4, c2, i) &
+ + t(2, c1, 3, c2, i) &
+ + t(3, c1, 2, c2, i) &
+ + t(4, c1, 1, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_41(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = -t(1, c1, 4, c2, i) &
+ - t(2, c1, 3, c2, i) &
+ - t(3, c1, 2, c2, i) &
+ - t(4, c1, 1, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_23(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = -t(1, c1, 2, c2, i) &
+ - t(2, c1, 1, c2, i) &
+ - t(3, c1, 4, c2, i) &
+ - t(4, c1, 3, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_32(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = t(1, c1, 2, c2, i) &
+ + t(2, c1, 1, c2, i) &
+ + t(3, c1, 4, c2, i) &
+ + t(4, c1, 3, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_24(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = i_times(t(1, c1, 4, c2, i)) &
+ - i_times(t(2, c1, 3, c2, i)) &
+ + i_times(t(3, c1, 2, c2, i)) &
+ - i_times(t(4, c1, 1, c2, i))
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_42(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = -i_times(t(1, c1, 4, c2, i)) &
+ + i_times(t(2, c1, 3, c2, i)) &
+ - i_times(t(3, c1, 2, c2, i)) &
+ + i_times(t(4, c1, 1, c2, i))
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_34(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = t(1, c1, 3, c2, i) &
+ - t(2, c1, 4, c2, i) &
+ + t(3, c1, 1, c2, i) &
+ - t(4, c1, 2, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_43(w, t)
+
+# include "clover_ts_head.h90"
+ w(c1, c2, i) = -t(1, c1, 3, c2, i) &
+ + t(2, c1, 4, c2, i) &
+ - t(3, c1, 1, c2, i) &
+ + t(4, c1, 2, c2, i)
+# include "clover_ts_tail.h90"
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_head.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_head.h90
new file mode 100644
index 0000000000000000000000000000000000000000..fb209369c093d42e29f54e163f8612a271a45387
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_head.h90
@@ -0,0 +1,17 @@
+ use module_vol
+ implicit none
+
+ SU3_FIELD :: w
+ CLOVER_FIELD_C :: t
+
+ integer :: i, c1, c2
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ !$omp parallel do private(c1, c2)
+ do i = 1, volh
+ do c2 = 1, NCOL
+ do c1 = 1, NCOL
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_tail.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_tail.h90
new file mode 100644
index 0000000000000000000000000000000000000000..46655ebcdf9f049df5ea6453a96851d927592acb
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_tail.h90
@@ -0,0 +1,5 @@
+ enddo
+ enddo
+ enddo
+
+end
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuu.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuu.F90
new file mode 100644
index 0000000000000000000000000000000000000000..0b513a2c746dba130f2208c72cad2dfb932439f3
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuu.F90
@@ -0,0 +1,116 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_uuu.F90 - multiplications of three SU(3) matrices
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_uuu(r, u, v, w) ! r = u * v * w
+
+ implicit none
+ SU3 :: r, u, v, w
+ integer :: i, j, k, l
+
+ do i = 1, NCOL
+ do l = 1, NCOL
+ r(i, l) = ZERO
+ do j = 1, NCOL
+ do k = 1, NCOL
+ r(i, l) = r(i, l) + u(i, j) * v(j, k) * w(k, l)
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_duu(r, u, v, w) ! r = u+ * v * w
+
+ implicit none
+ SU3 :: r, u, v, w
+ integer :: i, j, k, l
+
+ do i = 1, NCOL
+ do l = 1, NCOL
+ r(i, l) = ZERO
+ do j = 1, NCOL
+ do k = 1, NCOL
+ r(i, l) = r(i, l) + conjg(u(j, i)) * v(j, k) * w(k, l)
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_udu(r, u, v, w) ! r = u * v+ * w
+
+ implicit none
+ SU3 :: r, u, v, w
+ integer :: i, j, k, l
+
+ do i = 1, NCOL
+ do l = 1, NCOL
+ r(i, l) = ZERO
+ do j = 1, NCOL
+ do k = 1, NCOL
+ r(i, l) = r(i, l) + u(i, j) * conjg(v(k, j)) * w(k, l)
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_uud(r, u, v, w) ! r = u * v * w+
+
+ implicit none
+ SU3 :: r, u, v, w
+ integer :: i, j, k, l
+
+ do i = 1, NCOL
+ do l = 1, NCOL
+ r(i, l) = ZERO
+ do j = 1, NCOL
+ do k = 1, NCOL
+ r(i, l) = r(i, l) + u(i, j) * v(j, k) * conjg(w(l, k))
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_dud(r, u, v, w) ! r = u+ * v * w+
+
+ implicit none
+ SU3 :: r, u, v, w
+ integer :: i, j, k, l
+
+ do i = 1, NCOL
+ do l = 1, NCOL
+ r(i, l) = ZERO
+ do j = 1, NCOL
+ do k = 1, NCOL
+ r(i, l) = r(i, l) + conjg(u(j, i)) * v(j, k) * conjg(w(l, k))
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuuu.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuuu.F90
new file mode 100644
index 0000000000000000000000000000000000000000..a9994b8dc041702c348dea588210dc4e4162e017
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuuu.F90
@@ -0,0 +1,117 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_uuuu.F90 - multiplications of four SU(3) matrices
+!
+!-------------------------------------------------------------------------------
+!
+! --<-- --<--
+! | | | |
+! v 2 ^ v 1 ^
+! | | | |
+! -->-- -->--
+! x
+! --<-- --<--
+! | | | |
+! v 3 ^ v 4 ^
+! | | | |
+! -->-- -->--
+!
+! uuuu1: uuuu += u1 u2 u3+ u4+ ! + = dagger
+! uuuu2: uuuu += u1 u2+ u3+ u4
+! uuuu3: uuuu += u1+ u2+ u3 u4
+! uuuu4: uuuu += u1+ u2 u3 u4+
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuuu1(uuuu, u1, u2, u3, u4)
+
+ implicit none
+ SU3 :: uuuu, u1, u2, u3, u4
+ integer :: i, j, k, l, m
+
+ do i = 1, NCOL
+ do m = 1, NCOL
+ do j = 1, NCOL
+ do k = 1, NCOL
+ do l = 1, NCOL
+ uuuu(i,m)= uuuu(i,m)+ u1(i,j) * u2(j,k) * conjg(u3(l,k)) * conjg(u4(m,l))
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuuu2(uuuu, u1, u2, u3, u4)
+
+ implicit none
+ SU3 :: uuuu, u1, u2, u3, u4
+ integer :: i, j, k, l, m
+
+ do i = 1, NCOL
+ do m = 1, NCOL
+ do j = 1, NCOL
+ do k = 1, NCOL
+ do l = 1, NCOL
+ uuuu(i,m)= uuuu(i,m)+ u1(i,j) * conjg(u2(k,j)) * conjg(u3(l,k)) * u4(l,m)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+end
+
+
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuuu3(uuuu, u1, u2, u3, u4)
+
+ implicit none
+ SU3 :: uuuu, u1, u2, u3, u4
+ integer :: i, j, k, l, m
+
+ do i = 1, NCOL
+ do m = 1, NCOL
+ do j = 1, NCOL
+ do k = 1, NCOL
+ do l = 1, NCOL
+ uuuu(i,m)= uuuu(i,m)+ conjg(u1(j,i)) * conjg(u2(k,j)) * u3(k,l) * u4(l,m)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuuu4(uuuu, u1, u2, u3, u4)
+
+ implicit none
+ SU3 :: uuuu, u1, u2, u3, u4
+ integer :: i, j, k, l, m
+
+ do i = 1, NCOL
+ do m = 1, NCOL
+ do j = 1, NCOL
+ do k = 1, NCOL
+ do l = 1, NCOL
+ uuuu(i,m)= uuuu(i,m)+ conjg(u1(j,i)) * u2(j,k) * u3(k,l) * conjg(u4(m,l))
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/ctest.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/ctest.F90
new file mode 100644
index 0000000000000000000000000000000000000000..942ab53a936812d9bdec4f106ad51c82df549b8f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/ctest.F90
@@ -0,0 +1,151 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! ctest.F90 - test of clover matrix multiplications: is (A * inv(A) = 1) ?
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "clover.h"
+
+!-------------------------------------------------------------------------------
+program ctest
+
+ use typedef_clover
+ implicit none
+
+ integer, parameter :: volh = 1
+ integer, parameter :: nz = 1
+
+ type(type_clover_a) :: a(2, volh), ainv(2, volh)
+ type(type_clover_b) :: b(2, volh)
+
+ COMPLEX, dimension(NDIRAC, NCOL, volh) :: z, r
+
+ integer :: i, j, s, c
+
+ do i = 1, volh
+ do j = 1, 2
+ call cinit(a(j, i))
+ call clover_inv(b(j, i), ainv(j, i), a(j, i))
+ enddo
+ enddo
+
+
+ do j = 1,12
+ call zinit(z, j, volh)
+ !!call clover_mult_b(b, z, volh)
+ call clover_mult_ao(ainv, z, volh)
+ call clover_mult_a(r, a, z, volh)
+ call zwrite(r, j, volh)
+
+! call zinit(z, j+6, volh)
+! call clover_mult_b(b, z, volh)
+! call zwrite(z, j+6, volh)
+
+! call zinit(z, j+6, volh)
+! call clover_mult_a(r, a, z, volh)
+! call zwrite(r, j+6, volh)
+
+
+!! call clover_mult_b(b, r, volh)
+ !!call zwrite(r, j, volh)
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine cinit(a)
+
+ use typedef_clover
+ implicit none
+ type(type_clover_a) :: a
+ real, intrinsic :: ranf
+
+ A11 = ranf()
+ A22 = ranf()
+ A33 = ranf()
+ A44 = ranf()
+ A55 = ranf()
+ A66 = ranf()
+
+ A12 = cmplx(ranf(), ranf())
+ A13 = cmplx(ranf(), ranf())
+ A14 = cmplx(ranf(), ranf())
+ A15 = cmplx(ranf(), ranf())
+ A16 = cmplx(ranf(), ranf())
+
+ A23 = cmplx(ranf(), ranf())
+ A24 = cmplx(ranf(), ranf())
+ A25 = cmplx(ranf(), ranf())
+ A26 = cmplx(ranf(), ranf())
+
+ A34 = cmplx(ranf(), ranf())
+ A35 = cmplx(ranf(), ranf())
+ A36 = cmplx(ranf(), ranf())
+
+ A45 = cmplx(ranf(), ranf())
+ A46 = cmplx(ranf(), ranf())
+
+ A56 = cmplx(ranf(), ranf())
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine zinit(z, j, volh)
+
+ implicit none
+ integer :: volh, i, j
+ COMPLEX, dimension(NDIRAC, NCOL, volh) :: z
+
+ z = 0
+ do i = 1, volh
+ if (j == 1) z(SC1, i) = 1
+ if (j == 2) z(SC2, i) = 1
+ if (j == 3) z(SC3, i) = 1
+ if (j == 4) z(SC4, i) = 1
+ if (j == 5) z(SC5, i) = 1
+ if (j == 6) z(SC6, i) = 1
+ if (j == 7) z(SC7, i) = 1
+ if (j == 8) z(SC8, i) = 1
+ if (j == 9) z(SC9, i) = 1
+ if (j == 10) z(SC10, i) = 1
+ if (j == 11) z(SC11, i) = 1
+ if (j == 12) z(SC12, i) = 1
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine zwrite(z, j, volh)
+
+ implicit none
+ integer :: volh, i, j
+ COMPLEX, dimension(NDIRAC, NCOL, volh) :: z
+
+ write(6,*) "-----------------------------------------------"
+ do i = 1, volh
+ write(6, "(4i4,2f16.8)") j, i, SC1, z(SC1, i)
+ write(6, "(4i4,2f16.8)") j, i, SC2, z(SC2, i)
+ write(6, "(4i4,2f16.8)") j, i, SC3, z(SC3, i)
+ write(6, "(4i4,2f16.8)") j, i, SC4, z(SC4, i)
+ write(6, "(4i4,2f16.8)") j, i, SC5, z(SC5, i)
+ write(6, "(4i4,2f16.8)") j, i, SC6, z(SC6, i)
+ write(6,*)
+ write(6, "(4i4,2f16.8)") j, i, SC7, z(SC7, i)
+ write(6, "(4i4,2f16.8)") j, i, SC8, z(SC8, i)
+ write(6, "(4i4,2f16.8)") j, i, SC9, z(SC9, i)
+ write(6, "(4i4,2f16.8)") j, i, SC10, z(SC10, i)
+ write(6, "(4i4,2f16.8)") j, i, SC11, z(SC11, i)
+ write(6, "(4i4,2f16.8)") j, i, SC12, z(SC12, i)
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..8dd716e619545bfc4323d61b9c446881b58769d1
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/Makefile
@@ -0,0 +1,89 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# comm/Makefile
+#
+#===============================================================================
+
+include ../Makefile.defs
+
+MODULES_DIR = ../modules
+
+.SUFFIXES:
+.SUFFIXES: .a .o .F90
+
+.F90.o:
+ $(FPP) -I.. $(FPPFLAGS) $< > $*.f90
+ $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $*.f90
+
+
+OBJS_MPI = \
+ dotprod.o \
+ comm_mpi.o \
+ allocate.o \
+ field_io_mpi.o \
+ pes_mpi.o \
+ reduction_mpi.o \
+ seed_mpi.o \
+ xbound_mpi.o
+
+OBJS_SHMEM = \
+ dotprod.o \
+ comm_shmem.o \
+ allocate_shmem.o \
+ field_io_shmem.o \
+ reduction_shmem.o \
+ seed_shmem.o \
+ xbound_shmem.o
+
+OBJS_SHMEMPI = \
+ dotprod.o \
+ comm_shmempi.o \
+ allocate_shmem.o \
+ field_io_mpi.o \
+ pes_mpi.o \
+ reduction_mpi.o \
+ seed_mpi.o \
+ xbound_shmem.o
+
+OBJS_SINGLE_PE = \
+ dotprod.o \
+ allocate.o \
+ comm_single_pe.o \
+ field_io_single_pe.o \
+ pes_single_pe.o \
+ reduction_single_pe.o \
+ seed_single_pe.o \
+ xbound_single_pe.o
+
+$(LIBCOMM):
+
+fast:
+ $(FAST_MAKE)
+
+lib_mpi.a: $(OBJS_MPI)
+ $(AR) $(ARFLAGS) $@ $(OBJS_MPI)
+ $(RANLIB) $@
+
+lib_shmem.a: $(OBJS_SHMEM)
+ $(AR) $(ARFLAGS) $@ $(OBJS_SHMEM)
+ $(RANLIB) $@
+
+lib_shmempi.a: $(OBJS_SHMEMPI)
+ $(AR) $(ARFLAGS) $@ $(OBJS_SHMEMPI)
+ $(RANLIB) $@
+
+lib_single_pe.a: $(OBJS_SINGLE_PE)
+ $(AR) $(ARFLAGS) $@ $(OBJS_SINGLE_PE)
+ $(RANLIB) $@
+
+clobber:
+ rm -f *.[Tiod] *.f90 *.mod work.pc work.pcl
+ rm -f lib_mpi.a lib_shmem.a lib_shmempi.a lib_single_pe.a
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate.F90
new file mode 100644
index 0000000000000000000000000000000000000000..6993128034183d80f1efac6cb00aa360b7a8bfb2
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate.F90
@@ -0,0 +1,133 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! allocate.F90 - allocation of gauge and pseudo fermion fields
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine allocate_g_field(u)
+
+ use module_vol
+ implicit none
+ P_GAUGE_FIELD :: u
+
+ if (associated(u)) then
+ call die("allocate_g_field(): memory leak")
+ else
+ allocate(u(NCOL, NCOL, volh_tot, EVEN:ODD, DIM))
+ call conf_zero(u)
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_g_field_io(u)
+
+ use module_lattice_io
+ implicit none
+ P_GAUGE_FIELD_IO :: u
+
+ if (associated(u)) then
+ call die("allocate_g_field_io(): memory leak")
+ else
+ allocate(u(NCOL, NCOL-1, DIM, 0:NX-1, 0:NY-1, 0:NZ-1, 0:NT-1))
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_gen_field(x)
+
+ use module_vol
+ implicit none
+ P_GENERATOR_FIELD :: x
+
+ integer :: i, eo, mu
+
+ if (associated(x)) then
+ call die("allocate_gen_field(): memory leak")
+ else
+ allocate(x(NGEN, volh_tot, EVEN:ODD, DIM))
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ !$omp parallel do
+ do i = 1, volh
+ x(1, i, eo, mu) = ZERO
+ x(2, i, eo, mu) = ZERO
+ x(3, i, eo, mu) = ZERO
+ x(4, i, eo, mu) = ZERO
+ x(5, i, eo, mu) = ZERO
+ x(6, i, eo, mu) = ZERO
+ x(7, i, eo, mu) = ZERO
+ x(8, i, eo, mu) = ZERO
+ enddo
+ enddo
+ enddo
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_field(x)
+
+ use module_vol
+ implicit none
+ P_SPINCOL_FIELD :: x
+
+ if (associated(x)) then
+ call die("allocate_sc_field(): memory leak")
+ else
+ allocate(x(NDIRAC, NCOL, volh_tot))
+ call sc_zero(x)
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_field_io(x)
+
+ use module_lattice_io
+ implicit none
+ P_SPINCOL_FIELD_IO :: x
+
+ if (associated(x)) then
+ call die("allocate_sc_field_io(): memory leak")
+ else
+ allocate(x(NDIRAC, NCOL, 0:NXH-1, 0:NY-1, 0:NZ-1, 0:NT-1))
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_overindexed(x)
+
+ use module_vol
+ implicit none
+ P_SPINCOL_OVERINDEXED :: x
+
+ if (associated(x)) then
+ call die("allocate_sc_overindexed(): memory leak")
+ else
+ allocate(x(SIZE_COMPLEX*NDIRAC*NCOL*volh_tot))
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc2_field(x)
+
+ use module_vol
+ implicit none
+ P_SC2_FIELD :: x
+
+ if (associated(x)) then
+ call die("allocate_sc2_field(): memory leak")
+ else
+ allocate(x(2, NCOL, volh_tot, DIM, FWD:BWD))
+ endif
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..8e79bcea19c76dca26bf02917496e3b60f04f2cb
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate_shmem.F90
@@ -0,0 +1,229 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! allocate_shmem.F90 - allocation of gauge and pseudo fermion fields using shmem
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+subroutine allocate_g_field(u)
+
+ use module_vol
+ implicit none
+ P_GAUGE_FIELD :: u
+
+ GAUGE_FIELD :: uu
+ pointer (p_uu, uu)
+
+ integer :: ierr
+
+ if (associated(u)) call die("allocate_g_field(): memory leak")
+
+ call barrier()
+ call shpalloc(p_uu, SIZE_COMPLEX * NCOL * NCOL * volh_tot * 2 * DIM, ierr, 1)
+ call cray_pointer_to_f90_pointer(uu)
+
+CONTAINS
+
+ subroutine cray_pointer_to_f90_pointer(uu)
+
+ implicit none
+ GAUGE_FIELD, target :: uu
+
+ u => uu
+ end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_g_field_io(u)
+
+ use module_lattice_io
+ use module_vol
+ implicit none
+ P_GAUGE_FIELD_IO :: u
+
+ GAUGE_FIELD_IO :: uu
+ pointer (p_uu, uu)
+
+ integer :: ierr
+
+ if (associated(u)) call die("allocate_g_field_io(): memory leak")
+
+ call barrier()
+ call shpalloc(p_uu, SIZE_COMPLEX * NCOL * (NCOL-1) * DIM * vol, ierr, 1)
+ call cray_pointer_to_f90_pointer(uu)
+
+CONTAINS
+
+ subroutine cray_pointer_to_f90_pointer(uu)
+
+ implicit none
+ GAUGE_FIELD_IO, target :: uu
+
+ u => uu
+ end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_gen_field(u)
+
+ use module_vol
+ implicit none
+ P_GENERATOR_FIELD :: u
+
+ GENERATOR_FIELD :: uu
+ pointer (p_uu, uu)
+
+ integer :: ierr
+
+ if (associated(u)) call die("allocate_gen_field(): memory leak")
+
+ call barrier()
+ call shpalloc(p_uu, NGEN * volh_tot * 2 * DIM, ierr, 1)
+ call cray_pointer_to_f90_pointer(uu)
+
+CONTAINS
+
+ subroutine cray_pointer_to_f90_pointer(uu)
+
+ implicit none
+ GENERATOR_FIELD, target :: uu
+
+ u => uu
+ end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_field(x)
+
+ use module_vol
+ implicit none
+ P_SPINCOL_FIELD :: x
+
+ SPINCOL_FIELD :: xx
+ pointer (p_xx, xx)
+
+ integer :: ierr
+
+ if (associated(x)) call die("allocate_sc_field(): memory leak")
+
+ call barrier()
+ call shpalloc(p_xx, SIZE_COMPLEX * NDIRAC * NCOL * volh_tot, ierr, 1)
+ call cray_pointer_to_f90_pointer(xx)
+
+CONTAINS
+
+ subroutine cray_pointer_to_f90_pointer(xx)
+
+ implicit none
+ SPINCOL_FIELD, target :: xx
+
+ x => xx
+ end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc2_field(x)
+
+ use module_vol
+ implicit none
+ P_SC2_FIELD :: x
+
+ SC2_FIELD :: xx
+ pointer (p_xx, xx)
+
+ integer :: ierr
+
+ if (associated(x)) call die("allocate_sc2_field(): memory leak")
+
+ call barrier()
+ call shpalloc(p_xx, SIZE_COMPLEX * 2 * NCOL * volh_tot * DIM * 2, ierr, 1)
+ call cray_pointer_to_f90_pointer(xx)
+
+CONTAINS
+
+ subroutine cray_pointer_to_f90_pointer(xx)
+
+ implicit none
+ SC2_FIELD, target :: xx
+
+ x => xx
+ end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_field_io(x)
+
+ use module_lattice_io
+ use module_vol
+ implicit none
+ P_SPINCOL_FIELD_IO :: x
+
+ SPINCOL_FIELD_IO :: xx
+ pointer (p_xx, xx)
+
+ integer :: ierr
+
+ if (associated(x)) call die("allocate_sc_field_io(): memory leak")
+
+ call barrier()
+ call shpalloc(p_xx, SIZE_COMPLEX * NDIRAC * NCOL * volh, ierr, 1)
+ call cray_pointer_to_f90_pointer(xx)
+
+CONTAINS
+
+ subroutine cray_pointer_to_f90_pointer(xx)
+
+ implicit none
+ SPINCOL_FIELD_IO, target :: xx
+
+ x => xx
+ end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_overindexed(x)
+
+ use module_vol
+ implicit none
+ P_SPINCOL_OVERINDEXED :: x
+
+ SPINCOL_OVERINDEXED :: xx
+ pointer (p_xx, xx)
+
+ integer :: ierr
+
+ if (associated(x)) call die("allocate_sc_overindexed(): memory leak")
+
+ call barrier()
+ call shpalloc(p_xx, SIZE_COMPLEX * NDIRAC * NCOL * volh_tot, ierr, 1)
+ call cray_pointer_to_f90_pointer(xx)
+
+CONTAINS
+
+ subroutine cray_pointer_to_f90_pointer(xx)
+
+ implicit none
+ SPINCOL_OVERINDEXED, target :: xx
+
+ x => xx
+ end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/bqcd.pcl
new file mode 100644
index 0000000000000000000000000000000000000000..906244500b31700684482c3dcfd32f6cec4279db
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/bqcd.pcl
@@ -0,0 +1,2 @@
+work.pc
+../modules/work.pc
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..cf4de03f19d6fd754f7db9491642aab5330eb56c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_mpi.F90
@@ -0,0 +1,46 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! comm_mpi.F90 - wrapper for MPI routines
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine comm_init()
+
+ implicit none
+ include 'mpif.h'
+ integer ierror
+
+! call mpi_init(ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine comm_finalize()
+
+ implicit none
+ include 'mpif.h'
+ integer ierror
+
+! call mpi_finalize(ierror)
+end
+
+!-------------------------------------------------------------------------------
+COMM_METHOD function comm_method()
+
+#ifdef _OPENMP
+ comm_method = "MPI + OpenMP"
+#else
+ comm_method = "MPI"
+#endif
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..84a096d709fea1ae7a90083fce27df6ff44cd4da
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmem.F90
@@ -0,0 +1,44 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! comm_shmem.F90 - routines for shmem versions
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine comm_init()
+#ifdef ALTIX
+ call start_pes(0)
+#endif
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine comm_finalize()
+ return
+end
+
+!-------------------------------------------------------------------------------
+COMM_METHOD function comm_method()
+
+#ifdef _OPENMP
+ comm_method = "shmem + OpenMP"
+#else
+ comm_method = "shmem"
+#endif
+end
+
+!-------------------------------------------------------------------------------
+integer function get_d3_buffer_vol()
+ get_d3_buffer_vol = 0
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmempi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmempi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..d592215714ab6e724bcec00ad5138d5601589336
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmempi.F90
@@ -0,0 +1,51 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! comm_shmempi.F90 - MPI + shmem on Altix
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine comm_init()
+
+ implicit none
+ include 'mpif.h'
+ integer ierror
+
+ call mpi_init(ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine comm_finalize()
+
+ implicit none
+ include 'mpif.h'
+ integer ierror
+
+ call mpi_finalize(ierror)
+end
+
+!-------------------------------------------------------------------------------
+COMM_METHOD function comm_method()
+
+#ifdef _OPENMP
+ comm_method = "shmem/MPI + OpenMP"
+#else
+ comm_method = "shmem/MPI"
+#endif
+end
+
+!-------------------------------------------------------------------------------
+integer function get_d3_buffer_vol()
+ get_d3_buffer_vol = 0
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..a977e50363d06772a61c3176c267543584f705f3
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_single_pe.F90
@@ -0,0 +1,41 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! comm_single_pe.F90 - (dummy) routines for single CPU version
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine comm_init()
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine comm_finalize()
+ return
+end
+
+!-------------------------------------------------------------------------------
+COMM_METHOD function comm_method()
+
+#ifdef _OPENMP
+ comm_method = "single_pe + OpenMP"
+#else
+ comm_method = "single_pe"
+#endif
+end
+
+!-------------------------------------------------------------------------------
+integer function get_d3_buffer_vol()
+ get_d3_buffer_vol = 0
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/dotprod.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/dotprod.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3600c9039f33867db6ec3cee765797bc041d0fe4
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/dotprod.F90
@@ -0,0 +1,33 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dotprod.F90 - dot product for parallel computers
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+REAL function dotprod(a, b, n)
+
+ implicit none
+ integer i, n
+ REAL a(n), b(n), s, global_sum
+
+ s = ZERO
+ !$omp parallel do reduction(+: s)
+ do i = 1, n
+ s = s + a(i) * b(i)
+ enddo
+
+ dotprod = global_sum(s)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..83843827fafee23e802b7692cb5bab6482e1ef2a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_mpi.F90
@@ -0,0 +1,240 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! field_io_mpi.F90 - I/O routine for gauge and pseudo fermion fields using MPI
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+#ifndef INCLUDE_MPIF_H
+#define INCLUDE_MPIF_H include 'mpif.h'
+#endif
+
+!-------------------------------------------------------------------------------
+subroutine field_io(action, m, mx, field, cksum) ! read or write g- or sc-field
+
+ use typedef_cksum
+ use module_decomp
+ use module_function_decl
+ use module_lattice_io
+ use module_vol
+ implicit none
+ INCLUDE_MPIF_H
+
+ character(len = *) :: action
+ integer :: m, mx
+ type(type_cksum) :: cksum(0:LT - 1)
+
+ REAL :: field(SIZE_COMPLEX * m, 0:mx - 1, 0:NY - 1, 0:NZ - 1, 0:NT - 1)
+
+ REAL :: buffer(0:(SIZE_COMPLEX * m * mx * NPE(1))-1, 0:(NY * NPE(2))-1)
+
+ FILENAME :: file
+ integer :: i_pe(DIM)
+ integer, external :: i_global, ilex
+ integer :: x, y, z, t, t_global, me, pe, size, rec, recl
+ integer :: size_field
+ integer :: pe_x, pe_y, pe_z, pe_t
+ CHECK_SUM :: check_sum(2), n_bytes
+ integer :: tt, pe_tt, pe_io
+ integer :: status(MPI_STATUS_SIZE), ierror
+ integer :: count, block_length, stride, buf_type
+ logical :: io_pe
+
+
+ count = NY
+ block_length = SIZE_COMPLEX * m * mx
+ stride = block_length * NPE(1)
+
+ size = block_length * NY ! words in send/recv
+ size_field = block_length * NY * NZ * NT ! words in "field"
+ n_bytes = size * NPE(1) * NPE(2) * RKIND ! bytes in "buffer"
+ recl = n_bytes ! cast to standard integer
+
+ ASSERT(mod(recl, RECL_UNIT) == 0)
+ recl = recl / RECL_UNIT
+
+ i_pe = decomp%std%i_pe
+
+ call mpi_type_vector(count, block_length, stride, BQCD_REAL, buf_type, ierror)
+ call mpi_type_commit(buf_type, ierror)
+
+ if (action == "write") call swap_endian8(size_field, field)
+
+ if (i_pe(1) == 0 .and. i_pe(2) == 0 .and. i_pe(3) == 0) then
+ io_pe = .true.
+ else
+ io_pe = .false.
+ endif
+
+ pe_t = i_pe(4)
+ do t = 0, NT - 1
+ t_global = i_global(t, NT, i_pe(4))
+
+ file = cksum(t_global)%file
+
+ if (io_pe) then
+ open(UCONF, file = file, action = action, form = "unformatted", &
+ access = "direct", recl = recl)
+ endif
+
+ rec = 0
+ call cksum_init()
+
+ do pe_z = 0, NPE(3) - 1
+ do z = 0, NZ - 1
+ rec = rec + 1
+
+ if (io_pe .and. action == "read") then
+ read(UCONF, rec = rec) buffer
+ call cksum_add(buffer, n_bytes)
+ endif
+
+ do pe_y = 0, NPE(2) - 1
+ do pe_x = 0, NPE(1) - 1
+
+ y = count * pe_y
+ x = block_length * pe_x
+
+ call field_io_pes(pe, pe_io, (/pe_x, pe_y, pe_z, pe_t/))
+
+ if (io_pe) then
+ if (my_pe() /= pe_io) call die("my_pe() /= pe_io")
+ endif
+
+ if (my_pe() == pe .and. my_pe() == pe_io) then
+ call field_io_seq(action, count, block_length, stride, &
+ field(1,0,0,z,t), buffer(x,y))
+ else
+ if (action == "read") then
+ if (my_pe() == pe_io) then
+ call mpi_ssend(buffer(x,y), 1, buf_type, &
+ pe, 0, MPI_COMM_WORLD, ierror)
+ endif
+ if (my_pe() == pe) then
+ call mpi_recv(field(1,0,0,z,t), size, BQCD_REAL, &
+ pe_io, 0, MPI_COMM_WORLD, status, ierror)
+ endif
+ else
+ if (my_pe() == pe_io) then
+ call mpi_recv(buffer(x,y), 1, buf_type, &
+ pe, 0, MPI_COMM_WORLD, status, ierror)
+ endif
+ if (my_pe() == pe) then
+ call mpi_ssend(field(1,0,0,z,t), size, BQCD_REAL, &
+ pe_io, 0, MPI_COMM_WORLD, ierror)
+ endif
+ endif
+ endif
+
+ enddo
+ enddo
+
+ if (io_pe .and. action == "write") then
+ write(UCONF, rec = rec) buffer
+ call cksum_add(buffer, n_bytes)
+ endif
+ enddo
+ enddo
+
+ if (io_pe) then
+ close(UCONF)
+ call cksum_get(check_sum(1), check_sum(2))
+
+ if (action == "read") then
+
+ if (check_sum(1) /= cksum(t_global)%sum) then
+ call die("field_io(): check sum error in file " // file)
+ endif
+
+ else
+
+ if (my_pe() == 0) then
+ cksum(t_global)%sum = check_sum(1)
+ cksum(t_global)%bytes = check_sum(2)
+ do pe_tt = 1, NPE(4) - 1
+ tt = i_global(t, NT, pe_tt)
+ call mpi_recv(check_sum, 2, BQCD_CHECK_SUM, &
+ MPI_ANY_SOURCE, tt, MPI_COMM_WORLD, status, ierror)
+ cksum(tt)%sum = check_sum(1)
+ cksum(tt)%bytes = check_sum(2)
+ enddo
+ else
+ call mpi_ssend(check_sum, 2, BQCD_CHECK_SUM, 0, t_global, &
+ MPI_COMM_WORLD, ierror)
+ endif
+ endif
+ endif
+ enddo
+
+ call swap_endian8(size_field, field)
+ call mpi_type_free(buf_type, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine field_io_seq(action, count, block_length, stride, field, buffer)
+
+ use module_lattice_io
+ use module_vol
+ implicit none
+
+ character(len = *) :: action
+ integer :: count, block_length, stride
+ REAL :: field(*)
+ REAL :: buffer(*)
+ integer :: i, j, x, y
+
+ i = 0
+ j = 0
+ do y = 1, count
+ do x = 1, block_length
+ i = i + 1
+ if (action == "read") then
+ field(i) = buffer(j + x)
+ else
+ buffer(j + x) = field(i)
+ endif
+ enddo
+ j = j + stride
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine field_io_pes(pe, pe_io, x_std)
+
+ use module_lattice ! in contrast to the calling routine !!
+ implicit none
+ integer, intent(out) :: pe, pe_io
+ integer, intent(in) :: x_std(DIM)
+ integer :: x_act(DIM), x_std_io(DIM), x_act_io(DIM)
+ integer, external :: ilex
+
+ x_std_io(1) = 0
+ x_std_io(2) = 0
+ x_std_io(3) = 0
+ x_std_io(4) = x_std(4)
+
+ x_act(1) = x_std(gamma_index(1))
+ x_act(2) = x_std(gamma_index(2))
+ x_act(3) = x_std(gamma_index(3))
+ x_act(4) = x_std(gamma_index(4))
+
+ x_act_io(1) = x_std_io(gamma_index(1))
+ x_act_io(2) = x_std_io(gamma_index(2))
+ x_act_io(3) = x_std_io(gamma_index(3))
+ x_act_io(4) = x_std_io(gamma_index(4))
+
+ pe = ilex(DIM, x_act, NPE)
+ pe_io = ilex(DIM, x_act_io, NPE)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..00da858897bd42b72f8acc299cb36ed6c6481034
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_shmem.F90
@@ -0,0 +1,136 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! field_io_shmem.F90 - I/O routine for gauge and pseudo fermion fields (shmem)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+subroutine field_io(action, m, mx, field, cksum) ! read or write g- or sc-field
+
+ use typedef_cksum
+ use module_function_decl
+ use module_lattice
+ use module_vol
+ implicit none
+
+ character(len = *) :: action
+ integer :: m, mx
+ COMPLEX :: field(m, 0:mx-1, 0:NY-1, 0:NZ-1, 0:NT-1)
+ type(type_cksum) :: cksum(0:LT-1)
+
+!!! COMPLEX :: buffer(m * mx * NY, 0:NPE(2)-1)
+!!!!dir$ symmetric buffer
+ FILENAME :: file
+ integer :: i_pe(DIM)
+ integer, external :: i_global, ilex
+ integer :: t, t_global, z, me, pe, size, rec, recl
+ integer :: size_field
+ integer :: pe_x, pe_y, pe_z, pe_t
+ CHECK_SUM :: check_sum(2), n_bytes
+
+ COMPLEX :: buffer(m * mx * NY, 0:NPE(2)-1)
+ CHECK_SUM :: check_sum_master(2, 0:LT-1)
+
+ pointer(p_buffer, buffer)
+ pointer(p_check_sum_master, check_sum_master)
+
+ save p_buffer
+ save p_check_sum_master
+
+ logical, save :: initialized = .false.
+ integer :: ierr
+
+ if (.not. initialized) then
+ call barrier()
+ call shpalloc(p_buffer, SIZE_COMPLEX * m * mx * NY * NPE(2), ierr, 1)
+ call barrier()
+ call shpalloc(p_check_sum_master, 2 * LT, ierr, 1)
+ initialized = .true.
+ endif
+
+ call barrier()
+
+ call unlex(my_pe(), DIM, i_pe, NPE)
+
+ if (i_pe(2) == 0 .and. i_pe(3) == 0) then
+ size = SIZE_COMPLEX * m * mx * NY ! size in shmem
+ size_field = size * NZ * NT ! size of "field"
+ n_bytes = size * RKIND * NPE(2) ! no. of bytes of u_buf
+ recl = n_bytes ! cast to standard integer
+
+ ASSERT(mod(recl, RECL_UNIT) == 0)
+ recl = recl / RECL_UNIT
+
+ if (action == "write") call swap_endian8(size_field, field)
+
+ pe_t = i_pe(4)
+ do t = 0, NT - 1
+ t_global = i_global(t, NT, i_pe(4))
+
+ file = cksum(t_global)%file
+
+ open(UCONF, file = file, action = action, form = "unformatted", &
+ access = "direct", recl = recl)
+ rec = 0
+ call cksum_init()
+
+ do pe_z = 0, NPE(3) - 1
+ do z = 0, NZ - 1
+ rec = rec + 1
+
+ if (action == "read") then
+ read(UCONF, rec = rec) buffer
+ call cksum_add(buffer, n_bytes)
+ endif
+
+ do pe_y = 0, NPE(2) - 1
+ pe_x = 0
+ pe = ilex(DIM, (/pe_x, pe_y, pe_z, pe_t/), NPE)
+
+ if (action == "read") then
+ call shmem_put(field(1,0,0,z,t), buffer(1, pe_y), size, pe)
+ else
+ call shmem_get(buffer(1, pe_y), field(1,0,0,z,t), size, pe)
+ endif
+
+ enddo
+ if (action == "write") then
+ write(UCONF, rec = rec) buffer
+ call cksum_add(buffer, n_bytes)
+ endif
+ enddo
+ enddo
+ close(UCONF)
+ call cksum_get(check_sum(1), check_sum(2))
+ if (action == "read") then
+ if (check_sum(1) /= cksum(t_global)%sum) then
+ call die("field_io(): check sum error in file " // file)
+ endif
+ else
+ call shmem_put(check_sum_master(1, t_global), check_sum, 2, 0)
+ endif
+ enddo
+ endif
+ call barrier()
+
+ if (action == "write") then
+ do t_global = 0, LT - 1
+ cksum(t_global)%sum = check_sum_master(1, t_global)
+ cksum(t_global)%bytes = check_sum_master(2, t_global)
+ enddo
+ endif
+
+ call swap_endian8(size_field, field)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..e72e1030314a7218443002bf5e15447eba370efb
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_single_pe.F90
@@ -0,0 +1,52 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! field_io_single_pe.F90 - I/O routine for gauge and pseudo fermion fields
+! (single processor version)
+!
+!-------------------------------------------------------------------------------
+# define INCLUDE_MPIF_H
+
+# define MPI_STATUS_SIZE 2
+# define MPI_REAL8 0
+# define mpi_real8 0
+# define MPI_COMM_WORLD 0
+# define MPI_INTEGER8 0
+# define mpi_integer8 0
+# define MPI_ANY_SOURCE 0
+
+# include "field_io_mpi.F90"
+
+!-------------------------------------------------------------------------------
+subroutine mpi_type_vector(a, b, c, d, e, f)
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine mpi_type_commit(a, b)
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine mpi_type_free(a, b)
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine mpi_ssend(a, b, c, d, e, f, g)
+ call die("mpi_ssend(): MPI must not be called in single PE version")
+end
+
+!-------------------------------------------------------------------------------
+subroutine mpi_recv(a, b, c, d, e, f, g, h)
+ call die("mpi_recv(): MPI must not be called in single PE version")
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..88576ead59b4565ca3aa1703df73539b4617d109
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_mpi.F90
@@ -0,0 +1,33 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! pes_mpi.F90 - MPI version of shmem functions
+!
+!-------------------------------------------------------------------------------
+integer function my_pe()
+
+ implicit none
+ include 'mpif.h'
+ integer ierror
+
+ call mpi_comm_rank(MPI_COMM_WORLD, my_pe, ierror)
+end
+
+!-------------------------------------------------------------------------------
+integer function num_pes()
+
+ implicit none
+ include 'mpif.h'
+ integer ierror
+
+ call mpi_comm_size(MPI_COMM_WORLD, num_pes, ierror)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..ad919b808f4465cd4244abbbebf8c9987ae48195
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_single_pe.F90
@@ -0,0 +1,28 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! pes_single_pe.F90 - dummy routines for shmem functions
+!
+!-------------------------------------------------------------------------------
+integer function my_pe()
+ my_pe = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function num_pes()
+ num_pes = 1
+end
+
+!-------------------------------------------------------------------------------
+subroutine barrier()
+ return
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3cd8fe7adb0870c3854ae66ca2d43fd1d2884dca
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_mpi.F90
@@ -0,0 +1,74 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! reduction_mpi.F90 - reduction operations in MPI
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+function global_sum(local_sum)
+
+ implicit none
+ include 'mpif.h'
+ REAL global_sum, local_sum
+ integer ierror
+
+ TIMING_START(timing_bin_global_sum)
+
+ call mpi_allreduce(local_sum, global_sum, 1, &
+ BQCD_REAL, MPI_SUM, MPI_COMM_WORLD, ierror)
+
+ TIMING_STOP(timing_bin_global_sum)
+end
+
+!-------------------------------------------------------------------------------
+function global_min(local_min)
+
+ implicit none
+ include 'mpif.h'
+ real global_min, local_min
+ integer ierror
+
+ call mpi_allreduce(local_min, global_min, 1, &
+ MPI_REAL, MPI_MIN, MPI_COMM_WORLD, ierror)
+end
+
+!-------------------------------------------------------------------------------
+function global_max(local_max)
+
+ implicit none
+ include 'mpif.h'
+ real global_max, local_max
+ integer ierror
+
+ call mpi_allreduce(local_max, global_max, 1, &
+ MPI_REAL, MPI_MAX, MPI_COMM_WORLD, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine global_sum_vec(n, sum)
+
+ implicit none
+ include 'mpif.h'
+ integer, intent(in) :: n
+ REAL, intent(inout) :: sum(n)
+ REAL :: tmp(n)
+ integer ierror
+
+ TIMING_START(timing_bin_global_sum_vec)
+
+ tmp = sum
+ call mpi_allreduce(tmp, sum, n, BQCD_REAL, MPI_SUM, MPI_COMM_WORLD, ierror)
+
+ TIMING_STOP(timing_bin_global_sum_vec)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..6fb261cd1ddcc41a6a3616a437b6b19d0cba2227
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_shmem.F90
@@ -0,0 +1,118 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! reduction_shmem.F90 - reduction operations in shmem
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+function global_sum(local_sum)
+
+ implicit none
+ include 'mpp/shmem.fh'
+
+ REAL :: global_sum, local_sum
+ REAL, save :: source, target
+ integer :: n_pes
+ integer, external :: num_pes
+ REAL, save :: pWrk(2 + shmem_reduce_min_wrkdata_size)
+ integer, save :: pSync(shmem_reduce_sync_size)
+
+
+ TIMING_START(timing_bin_global_sum)
+
+ n_pes = num_pes()
+
+ if (n_pes == 1) then
+ global_sum = local_sum
+ return
+ endif
+
+ source = local_sum
+
+ call shmem_real8_sum_to_all(target, source, 1, 0, 0, n_pes, pWrk, pSync)
+
+ global_sum = target
+
+ TIMING_STOP(timing_bin_global_sum)
+end
+
+!-------------------------------------------------------------------------------
+function global_min(local_min)
+
+ implicit none
+ include 'mpp/shmem.fh'
+
+ real :: global_min, local_min
+ real, save :: source, target
+ integer :: n_pes
+ integer, external :: num_pes
+ real, save :: pWrk(2 + shmem_reduce_min_wrkdata_size)
+ integer, save :: pSync(shmem_reduce_sync_size)
+
+ n_pes = num_pes()
+
+ if (n_pes == 1) then
+ global_min = local_min
+ return
+ endif
+
+ source = local_min
+
+ call shmem_real8_min_to_all(target, source, 1, 0, 0, n_pes, pWrk, pSync)
+
+ global_min = target
+end
+
+!-------------------------------------------------------------------------------
+function global_max(local_max)
+
+ implicit none
+ include 'mpp/shmem.fh'
+
+ real :: global_max, local_max
+ real, save :: source, target
+ integer :: n_pes
+ integer, external :: num_pes
+ real, save :: pWrk(2 + shmem_reduce_min_wrkdata_size)
+ integer, save :: pSync(shmem_reduce_sync_size)
+
+ n_pes = num_pes()
+
+ if (n_pes == 1) then
+ global_max = local_max
+ return
+ endif
+
+ source = local_max
+
+ call shmem_real8_max_to_all(target, source, 1, 0, 0, n_pes, pWrk, pSync)
+
+ global_max = target
+end
+
+!-------------------------------------------------------------------------------
+subroutine global_sum_vec(n, sum)
+
+ implicit none
+
+ integer, intent(in) :: n
+ REAL, intent(inout) :: sum(n)
+
+ TIMING_START(timing_bin_global_sum_vec)
+
+ call die("global_sum_vec(): shmem version not implemented yet.")
+
+ TIMING_STOP(timing_bin_global_sum_vec)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..67bf6f99beb81aa5615da8aaf705be1931fd099d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_single_pe.F90
@@ -0,0 +1,52 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! reduction_single_pe.F90 - reduction operations on a single processor
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+function global_sum(local_sum)
+
+ implicit none
+ REAL :: global_sum, local_sum
+
+ global_sum = local_sum
+end
+
+!-------------------------------------------------------------------------------
+function global_min(local_min)
+
+ implicit none
+ real :: global_min, local_min
+
+ global_min = local_min
+end
+
+!-------------------------------------------------------------------------------
+function global_max(local_max)
+
+ implicit none
+ real :: global_max, local_max
+
+ global_max = local_max
+end
+
+!-------------------------------------------------------------------------------
+subroutine global_sum_vec(n, sum)
+
+ implicit none
+ integer, intent(in) :: n
+ REAL, intent(inout) :: sum(n)
+
+ return
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..762a65cb26987313de6193127f10f4c021f619be
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_mpi.F90
@@ -0,0 +1,48 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! seed_mpi.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine seed_broadcast(seed)
+
+ use module_function_decl
+ implicit none
+ include 'mpif.h'
+ SEED seed
+ integer ierror
+
+ call mpi_bcast(seed, 1, BQCD_SEED, 0, MPI_COMM_WORLD, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine seed_compare(seed)
+
+ use module_function_decl
+ implicit none
+ include 'mpif.h'
+ SEED seed, s
+ integer pe, status(MPI_STATUS_SIZE), ierror
+
+ if (my_pe() /= 0) then
+ call mpi_ssend(seed, 1, BQCD_SEED, 0, 0, MPI_COMM_WORLD, ierror)
+ else
+ do pe = 1, num_pes() - 1
+ call mpi_recv(s, 1, BQCD_SEED, pe, 0, MPI_COMM_WORLD, status, ierror)
+ if (s /= seed) call die('rancheck(): seeds differ')
+ enddo
+ endif
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..8d76b5cf36f131a7e1c7c1370fba0306471fe53b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_shmem.F90
@@ -0,0 +1,61 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! seed_shmem.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+subroutine seed_broadcast(seed)
+
+ use module_function_decl
+ implicit none
+ include "mpp/shmem.fh"
+
+ SEED :: seed
+ SEED, save :: s
+ integer, save :: psync(SHMEM_BCAST_SYNC_SIZE)
+
+ psync = SHMEM_SYNC_VALUE
+ s = seed
+
+ call barrier()
+ call shmem_broadcast(s, s, 1, 0, 0, 0, num_pes(), psync)
+ call barrier()
+
+ seed = s
+end
+
+!-------------------------------------------------------------------------------
+subroutine seed_compare(seed)
+
+ use module_function_decl
+ implicit none
+
+ SEED :: seed
+ SEED, save :: s
+ integer :: pe
+
+ s = seed
+ call barrier()
+
+ if (my_pe() == 0) then
+ do pe = 1, num_pes() - 1
+ call shmem_get(s, s, 1, pe)
+ if (s /= seed) call die('rancheck(): seeds differ')
+ enddo
+ endif
+
+ call barrier()
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..188f7e7af78b1d2fb0c776784216f8459837456a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_single_pe.F90
@@ -0,0 +1,34 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! seed_single_pe.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine seed_broadcast(seed)
+
+ implicit none
+ SEED seed
+
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine seed_compare(seed)
+
+ implicit none
+ SEED seed
+
+ return
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/shmem.h b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/shmem.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6289299e7c9bd847cbb0e3f48eb7f8083dcedbd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/shmem.h
@@ -0,0 +1,7 @@
+#ifdef ALTIX
+# define barrier shmem_barrier_all
+# define shmem_broadcast shmem_broadcast8
+# define shmem_get shmem_get8
+# define shmem_put shmem_put8
+# define shpalloc(addr, length, errcode, abort) shpalloc(addr, 2 * (length), errcode, abort)
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..51ca5e2eb0e696aa5e62fd8aceae61334a2de5f4
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_mpi.F90
@@ -0,0 +1,816 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! xbound_mpi.F90 - boundary exchange with MPI
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_xbound
+
+ implicit none
+ integer, parameter :: max_bound = 3 * 3 * 3 * 3
+
+ type type_xbound
+ integer :: i_source
+ integer :: i_target
+ integer :: pe_source
+ integer :: pe_target
+ integer :: size ! total size
+ integer :: vector_type
+ integer :: block_count
+ integer :: block_size
+ integer :: block_stride
+ end type type_xbound
+end
+
+!-------------------------------------------------------------------------------
+module module_xbound_g
+
+ use module_xbound
+ implicit none
+ integer, save :: n_bound = 0
+ type (type_xbound), save :: b(max_bound)
+end
+
+!-------------------------------------------------------------------------------
+module module_xbound_sc
+
+ use module_xbound
+ implicit none
+ integer, save :: n_bound = 0
+ integer, save :: i_bound(DIM, FWD:BWD)
+ type (type_xbound), save :: b(max_bound)
+end
+
+!-------------------------------------------------------------------------------
+module module_xbound_sc2
+
+ use module_xbound
+ implicit none
+ integer, save :: n_bound = 0
+ integer, save :: i_bound(DIM, FWD:BWD)
+ type (type_xbound), save :: b(max_bound)
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound()
+
+ implicit none
+ integer, external :: version_of_d
+
+ call init_xbound_g()
+ call init_xbound_sc()
+ call init_xbound_sc2()
+ if (version_of_d() == 3) call init_xbound_d3()
+ if (version_of_d() == 31) call init_xbound_d3()
+ if (version_of_d() == 4) call init_xbound_d4()
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound_g()
+
+ use module_xbound_g
+ implicit none
+
+ integer :: x, y, z, t
+
+ do t = -1, 1
+ do z = -1, 1
+ do y = -1, 1
+ do x = -1, 1
+ call init_xch_bound(n_bound, b, NCOL * NCOL * SIZE_COMPLEX, x, y, z, t)
+ enddo
+ enddo
+ enddo
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound_sc()
+
+ use module_xbound_sc
+ use module_lattice
+ implicit none
+
+ integer :: mu, block
+
+
+ block = NDIRAC * NCOL * SIZE_COMPLEX
+
+ call init_xch_bound(n_bound, b, block, -1,0,0,0); i_bound(1, BWD) = n_bound
+ call init_xch_bound(n_bound, b, block, +1,0,0,0); i_bound(1, FWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,-1,0,0); i_bound(2, BWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,+1,0,0); i_bound(2, FWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,0,-1,0); i_bound(3, BWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,0,+1,0); i_bound(3, FWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,0,0,-1); i_bound(4, BWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,0,0,+1); i_bound(4, FWD) = n_bound
+
+ do mu = 1, DIM
+ if (npe(mu) == 1) then
+ i_bound(mu, FWD) = 0
+ i_bound(mu, BWD) = 0
+ endif
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound_sc2()
+
+ use module_xbound_sc2
+ use module_lattice
+ implicit none
+
+ integer :: mu, block
+
+
+ block = 2 * NCOL * SIZE_COMPLEX
+
+ call init_xch_bound(n_bound, b, block, -1,0,0,0); i_bound(1, BWD) = n_bound
+ call init_xch_bound(n_bound, b, block, +1,0,0,0); i_bound(1, FWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,-1,0,0); i_bound(2, BWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,+1,0,0); i_bound(2, FWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,0,-1,0); i_bound(3, BWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,0,+1,0); i_bound(3, FWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,0,0,-1); i_bound(4, BWD) = n_bound
+ call init_xch_bound(n_bound, b, block, 0,0,0,+1); i_bound(4, FWD) = n_bound
+
+ do mu = 1, DIM
+ if (npe(mu) == 1) then
+ i_bound(mu, FWD) = 0
+ i_bound(mu, BWD) = 0
+ endif
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xch_bound(n_bound, b, block_size, xx, yy, zz, tt)
+
+ use module_xbound
+ use module_function_decl
+ use module_nnpe
+ use module_offset
+ use module_lattice
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer, intent(inout) :: n_bound
+ type (type_xbound), intent(inout) :: b(max_bound)
+ integer, intent(in) :: block_size, xx, yy, zz, tt
+
+ integer, dimension (DIM) :: dir, m, i, target, source
+ integer, external :: xyzt2i, n_sites, e_o
+ integer :: x, y, z, t, size, mu, stride, block_count, ierror
+ integer :: tmp_type1, tmp_type2, the_type
+ integer(MPI_ADDRESS_KIND):: true_lb, true_extent
+ integer :: extent
+
+ logical :: special
+
+
+ if (nnpe(xx, yy, zz, tt) == my_pe()) return
+
+ if (xx /= 0 .and. yy == 0 .and. zz /= 0 .and. tt == 0) then
+ special = .true.
+ else
+ special = .false.
+ endif
+
+
+ dir = (/ xx, yy, zz, tt /)
+
+ do mu = 1, DIM
+ if (dir(mu) /= 0) then
+ m(mu) = 1
+ else
+ m(mu) = NH(mu)
+ endif
+
+ if (dir(mu) == -1) then
+ target(mu) = -1
+ source(mu) = N(mu) - 1
+ elseif (dir(mu) == +1) then
+ target(mu) = N(mu)
+ source(mu) = 0
+ else
+ target(mu) = 0
+ source(mu) = 0
+ endif
+ enddo
+
+
+ size = block_size
+ do mu = 1, DIM
+ if (dir(mu) == 0) then
+ size = size * NH(mu)
+ m(mu) = 1
+ else
+ exit
+ endif
+ enddo
+
+ stride = block_size
+ do mu = 1, DIM
+ if (m(mu) == 1) then
+ stride = stride * NH(mu)
+ else
+ exit
+ endif
+ enddo
+
+ block_count = 1
+ do mu = 1, DIM
+ block_count = block_count * m(mu)
+ enddo
+
+ n_bound = n_bound + 1
+ ASSERT(n_bound <= max_bound)
+
+ if (special) then ! (y,t)-plane
+
+ ! MPY-type for y-line:
+
+ block_count = NY
+ size = block_size
+ stride = block_size * NXH
+
+ call mpi_type_vector(block_count, size, stride, BQCD_REAL, tmp_type1, ierror)
+ call mpi_type_commit(tmp_type1, ierror)
+
+
+#ifdef ALTIX
+ ! use MPI-1
+ call mpi_type_extent(BQCD_REAL, extent, ierror)
+ call mpi_type_struct(2, (/1, 1/), (/0, extent/), (/tmp_type1, MPI_UB/), &
+ tmp_type2, ierror)
+#else
+ ! use MPI-2
+ call mpi_type_get_true_extent(BQCD_REAL, true_lb, true_extent, ierror)
+ call mpi_type_create_resized(tmp_type1, true_lb, true_extent, tmp_type2, ierror)
+#endif
+ call mpi_type_commit(tmp_type2, ierror)
+
+ ! MPI-parameters for (y,t)-plane:
+
+ block_count = NT
+ size = 1
+ stride = block_size * NXH * NY * NZ
+ the_type = tmp_type2
+ b(n_bound)%size = block_size * NY * NT
+
+ else
+
+ the_type = BQCD_REAL
+ b(n_bound)%size = block_count * size
+
+ endif
+
+
+ b(n_bound)%i_source = xyzt2i(source)
+ b(n_bound)%i_target = xyzt2i(target)
+ b(n_bound)%pe_source = nnpe(xx, yy, zz, tt)
+ b(n_bound)%pe_target = nnpe(-xx, -yy, -zz, -tt)
+
+ b(n_bound)%block_count = block_count
+ b(n_bound)%block_size = size
+ b(n_bound)%block_stride= stride
+
+ call mpi_type_vector(block_count, size, stride, the_type, &
+ b(n_bound)%vector_type, ierror)
+ call mpi_type_commit(b(n_bound)%vector_type, ierror)
+
+ !!if ( my_pe() == 0) write(6,*) xx,yy,zz,tt, block_count, size, stride
+ !!if ( my_pe() == 0) write(6,*) xx,yy,zz,tt, b(n_bound)%i_source, b(n_bound)%i_target, nnpe(xx,yy,zz,tt), my_pe()
+
+ !!ASSERT(b(n_bound)%size == block_size * n_sites(DIM, dir, NH, NPE))
+
+ if (special) then
+ call mpi_type_free(tmp_type1, ierror)
+ call mpi_type_free(tmp_type2, ierror)
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g_field(u)
+
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD :: u
+ integer :: mu, eo, x, y, z, t
+
+ if (num_pes() == 1) return
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ call xbound_g(u, eo, mu)
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g(u, eo, mu)
+
+ use module_xbound_g
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: eo, mu, i, status(MPI_STATUS_SIZE), ierror
+ GAUGE_FIELD :: u
+
+ if (num_pes() == 1) return
+
+ do i = 1, n_bound
+ call mpi_sendrecv( &
+ u(1,1, b(i)%i_source, eo,mu), 1, b(i)%vector_type, b(i)%pe_target, 0,&
+ u(1,1, b(i)%i_target, eo,mu), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,&
+ MPI_COMM_WORLD, status, ierror)
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc_field(a)
+
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: i, status(MPI_STATUS_SIZE), ierror
+ integer :: mu, fb
+ SPINCOL_FIELD :: a
+
+ if (num_pes() == 1) return
+
+ do mu = 1, DIM
+ call xbound_sc(a, mu)
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc(a, direction)
+
+ use module_xbound_sc
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: i, status(MPI_STATUS_SIZE), ierror
+ integer :: direction, mu, fb
+ SPINCOL_FIELD :: a
+
+ if (num_pes() == 1) return
+
+ mu = direction
+ do fb = FWD, BWD
+ i = i_bound(mu, fb)
+ if (i /= 0) then
+ call mpi_sendrecv( &
+ a(1,1, b(i)%i_source), 1, b(i)%vector_type, b(i)%pe_target, 0,&
+ a(1,1, b(i)%i_target), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,&
+ MPI_COMM_WORLD, status, ierror)
+ endif
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2_field(a)
+
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ SC2_FIELD :: a
+ integer :: mu
+
+ if (num_pes() == 1) return
+
+ do mu = 1, DIM
+ call xbound_sc2(a, mu)
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2_field_i(a) ! "i"mmediate MPI calls
+
+ use module_xbound_sc2
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ SC2_FIELD :: a
+
+ integer, parameter :: max_request = 2 * 2 * DIM
+
+ integer :: request(max_request), status(MPI_STATUS_SIZE, max_request), ierror
+ integer :: mu, fb, i, n_request
+
+ if (num_pes() == 1) return
+
+ n_request = 0
+
+ do mu = 1, DIM
+ do fb = FWD, BWD
+ i = i_bound(mu, fb)
+ if (i /= 0) then
+ n_request = n_request + 1
+ call mpi_irecv( &
+ a(1,1, b(i)%i_target, mu,fb), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,&
+ MPI_COMM_WORLD, request(n_request), ierror)
+ endif
+ enddo
+ enddo
+
+ do mu = 1, DIM
+ do fb = FWD, BWD
+ i = i_bound(mu, fb)
+ if (i /= 0) then
+ n_request = n_request + 1
+ call mpi_isend( &
+ a(1,1, b(i)%i_source, mu,fb), 1, b(i)%vector_type, b(i)%pe_target, 0,&
+ MPI_COMM_WORLD, request(n_request), ierror)
+ endif
+ enddo
+ enddo
+
+ call mpi_waitall(n_request, request, status, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2(a, direction)
+
+ use module_xbound_sc2
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: i, status(MPI_STATUS_SIZE), ierror
+ integer :: direction, mu, fb
+ SC2_FIELD :: a
+
+ if (num_pes() == 1) return
+
+ mu = direction
+ do fb = FWD, BWD
+ i = i_bound(mu, fb)
+ if (i /= 0) then
+ call mpi_sendrecv( &
+ a(1,1, b(i)%i_source, mu,fb), 1, b(i)%vector_type, b(i)%pe_target, 0,&
+ a(1,1, b(i)%i_target, mu,fb), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,&
+ MPI_COMM_WORLD, status, ierror)
+ endif
+ enddo
+end
+
+!===============================================================================
+!
+! new stuff for libd3
+!
+!-------------------------------------------------------------------------------
+module module_xbound_d3
+
+ use module_xbound
+ implicit none
+ integer, save :: n_bound(DIM)
+ type (type_xbound), save :: b(2, DIM)
+
+ integer, save :: xch_yf = 0
+ integer, save :: xch_yb = 0
+ integer, save :: xch_zf = 0
+ integer, save :: xch_zb = 0
+ integer, save :: xch_tf = 0
+ integer, save :: xch_tb = 0
+#ifdef D3_BUFFER_VOL
+ integer, parameter :: d3_buffer_vol = D3_BUFFER_VOL
+#else
+ integer, parameter :: d3_buffer_vol = 0
+#endif
+
+ type (type_xbound), save :: byf, byb, bzf, bzb, btf, btb
+
+ ! allocate buffer for MPI in static memory to speed-up communication on SR8000
+ integer, parameter :: max_buffer = NDIRAC*NCOL*SIZE_COMPLEX*d3_buffer_vol
+
+ REAL, dimension (max_buffer), save :: buffer_yf, buffer_yb, &
+ buffer_zf, buffer_zb
+end
+
+!-------------------------------------------------------------------------------
+integer function get_d3_buffer_vol()
+
+ use module_xbound_d3
+ implicit none
+
+ get_d3_buffer_vol = d3_buffer_vol
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound_d3()
+
+ use module_lattice
+ use module_xbound_d3
+ implicit none
+
+ if (npe(1) /= 1) call die("init_xbound_d3(): npe(1) /= 1")
+
+ n_bound = 0
+
+ call init_xch_bound(n_bound(2), b(1,2), NDIRAC*NCOL*SIZE_COMPLEX, 0, 1,0,0)
+ call init_xch_bound(n_bound(2), b(1,2), NDIRAC*NCOL*SIZE_COMPLEX, 0,-1,0,0)
+
+ call init_xch_bound(n_bound(3), b(1,3), NDIRAC*NCOL*SIZE_COMPLEX, 0,0, 1,0)
+ call init_xch_bound(n_bound(3), b(1,3), NDIRAC*NCOL*SIZE_COMPLEX, 0,0,-1,0)
+
+ call init_xch_bound(n_bound(4), b(1,4), NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0, 1)
+ call init_xch_bound(n_bound(4), b(1,4), NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0,-1)
+
+ call init_xch_bound(xch_yf, byf, NDIRAC*NCOL*SIZE_COMPLEX, 0, 1,0,0)
+ call init_xch_bound(xch_yb, byb, NDIRAC*NCOL*SIZE_COMPLEX, 0,-1,0,0)
+
+ call init_xch_bound(xch_zf, bzf, NDIRAC*NCOL*SIZE_COMPLEX, 0,0, 1,0)
+ call init_xch_bound(xch_zb, bzb, NDIRAC*NCOL*SIZE_COMPLEX, 0,0,-1,0)
+
+ call init_xch_bound(xch_tf, btf, NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0, 1)
+ call init_xch_bound(xch_tb, btb, NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0,-1)
+
+ !!if (xch_yf /= 0) allocate(buffer_yf(byf%size))
+ !!if (xch_yb /= 0) allocate(buffer_yb(byb%size))
+ !!if (xch_zf /= 0) allocate(buffer_zf(bzf%size))
+ !!if (xch_zb /= 0) allocate(buffer_zb(bzb%size))
+
+ if (byf%size > max_buffer) call die("init_xbound_d3(): byf%size")
+ if (byb%size > max_buffer) call die("init_xbound_d3(): byb%size")
+ if (bzf%size > max_buffer) call die("init_xbound_d3(): bzf%size")
+ if (bzb%size > max_buffer) call die("init_xbound_d3(): bzb%size")
+
+ if (xch_yf /= xch_yb) call die("init_xbound_d3(): xch_y")
+ if (xch_zf /= xch_zb) call die("init_xbound_d3(): xch_z")
+ if (byf%block_size /= byb%block_size) call die("init_xbound_d3(): size")
+ if (byf%block_count /= byb%block_count) call die("init_xbound_d3(): count")
+ if (byf%block_stride /= byb%block_stride) call die("init_xbound_d3(): stride")
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_d3(a, direction)
+
+ use module_xbound_d3
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: i, status(MPI_STATUS_SIZE), ierror, direction, d
+ SPINCOL_FIELD :: a
+
+ if (num_pes() == 1) return
+
+ d = direction
+
+ do i = 1, n_bound(d)
+ call mpi_sendrecv( &
+ a(1, 1, b(i,d)%i_source), 1, b(i,d)%vector_type, b(i,d)%pe_target,0,&
+ a(1, 1, b(i,d)%i_target), b(i,d)%size, BQCD_REAL, b(i,d)%pe_source,0,&
+ MPI_COMM_WORLD, status, ierror)
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_yf(a)
+
+ use module_xbound_d3
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: i, status(MPI_STATUS_SIZE), ierror
+ SPINCOL_FIELD :: a
+
+ if (num_pes() == 1 .or. xch_yf == 0) return
+
+ call mpi_sendrecv( &
+ a(1, 1, byf%i_source), 1, byf%vector_type, byf%pe_target,1,&
+ a(1, 1, byf%i_target), byf%size, BQCD_REAL, byf%pe_source,1,&
+ MPI_COMM_WORLD, status, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_yb(a)
+
+ use module_xbound_d3
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: i, status(MPI_STATUS_SIZE), ierror
+ SPINCOL_FIELD :: a
+
+ if (num_pes() == 1 .or. xch_yb == 0) return
+
+ call mpi_sendrecv( &
+ a(1, 1, byb%i_source), 1, byb%vector_type, byb%pe_target,2,&
+ a(1, 1, byb%i_target), byb%size, BQCD_REAL, byb%pe_source,2,&
+ MPI_COMM_WORLD, status, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_fill_buffer_y(a)
+
+ use module_xbound_d3
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ REAL :: a(*)
+ integer :: i, j, off_af, off_ab, off_b
+ integer :: count, size, stride, start_af, start_ab
+
+ if (num_pes() == 1 .or. xch_yf == 0) return
+
+ !$omp parallel private(i, j, off_af, off_ab, off_b, start_af, start_ab, &
+ !$omp count, size, stride)
+ start_af = (byf%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1
+ start_ab = (byb%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1
+
+ count = byf%block_count
+ size = byf%block_size
+ stride = byf%block_stride
+
+ !$omp do
+ do i = 0, count - 1
+ off_af = start_af + i * stride
+ off_ab = start_ab + i * stride
+ off_b = i * size + 1
+ do j = 0, size - 1
+ buffer_yf(off_b + j) = a(off_af + j)
+ buffer_yb(off_b + j) = a(off_ab + j)
+ enddo
+ enddo
+ !$omp end parallel
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_copy_buffer_y(a)
+
+ use module_xbound_d3
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: status(MPI_STATUS_SIZE), ierror
+ SPINCOL_FIELD :: a
+
+ if (num_pes() == 1 .or. xch_yf == 0) return
+
+ call mpi_sendrecv( &
+ buffer_yf(1), byf%size, BQCD_REAL, byf%pe_target, 0,&
+ a(1, 1, byf%i_target), byf%size, BQCD_REAL, byf%pe_source, 0,&
+ MPI_COMM_WORLD, status, ierror)
+
+ call mpi_sendrecv( &
+ buffer_yb(1), byb%size, BQCD_REAL, byb%pe_target, 0,&
+ a(1, 1, byb%i_target), byb%size, BQCD_REAL, byb%pe_source, 0,&
+ MPI_COMM_WORLD, status, ierror)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_fill_buffer_z(a)
+
+ use module_xbound_d3
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ REAL :: a(*)
+ integer :: i, j, off_af, off_ab, off_b
+ integer :: count, size, stride, start_af, start_ab
+
+ if (num_pes() == 1 .or. xch_zf == 0) return
+
+ !$omp parallel private(i, j, off_af, off_ab, off_b, start_af, start_ab, &
+ !$omp count, size, stride)
+ start_af = (bzf%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1
+ start_ab = (bzb%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1
+
+ count = bzf%block_count
+ size = bzf%block_size
+ stride = bzf%block_stride
+
+ !$omp do
+ do i = 0, count - 1
+ off_af = start_af + i * stride
+ off_ab = start_ab + i * stride
+ off_b = i * size + 1
+ do j = 0, size - 1
+ buffer_zf(off_b + j) = a(off_af + j)
+ buffer_zb(off_b + j) = a(off_ab + j)
+ enddo
+ enddo
+ !$omp end parallel
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_copy_buffer_z(a)
+
+ use module_xbound_d3
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: status(MPI_STATUS_SIZE), ierror
+ SPINCOL_FIELD :: a
+
+ if (num_pes() == 1 .or. xch_zf == 0) return
+
+ call mpi_sendrecv( &
+ buffer_zf(1), bzf%size, BQCD_REAL, bzf%pe_target, 0,&
+ a(1, 1, bzf%i_target), bzf%size, BQCD_REAL, bzf%pe_source, 0,&
+ MPI_COMM_WORLD, status, ierror)
+
+ call mpi_sendrecv( &
+ buffer_zb(1), bzb%size, BQCD_REAL, bzb%pe_target, 0,&
+ a(1, 1, bzb%i_target), bzb%size, BQCD_REAL, bzb%pe_source, 0,&
+ MPI_COMM_WORLD, status, ierror)
+
+end
+
+!===============================================================================
+!
+! stuff for lib_d4
+!
+!-------------------------------------------------------------------------------
+subroutine init_xbound_d4()
+
+ use module_lattice
+ implicit none
+
+ if (npe(1) /= 1) call die("init_xbound_d4(): npe(1) /= 1")
+ if (npe(2) /= 1) call die("init_xbound_d4(): npe(2) /= 1")
+ if (npe(3) /= 1) call die("init_xbound_d4(): npe(3) /= 1")
+
+ call init_xbound_d3()
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_tf(a)
+
+ use module_xbound_d3
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: i, status(MPI_STATUS_SIZE), ierror
+ SPINCOL_FIELD :: a
+
+ if (num_pes() == 1 .or. xch_tf == 0) return
+
+ call mpi_sendrecv( &
+ a(1, 1, btf%i_source), 1, btf%vector_type, btf%pe_target,1,&
+ a(1, 1, btf%i_target), btf%size, BQCD_REAL, btf%pe_source,1,&
+ MPI_COMM_WORLD, status, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_tb(a)
+
+ use module_xbound_d3
+ use module_function_decl
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: i, status(MPI_STATUS_SIZE), ierror
+ SPINCOL_FIELD :: a
+
+ if (num_pes() == 1 .or. xch_tb == 0) return
+
+ call mpi_sendrecv( &
+ a(1, 1, btb%i_source), 1, btb%vector_type, btb%pe_target,2,&
+ a(1, 1, btb%i_target), btb%size, BQCD_REAL, btb%pe_source,2,&
+ MPI_COMM_WORLD, status, ierror)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3cff78a725a096e8d19e47bd531e6789c347f6bd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_shmem.F90
@@ -0,0 +1,208 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! xbound_shmem.F90 - boundary exchange with shmem
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound()
+
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g(u, eo, mu)
+
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ integer :: eo, mu, x, y, z, t
+ GAUGE_FIELD :: u
+
+ if (num_pes() == 1) return
+
+ call barrier()
+
+ do t = -1, 1
+ do z = -1, 1
+ do y = -1, 1
+ do x = -1, 1
+ call xch_bound(NCOL * NCOL * SIZE_COMPLEX, u(1, 1, 1, eo, mu), x, y, z, t)
+ call barrier()
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g_field(u)
+
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD :: u
+ integer :: mu, eo, x, y, z, t
+
+ if (num_pes() == 1) return
+
+ call barrier()
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ call xbound_g(u, eo, mu)
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc_field(array)
+
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ SPINCOL_FIELD :: array
+ integer :: x, y, z, t
+
+ if (num_pes() == 1) return
+
+ call barrier()
+
+ do t = -1, 1
+ do z = -1, 1
+ do y = -1, 1
+ do x = -1, 1
+ if ((abs(x) + abs(y) + abs(z) + abs(t)) == 1) then
+ call xch_bound(NDIRAC * NCOL * SIZE_COMPLEX, array, x, y, z, t)
+ call barrier()
+ endif
+ enddo
+ enddo
+ enddo
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2_field_i(a)
+
+ use module_function_decl
+ use module_lattice
+ use module_vol
+ implicit none
+
+ SC2_FIELD :: a
+ integer :: x, y, z, t
+
+ if (num_pes() == 1) return
+
+ call barrier()
+
+ call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 1, FWD), +1, 0, 0, 0)
+ call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 1, BWD), -1, 0, 0, 0)
+ call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 2, FWD), 0, +1, 0, 0)
+ call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 2, BWD), 0, -1, 0, 0)
+ call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 3, FWD), 0, 0, +1, 0)
+ call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 3, BWD), 0, 0, -1, 0)
+ call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 4, FWD), 0, 0, 0, +1)
+ call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 4, BWD), 0, 0, 0, -1)
+
+ call barrier()
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xch_bound(mm, array, xx, yy, zz, tt)
+
+ use module_function_decl
+ use module_nnpe
+ use module_offset
+ use module_lattice
+ use module_vol
+ implicit none
+ include 'mpif.h'
+
+ integer :: mm
+ REAL, dimension (mm, volh_tot) :: array
+ integer, dimension (DIM) :: dir, m, i, target, source
+ integer, external :: xyzt2i
+ integer :: xx, yy, zz, tt, x, y, z, t, pe, size, mu
+
+ pe = nnpe(xx, yy, zz, tt)
+ if (pe == my_pe()) return
+
+ dir = (/ xx, yy, zz, tt /)
+
+ do mu = 1, DIM
+ if (dir(mu) /= 0) then
+ m(mu) = 1
+ else
+ m(mu) = NH(mu)
+ endif
+ enddo
+
+ size = mm
+ if (dir(1) == 0) then
+ size = size * NXH
+ m(1) = 1
+ if (dir(2) == 0) then
+ size = size * N(2)
+ m(2) = 1
+ if (dir(3) == 0) then
+ size = size * N(3)
+ m(3) = 1
+ if (dir(4) == 0) then
+ size = size * N(4)
+ m(4) = 1
+ endif
+ endif
+ endif
+ endif
+
+ do t = 0, m(4) - 1
+ do z = 0, m(3) - 1
+ do y = 0, m(2) - 1
+ do x = 0, m(1) - 1
+
+ i = (/ x, y, z, t /)
+
+ do mu = 1, DIM
+ if (dir(mu) == -1) then
+ target(mu) = -1
+ source(mu) = N(mu) - 1
+ elseif (dir(mu) == +1) then
+ target(mu) = N(mu)
+ source(mu) = 0
+ else
+ target(mu) = i(mu)
+ source(mu) = i(mu)
+ endif
+ enddo
+
+!!! call shmem_get(array(1, xyzt2i(target)), &
+!!! array(1, xyzt2i(source)), size, pe)
+ call shmem_put(array(1, xyzt2i(target)), &
+ array(1, xyzt2i(source)), size, nnpe(-xx,-yy,-zz,-tt))
+
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..5fab9e1eeb57377b0ef2d1ecbd9de848a5a7fc3d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_single_pe.F90
@@ -0,0 +1,72 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! xbound_single_pe.F90 - dummy routines for boundary exchange
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound()
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g(u, eo, mu)
+
+ use module_vol
+ implicit none
+ integer :: eo, mu
+ GAUGE_FIELD :: u
+
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g_field(u)
+
+ use module_vol
+ implicit none
+ GAUGE_FIELD :: u
+
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc_field(array)
+
+ use module_vol
+ implicit none
+ SPINCOL_FIELD :: array
+
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2_field(array)
+
+ use module_vol
+ implicit none
+ SC2_FIELD :: array
+
+ return
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2_field_i(array)
+
+ use module_vol
+ implicit none
+ SC2_FIELD :: array
+
+ return
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_test.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_test.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f76c76cbc1fb12d5daa003f5e98b97ac16425a47
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_test.F90
@@ -0,0 +1,177 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! xbound_test.F90 - test of xbound_g()
+! all possible dimensions must be decomposed
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine xbound_test()
+
+ use module_function_decl
+ use module_lattice
+ use module_vol
+ implicit none
+ GAUGE_FIELD :: u, v
+ integer :: x, y, z, t, j(4), i, eo, global(4)
+ integer :: is_bound_x, is_bound_y, is_bound_z, is_bound_t
+ integer :: dx, dy, dz, dt
+ integer, external :: xyzt2i, e_o
+ character(16) :: status
+
+!! call conf_zero(u)
+!!
+!! do i = 1, volh
+!! u(1, 1, i, EVEN, 1) = 123
+!! u(1, 1, i, ODD, 1) = 789
+!! enddo
+!!
+!! call xbound_g(u, EVEN, 1)
+!! call xbound_g(u, ODD, 1)
+!!
+!! do i = 1, volh_tot
+!! ASSERT(u(1, 1, i, EVEN, 1) == 123)
+!! ASSERT(u(1, 1, i, ODD, 1) == 789)
+!! enddo
+!!
+!!!-----------------------------------------------
+ call conf_zero(u)
+
+ u = cmplx(12345.0, 67890.0)
+
+ call open_diag()
+
+ do t = 0, NT - 1
+ do z = 0, NZ - 1
+ do y = 0, NY - 1
+ do x = 0, NX - 1
+ j = (/x, y, z, t/)
+
+ i = xyzt2i(j)
+ eo = e_o(j)
+
+ call local2global(my_pe(), j, global)
+
+ u(1, 1, i, eo, 1) = global(1)
+ u(2, 2, i, eo, 1) = global(2)
+ u(3, 3, i, eo, 1) = global(3)
+ u(1, 2, i, eo, 1) = global(4)
+
+ !!write(UDIAG, "(4i6, 2i8)") j, i, eo
+
+ !!write(UDIAG, "(10i6)") j, global, i, eo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ call xbound_g(u, EVEN, 1)
+ call xbound_g(u, ODD, 1)
+
+ write(UDIAG,*)
+ write(UDIAG,*)
+
+ !!ASSERT(e_o((/0,0,0,0/)) == 0)
+
+
+ do i = 1, volh_tot
+ do eo = EVEN, ODD
+ write(UDIAG, "(a,2i6,4f8.1)") "alles: ", i, eo, &
+ real(u(1, 1, i, eo, 1)), &
+ real(u(2, 2, i, eo, 1)), &
+ real(u(3, 3, i, eo, 1)), &
+ real(u(1, 2, i, eo, 1))
+ enddo
+ enddo
+
+ write(UDIAG,*)
+ write(UDIAG,*)
+
+ do t = -1, NT
+ is_bound_t = 0
+ if (t == -1) is_bound_t = 1
+ if (t == NT) is_bound_t = 1
+ do z = -1, NZ
+ is_bound_z = 0
+ if (z == -1) is_bound_z = 1
+ if (z == NZ) is_bound_z = 1
+ do y = -1, NY
+ is_bound_y = 0
+ if (y == -1) is_bound_y = 1
+ if (y == NY) is_bound_y = 1
+ do x = -1, NX
+ is_bound_x = 0
+ if (x == -1) is_bound_x = 1
+ if (x == NX) is_bound_x = 1
+
+ if (is_bound_x + is_bound_y + is_bound_z + is_bound_t <= 2) then
+
+
+!! do x = NX, NX
+
+ j = (/x, y, z, t/)
+ j = (/x, y, z, t/)
+
+ i = xyzt2i(j)
+ eo = e_o(j)
+
+ call local2global(my_pe(), j, global)
+
+ dx = -is_bound_x; if (x == NX) dx = 1
+ dy = -is_bound_y; if (y == NY) dy = 1
+ dz = -is_bound_z; if (z == NZ) dz = 1
+ dt = -is_bound_t; if (t == NT) dt = 1
+
+ if (u(1, 1, i, eo, 1) == global(1) .and. &
+ u(2, 2, i, eo, 1) == global(2) .and. &
+ u(3, 3, i, eo, 1) == global(3) .and. &
+ u(1, 2, i, eo, 1) == global(4)) then
+ status = " okay"
+ !!elseif (u(1, 1, i, eo, 1) == global(1) + dx .and. &
+ !! u(2, 2, i, eo, 1) == global(2) + dy .and. &
+ !! u(3, 3, i, eo, 1) == global(3) + dz .and. &
+ !! u(1, 2, i, eo, 1) == global(4) + dt) then
+ !! status = " okay2"
+ else
+ status = ""
+
+ dx = int(u(1, 1, i, eo, 1)) - global(1)
+ dy = int(u(2, 2, i, eo, 1)) - global(2)
+ dz = int(u(3, 3, i, eo, 1)) - global(3)
+ dt = int(u(1, 2, i, eo, 1)) - global(4)
+
+ write(status,"(a,4i3,a)") " (", dx, dy, dz, dt, ")"
+ endif
+
+
+ !!if (eo == 0) write(UDIAG, "(4i6, 2x, 4i6, i8, 2i3)") j, i, eo
+ !write(UDIAG, "(10i6)") j, global, i, eo
+ !!ASSERT(eo == mod(4+x+y+z+t, 2))
+
+ write(UDIAG, "(10i6,4f8.1,a)") j, global, i, eo, &
+ real(u(1, 1, i, eo, 1)), &
+ real(u(2, 2, i, eo, 1)), &
+ real(u(3, 3, i, eo, 1)), &
+ real(u(1, 2, i, eo, 1)), status
+
+ ASSERT(u(1, 1, i, eo, 1) == global(1))
+ ASSERT(u(2, 2, i, eo, 1) == global(2))
+ ASSERT(u(3, 3, i, eo, 1) == global(3))
+ ASSERT(u(1, 2, i, eo, 1) == global(4))
+
+ endif
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/conf.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/conf.F90
new file mode 100644
index 0000000000000000000000000000000000000000..94f937766b843e369c832555630c8e04fe1cb717
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/conf.F90
@@ -0,0 +1,461 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! conf.F90 - operations on gauge field and pseudo fermion field configurations
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine init_confs(para, conf)
+
+ use typedef_para
+ use module_p_interface
+ use module_switches
+ implicit none
+
+ type(type_para) :: para
+ type(hmc_conf), dimension(MAX_TEMPER) :: conf
+ integer :: i
+
+ do i = 1, para%n_temper
+ call allocate_g_field(conf(i)%u)
+ call allocate_sc_field(conf(i)%phi)
+ if (switches%hasenbusch) call allocate_sc_field(conf(i)%phi2)
+ if (para%start == START_HOT .or. para%start == START_COLD) then
+ call init_u(conf(i)%u, para%start)
+ conf(i)%former = i
+ endif
+ enddo
+
+ if (para%start == START_CONT) call conf_read(.true., para, conf)
+ if (para%start == START_FILE) call conf_read(.false., para, conf)
+
+ do i = 1, para%n_temper
+ if (para%hmc(i)%csw_kappa /= ZERO) then
+ call allocate_clover_field_a(conf(i)%a)
+ call allocate_clover_field_a(conf(i)%i)
+ call allocate_clover_field_b(conf(i)%b)
+ call clover_init(conf(i)%a, conf(i)%i, conf(i)%b, &
+ conf(i)%u, para%hmc(i)%csw_kappa)
+ endif
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_u(u, start) ! initialization of u-field (at trajectory 0)
+
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD :: u
+ integer :: start
+
+ select case (start)
+ case (START_HOT)
+ call conf_hot(u)
+ case (START_COLD)
+ call conf_cold(u)
+ case default
+ call die("init_u(): don't know how to start")
+ end select
+
+ call xbound_g_field(u)
+end
+
+!-------------------------------------------------------------------------------
+subroutine conf_check(u) ! checks if u-field is SU(3)
+
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD, intent(in) :: u
+ SU3 :: v
+ SU3, parameter :: su3_one = reshape( &
+ (/ ONE,ZERO,ZERO, &
+ ZERO,ONE,ZERO, &
+ ZERO,ZERO,ONE /), &
+ (/ NCOL, NCOL /))
+ REAL :: dev, d
+ integer :: i, j, k, eo, mu
+
+
+ dev = ZERO
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ do k = 1, VOLH
+ call su3_check_det(u(1, 1, k, eo, mu))
+ call uud(v, u(1, 1, k, eo, mu), u(1, 1, k, eo, mu))
+ do i = 1, NCOL
+ do j = 1, NCOL
+ d = abs(Re(v(i, j)) - Re(su3_one(i, j))) &
+ + abs(Im(v(i, j)) - Im(su3_one(i, j)))
+ enddo
+ enddo
+ dev = max(dev, d)
+ enddo
+ enddo
+ enddo
+
+ if (dev > 1e-13) call die('conf_check(): dev > 1e-13')
+!!write(0,'(x,a,e10.2)') 'conf_check(): max deviation is ', dev
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine conf_normalize(u) ! normalizes u-field to SU(3)
+
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD, intent(inout) :: u
+ integer :: i, eo, mu
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ do i = 1, volh
+ call u_normalize(u(1, 1, i, eo, mu))
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine conf_zero(u) ! init ("OpenMP first touch")
+
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD :: u
+ integer :: i, eo, mu
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ !$omp parallel do
+ do i = 1, volh
+ u(1, 1, i, eo, mu) = ZERO
+ u(2, 1, i, eo, mu) = ZERO
+ u(3, 1, i, eo, mu) = ZERO
+ u(1, 2, i, eo, mu) = ZERO
+ u(2, 2, i, eo, mu) = ZERO
+ u(3, 2, i, eo, mu) = ZERO
+ u(1, 3, i, eo, mu) = ZERO
+ u(2, 3, i, eo, mu) = ZERO
+ u(3, 3, i, eo, mu) = ZERO
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine conf_cold(u) ! cold start
+
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD :: u
+ integer :: i, eo, mu
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ do i = 1, volh
+ u(1, 1, i, eo, mu) = ONE
+ u(2, 1, i, eo, mu) = ZERO
+ u(3, 1, i, eo, mu) = ZERO
+ u(1, 2, i, eo, mu) = ZERO
+ u(2, 2, i, eo, mu) = ONE
+ u(3, 2, i, eo, mu) = ZERO
+ u(1, 3, i, eo, mu) = ZERO
+ u(2, 3, i, eo, mu) = ZERO
+ u(3, 3, i, eo, mu) = ONE
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine conf_hot(u) ! hot start
+
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD :: u
+ integer :: i, eo, mu
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ call ran_gauss_volh(NCOL * NCOL, u(1, 1, 1, eo, mu), ONE, eo)
+ do i = 1, volh
+ call u_normalize(u(1, 1, i, eo, mu))
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine conf_seq(action, u, u_io) ! arranges u-field for i/o
+
+ use module_lattice_io
+ use module_decomp
+ use module_vol
+ implicit none
+
+ character(*) :: action
+ GAUGE_FIELD :: u
+ GAUGE_FIELD_IO :: u_io
+
+ integer, dimension(DIM) :: j
+ integer :: x, y, z, t, i, eo, mu, c1, c2
+ integer, external :: std_xyzt2i, e_o
+
+ do t = 0, NT - 1
+ do z = 0, NZ - 1
+ do y = 0, NY - 1
+ do x = 0, NX - 1
+
+ j = (/x, y, z, t/)
+
+ i = std_xyzt2i(j)
+ eo = e_o(j)
+
+ do mu = 1, DIM
+ do c2 = 1, NCOL - 1
+ do c1 = 1, NCOL
+ if (action == "read") then
+ u(c1, c2, i, eo, mu) = u_io(c1, c2, mu, x, y, z, t)
+ else
+ u_io(c1, c2, mu, x, y, z, t) = u(c1, c2, i, eo, mu)
+ endif
+ enddo
+ enddo
+ if (action == "read") call u_complete(u(1, 1, i, eo ,mu))
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine phi_seq(action, phi, phi_io) ! arranges phi-field for i/o
+
+ use module_lattice_io
+ use module_decomp
+ use module_vol
+ implicit none
+
+ character(*) :: action
+ SPINCOL_FIELD :: phi
+ SPINCOL_FIELD_IO :: phi_io
+
+ integer, dimension(DIM) :: j
+ integer :: x, y, z, t, d, c, i
+ integer, external :: std_xyzt2i, e_o
+
+ do t = 0, NT - 1
+ do z = 0, NZ - 1
+ do y = 0, NY - 1
+ do x = 0, NX - 1
+
+ j = (/x, y, z, t/)
+
+ if (e_o(j) == EVEN) then
+ i = std_xyzt2i(j)
+ do c = 1, NCOL
+ do d = 1, NDIRAC
+ if (action == "read") then
+ phi(d, c, i) = phi_io(d, c, x, y, z, t)
+ else
+ phi_io(d, c, x, y, z, t) = phi(d, c, i)
+ endif
+ enddo
+ enddo
+ endif
+
+ enddo
+ enddo
+ enddo
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine conf_read(restart, para, conf)
+
+ use typedef_cksum
+ use typedef_para
+ use module_conf_info
+ use module_lattice_io
+ use module_p_interface
+ use module_switches
+ use module_vol
+ implicit none
+
+ character(len = *), parameter :: READ = "read"
+
+ logical :: restart
+ type(type_para) :: para
+ type(hmc_conf), dimension(MAX_TEMPER) :: conf
+
+ type(type_conf_info) :: info
+ type(type_cksum), dimension(0:para%L(4)-1) :: cksum
+ P_GAUGE_FIELD_IO, save :: u_io
+ P_SPINCOL_FIELD_IO, save :: phi_io
+ FILENAME, external :: u_file, phi_file, info_file
+ FILENAME :: file
+ integer :: i, t
+ integer :: u_m, u_mx, phi_m, phi_mx
+ integer :: n_u_io, n_phi
+
+ ALLOCATE_G_FIELD_IO(u_io)
+ ALLOCATE_SC_FIELD_IO(phi_io)
+
+ u_m = NCOL * (NCOL - 1) * DIM
+ u_mx = NX
+ phi_m = NDIRAC * NCOL
+ phi_mx = NXH
+ n_u_io = u_m * vol * SIZE_COMPLEX
+ n_phi = size_sc_field
+
+ do i = 1, para%n_temper
+
+ if (restart) then
+ file = info_file(i)
+ else
+ file = para%info_file(i)
+ endif
+
+ open(UINFO, file = file, action = READ, status = "old")
+ call read_conf_info_header(UINFO, info)
+ call check_conf_info_header(restart, info, para)
+
+ if (restart) then
+ conf(i)%former = info%ensemble(2)
+ else
+ conf(i)%former = i
+ endif
+
+ ! read U
+
+ call read_cksum(restart, UINFO, cksum, para%L(4), i, u_file)
+
+ TIMING_START(timing_bin_u_read)
+ call field_io(READ, u_m, u_mx, u_io, cksum)
+ TIMING_STOP(timing_bin_u_read)
+
+ call conf_seq(READ, conf(i)%u, u_io)
+ call xbound_g_field(conf(i)%u)
+
+ if (switches%tempering .and. switches%dynamical) then
+ ! read PHI
+ call read_cksum(restart, UINFO, cksum, para%L(4), i, phi_file)
+ call field_io(READ, phi_m, phi_mx, phi_io, cksum)
+ call phi_seq(READ, conf(i)%phi, phi_io)
+ call xbound_sc_field(conf(i)%phi)
+ endif
+
+ close(UINFO)
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine conf_write(restart, para, conf)
+
+ use typedef_cksum
+ use typedef_para
+ use module_function_decl
+ use module_lattice_io
+ use module_p_interface
+ use module_switches
+ use module_vol
+ implicit none
+
+ character(len = *), parameter :: WRITE = "write"
+
+ logical :: restart
+ type(type_para) :: para
+ type(hmc_conf), dimension(MAX_TEMPER) :: conf
+
+ type(type_cksum), dimension(0:para%L(4)-1) :: cksum
+ P_GAUGE_FIELD_IO, save :: u_io
+ P_SPINCOL_FIELD_IO, save :: phi_io
+ FILENAME, external :: u_file, phi_file, info_file
+ FILENAME, external :: conf_file, conf_info_file
+ FILENAME :: f_info
+ integer :: i, j, t
+ integer :: u_m, u_mx, phi_m, phi_mx
+ integer :: n_u_io, n_phi
+ REAL :: plaq
+ REAL, external :: sg
+
+
+ ALLOCATE_G_FIELD_IO(u_io)
+ ALLOCATE_SC_FIELD_IO(phi_io)
+
+ u_m = NCOL * (NCOL - 1) * DIM
+ u_mx = NX
+ phi_m = NDIRAC * NCOL
+ phi_mx = NXH
+ n_u_io = u_m * vol * SIZE_COMPLEX
+ n_phi = size_sc_field
+
+ call check_former(para%n_temper, conf)
+
+ do i = 1, para%n_temper
+
+ j = conf(i)%former
+
+ if (restart) then
+ f_info = info_file(i)
+ else
+ f_info = conf_info_file(i, j)
+ endif
+
+ if (my_pe() == 0) open(UINFO, file = f_info, action = WRITE)
+ plaq = sg(conf(i)%u) / (SIX * volume)
+ call write_conf_info_header(para, i, j, plaq)
+
+ ! write U
+
+ do t = 0, para%L(4) - 1
+ if (restart) then
+ cksum(t)%file = u_file(i, t)
+ else
+ cksum(t)%file = conf_file(i, j, t)
+ endif
+ enddo
+
+ call conf_seq(WRITE, conf(i)%u, u_io)
+
+ TIMING_START(timing_bin_u_write)
+ call field_io(WRITE, u_m, u_mx, u_io, cksum)
+ TIMING_STOP(timing_bin_u_write)
+
+ call write_cksum(UINFO, cksum, para%L(4))
+
+ if (switches%tempering .and. switches%dynamical .and. restart) then
+ ! write PHI
+ do t = 0, para%L(4) - 1
+ cksum(t)%file = phi_file(i, t)
+ enddo
+ call phi_seq(WRITE, conf(i)%phi, phi_io)
+ call field_io(WRITE, phi_m, phi_mx, phi_io, cksum)
+ call write_cksum(UINFO, cksum, para%L(4))
+ endif
+
+ if (my_pe() == 0) close(UINFO)
+ enddo
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/conf_info.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/conf_info.F90
new file mode 100644
index 0000000000000000000000000000000000000000..a2b227a4b3b5b5753920c2ec33064f99a3640cb7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/conf_info.F90
@@ -0,0 +1,200 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! conf_info.F90 - read/write/check file containing configuration parameters
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine write_conf_info_header(para, i_ensemble1, i_ensemble2, plaq)
+
+ use module_conf_info
+ use typedef_para
+ use module_bqcd
+ use module_counter
+ use module_decomp
+ use module_function_decl
+ implicit none
+
+ type(type_para) :: para
+ REAL :: plaq
+ integer :: i_ensemble1, i_ensemble2
+ integer :: i, e(2)
+
+ e(1) = i_ensemble1
+ e(2) = i_ensemble2
+
+ if (my_pe() == 0) then
+ call begin(UINFO, "ConfInfoHeader")
+ write(UINFO, *) k_format, conf_info_version
+ write(UINFO, 400) k_prog, prog_name, prog_version
+ write(UINFO, *) k_run, para%run
+ write(UINFO, *) k_traj, counter%traj
+ write(UINFO, 405) k_host, rechner()
+ write(UINFO, 400) k_date, datum(), uhrzeit()
+ write(UINFO, 410) k_L, decomp%std%L
+ write(UINFO, 410) k_bc, decomp%std%bc_fermions
+ write(UINFO, *) k_rkind, RKIND
+ write(UINFO, 420) k_plaq, plaq
+
+ do i = 1, 2
+ write(UINFO, *) trim(k_ensemble(i)), e(i)
+ write(UINFO, 405) trim(k_beta(i)), trim(para%c_hmc(e(i))%beta)
+ write(UINFO, 405) trim(k_kappa(i)), trim(para%c_hmc(e(i))%kappa)
+ write(UINFO, 405) trim(k_csw(i)), trim(para%c_hmc(e(i))%csw)
+ write(UINFO, 405) trim(k_csw_kappa(i)),trim(para%c_hmc(e(i))%csw_kappa)
+ write(UINFO, 405) trim(k_h(i)), trim(para%c_hmc(e(i))%h)
+ enddo
+ call end_A(UINFO, "ConfInfoHeader")
+ endif
+
+400 format (3(1x,a))
+405 format (2(1x,a))
+410 format (1x,a,4i3)
+420 format (1x,a,1x,e25.14)
+end
+
+!-------------------------------------------------------------------------------
+subroutine read_conf_info_header(unit, info)
+
+ use module_bqcd
+ use module_conf_info
+ implicit none
+
+ type(type_conf_info) :: info
+ integer :: unit, v, i
+ integer, external :: pos_keyword
+
+ call read_keyword_int(unit, k_format, v, 1)
+
+ if (v /= conf_info_version) then
+ call die("read_conf_info_header(): wrong file format")
+ endif
+
+ call read_keyword_int(unit, k_L, info%L, DIM)
+ call read_keyword_int(unit, k_bc, info%bc_fermions, DIM)
+ call read_keyword_int(unit, k_rkind, info%rkind, 1)
+
+ do i = 1, 2
+ call read_keyword_int (unit, k_ensemble(i), info%ensemble(i), 1)
+ call read_keyword_REAL(unit, k_beta(i), info%beta(i), 1)
+ call read_keyword_REAL(unit, k_kappa(i), info%kappa(i), 1)
+ call read_keyword_REAL(unit, k_csw(i), info%csw(i), 1)
+ call read_keyword_REAL(unit, k_csw_kappa(i),info%csw_kappa(i),1)
+ call read_keyword_REAL(unit, k_h(i), info%h(i), 1)
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine check_conf_info_header(restart, info, para)
+
+ use module_conf_info
+ use module_decomp
+ use typedef_para
+ implicit none
+
+ logical :: restart
+ type(type_conf_info) :: info
+ type(type_para) :: para
+ integer :: mu, i
+
+ if (info%rkind /= RKIND) call die("check_conf_info_header(): RKIND wrong")
+
+ do mu = 1, DIM
+ if (info%L(mu) /= decomp%std%L(mu)) then
+ call die("check_conf_info_header(): L inconsistent")
+ endif
+ enddo
+
+ if (restart) then
+ do mu = 1, DIM
+ if (info%bc_fermions(mu) /= decomp%std%bc_fermions(mu)) then
+ call die("check_conf_info_header(): bc_fermions inconsistent")
+ endif
+ enddo
+
+ do i = 1, 2
+ if (info%ensemble(i) < 1 .or. info%ensemble(i) > para%n_temper) then
+ call die("check_conf_info_header(): i_ensemble out of range")
+ endif
+
+ if (info%beta(i) /= para%hmc(info%ensemble(i))%beta) then
+ call die("check_conf_info_header(): beta inconsistent")
+ endif
+
+ if (info%kappa(i) /= para%hmc(info%ensemble(i))%kappa) then
+ call die("check_conf_info_header(): kappa inconsistent")
+ endif
+
+ if (info%csw(i) /= para%hmc(info%ensemble(i))%csw) then
+ call die("check_conf_info_header(): csw inconsistent")
+ endif
+
+ if (abs(info%csw_kappa(i) - &
+ para%hmc(info%ensemble(i))%csw_kappa) > 1e-13 ) then
+ call die("check_conf_info_header(): csw_kappa inconsistent")
+ endif
+
+ if (info%h(i) /= para%hmc(info%ensemble(i))%h) then
+ call die("check_conf_info_header(): h inconsistent")
+ endif
+ enddo
+ endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine read_cksum(restart, unit, cksum, LT, i_temper, file_name)
+
+ use typedef_cksum
+ implicit none
+ logical :: restart
+ integer :: unit, LT, i_temper, t
+ FILENAME, external :: file_name
+ type(type_cksum), dimension(0:LT-1) :: cksum
+
+ call pos_keyword(unit, ">BeginCheckSum")
+ read(unit,*)
+
+ do t = 0, LT - 1
+ read(unit,*) cksum(t)%file, cksum(t)%sum
+
+ if (restart) then
+ if (cksum(t)%file /= file_name(i_temper, t)) then
+ call die("read_cksum(): file names inconsistent")
+ endif
+ endif
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine write_cksum(unit, cksum, LT)
+
+ use typedef_cksum
+ use module_function_decl
+ implicit none
+
+ integer :: unit, LT, i
+ type(type_cksum), dimension(LT) :: cksum
+
+ if (my_pe() == 0) then
+ call begin(unit, "CheckSum")
+ do i = 1, LT
+ write(unit, *) trim(cksum(i)%file), cksum(i)%sum, cksum(i)%bytes
+ enddo
+ call end_A(unit, "CheckSum")
+ endif
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cooling.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/cooling.F90
new file mode 100644
index 0000000000000000000000000000000000000000..fba73b1a8947cfb6f10f85cd709722a15f0e5a40
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cooling.F90
@@ -0,0 +1,287 @@
+!-------------------------------------------------------------------------------
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! cooling.F90 - measurement of the topological charge using standard cooling
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_cooling
+
+ integer, save :: n_cool
+ logical, dimension(:), pointer, save :: measure_q
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_cooling(list)
+
+ use module_cooling
+ implicit none
+ character(*), intent(in) :: list
+ integer :: i, iostat
+
+ if (list /= " ") then
+ open(ULIST, file = list, action = "read", status = "old")
+
+ iostat = 0
+ n_cool = 0
+ do while (iostat == 0)
+ read(ULIST, *, iostat = iostat) i
+ if (i < 0) then
+ call die("init_cooling(): list has negative entries")
+ else
+ n_cool = max(n_cool, i)
+ endif
+ enddo
+
+ allocate(measure_q(0:n_cool))
+
+ do i = 0, n_cool
+ measure_q(i) = .false.
+ enddo
+
+ rewind(ULIST)
+
+ iostat = 0
+ do while (iostat == 0)
+ read(ULIST, *, iostat = iostat) i
+ measure_q(i) = .true.
+ enddo
+
+ close(ULIST)
+
+ else
+ n_cool = -1
+ endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine cooling(u_in, traj, i_ensemble1, i_ensemble2)
+
+ use module_cooling
+ use module_function_decl
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ integer, intent(in) :: traj, i_ensemble1, i_ensemble2
+ GAUGE_FIELD, intent(in) :: u_in
+ P_GAUGE_FIELD, save :: u
+ integer :: i
+ character(len = *), parameter :: key = "%Qc"
+ REAL :: q, plaq
+
+ if (n_cool < 0) return
+
+ TIMING_START(timing_bin_cooling)
+
+ ALLOCATE_G_FIELD(u)
+
+ u = u_in
+
+ call begin(UREC, "Cooling")
+
+ if (my_pe() == 0) then
+ write(UREC, 400) "T", key, "traj", "e", "f", "i_cool", &
+ "Q_cool", "PlaqEnergy"
+ endif
+
+ do i = 0, n_cool
+ if (measure_q(i)) then
+ call conf_check(u)
+ call top_charge(q, u, plaq)
+ if (my_pe() == 0) then
+ write(UREC, 410) key, traj, i_ensemble1, i_ensemble2, i, q, plaq
+ endif
+ endif
+ if (i < n_cool) call conf_relax(u)
+ enddo
+
+ call end_A(UREC, "Cooling")
+
+400 format (1x, 2a, a6, 2a3, a8, a15, a15)
+410 format (1x, a4, i6, 2i3, i8, f15.6, f15.10)
+
+ TIMING_STOP(timing_bin_cooling)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine conf_relax(u)
+
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD, intent(inout) :: u
+ SU3 :: uuu, w, a
+ SU3, parameter :: su3_one = reshape( &
+ (/ ONE,ZERO,ZERO, &
+ ZERO,ONE,ZERO, &
+ ZERO,ZERO,ONE /), &
+ (/ NCOL, NCOL /))
+ REAL :: p0, p1, p2, p3, fac
+ REAL :: a0, a1, a2, a3
+ integer :: i, eo, mu, k, c1, c2
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ !$omp parallel do private(uuu, w, a, p0, p1, p2, p3, fac, &
+ !$omp a0, a1, a2, a3, k, c1, c2)
+ do i = 1, VOLH
+ call staple(uuu, u, i, eo, mu)
+ do k = 1, NCOL
+ if (k == 1) then
+ c1 = 1
+ c2 = 2
+ else if (k == 2) then
+ c1 = 1
+ c2 = 3
+ else if (k == 3) then
+ c1 = 2
+ c2 = 3
+ endif
+
+ call uu(w, u(1, 1, i, eo, mu), uuu) ! w = u * uuu
+
+ p0 = Re(w(c1, c1)) + Re(w(c2, c2))
+ p1 = -(Im(w(c1, c2)) + Im(w(c2, c1)))
+ p2 = -(Re(w(c1, c2)) - Re(w(c2, c1)))
+ p3 = -(Im(w(c1, c1)) - Im(w(c2, c2)))
+
+ fac = ONE / sqrt(p0**2 + p1**2 + p2**2 + p3**2)
+
+ a0 = fac * p0
+ a1 = fac * p1
+ a2 = fac * p2
+ a3 = fac * p3
+
+ a = su3_one
+
+ a(c1, c1) = cmplx( a0, a3)
+ a(c1, c2) = cmplx( a2, a1)
+ a(c2, c1) = cmplx(-a2, a1)
+ a(c2, c2) = cmplx( a0, -a3)
+
+ call u_update(u(1, 1, i, eo, mu), a) ! u = a * u
+
+ enddo ! k
+ enddo ! i
+ call xbound_g(u, eo, mu)
+ enddo ! eo
+ enddo ! mu
+end
+
+!-------------------------------------------------------------------------------
+subroutine top_charge(qq, u, plaq_energy)
+
+ use module_vol
+ use module_nn
+ implicit none
+
+ REAL, intent(out) :: qq, plaq_energy
+ GAUGE_FIELD, intent(in) :: u
+
+ integer :: e, o, mu, nu, i, j1, j2, j3, j4, j5, j6, j7, c1, c2
+ SU3 :: uuu, left, right
+ COMPLEX, dimension(NCOL, NCOL, DIM - 1, DIM) :: ut ! U~(x,mu,nu) - h.c.
+ REAL :: q, plaq
+ REAL, external :: global_sum, Re_Tr_uu
+
+ !----------------------------------------------------------------------
+ !
+ ! (j3, e) --->--- (j2, o) ---<--- x
+ ! | | |
+ ! | | |
+ ! ^ v ^ nu
+ ! | | | ^
+ ! | | | |
+ ! (j4, o) ---<--- (i,e) --->--- (j1, o) x--> mu
+ ! | | |
+ ! | | |
+ ! v ^ v
+ ! | | |
+ ! | | |
+ ! (j5, e) --->--- (j6, o) ---<--- (j7, e)
+ !
+ !----------------------------------------------------------------------
+
+ q = 0
+ plaq = 0
+
+ do e = EVEN, ODD
+ o = EVEN + ODD - e
+ !$omp parallel do reduction(+: q, plaq) private(uuu, left, right, ut, &
+ !$omp mu, nu, i, j1, j2, j3, j4, j5, j6, j7, c1, c2)
+ do i = 1, VOLH
+ do mu = 1, DIM - 1
+ do nu = mu + 1, DIM
+
+ j1 = nn(i, e, mu, FWD)
+ j2 = nn(i, e, nu, FWD)
+ j3 = nn(j2, o, mu, BWD)
+ j4 = nn(j3, e, nu, BWD)
+ j5 = nn(j4, o, nu, BWD)
+ j6 = nn(j5, e, mu, FWD)
+ j7 = nn(j6, o, mu, FWD)
+
+ uuu = 0
+
+ call uuu_fwd(uuu, u(1, 1, j1, o, nu), &
+ u(1, 1, j2, o, mu), &
+ u(1, 1, i, e, nu))
+
+ plaq = plaq + Re_Tr_uu(uuu, u(1, 1, i, e, mu))
+
+ call uuu_bwd_m(uuu, u(1, 1, j7, e, nu), &
+ u(1, 1, j6, o, mu), &
+ u(1, 1, j6, o, nu))
+
+ call uu(right, u(1, 1, i, e, mu), uuu)
+
+ uuu = 0
+
+ call uuu_fwd(uuu, u(1, 1, i, e, nu), &
+ u(1, 1, j3, e, mu), &
+ u(1, 1, j4, o, nu))
+
+ call uuu_bwd_m(uuu, u(1, 1, j6, o, nu), &
+ u(1, 1, j5, e, mu), &
+ u(1, 1, j5, e, nu))
+
+ call uu(left, uuu, u(1, 1, j4, o, mu))
+
+ do c2 = 1, NCOL
+ do c1 = 1, NCOL
+ ut(c1, c2, mu, nu) = right(c1, c2) - conjg(right(c2, c1)) &
+ + left(c1, c2) - conjg(left(c2, c1))
+ enddo
+ enddo
+ enddo ! nu
+ enddo ! mu
+
+ q = q + Re_Tr_uu(ut(1, 1, 1, 2), ut(1, 1, 3, 4)) &
+ - Re_Tr_uu(ut(1, 1, 1, 3), ut(1, 1, 2, 4)) &
+ + Re_Tr_uu(ut(1, 1, 1, 4), ut(1, 1, 2, 3))
+
+ enddo ! i
+ enddo ! e/o
+
+ q = global_sum(q)
+ plaq = global_sum(plaq)
+
+ q = -q / (256 * PI**2)
+ qq = q
+ plaq_energy = ONE - plaq / (THREE * SIX * volume)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D.F90
new file mode 100644
index 0000000000000000000000000000000000000000..dc1887d20347cadf207611ee826a99426962eb8d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D.F90
@@ -0,0 +1,52 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger)
+! (optimization for Cray T3E)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine NAME(e, o, out, in, u)
+
+! out := NAME in
+!
+! NAME = d or d_dag
+!
+! out is of type "e" = EVEN or ODD
+! in is of type "o" = ODD or EVEN
+
+ use module_nn
+ use module_vol
+ implicit none
+
+ integer :: e, o
+ SPINCOL_FIELD :: out, in
+ GAUGE_FIELD :: u
+
+ TIMING_START(STRCAT(timing_bin_, NAME))
+
+ call xbound_sc_field(in)
+
+ call STRCAT(NAME, _t )(out, in, u(1, 1, 1, e, 4), u(1, 1, 1, o, 4), &
+ nn(1, e, 4, FWD), nn(1, e, 4, BWD), VOLH)
+ call STRCAT(NAME, _zb)(out, in, u(1, 1, 1, o, 3), nn(1, e, 3, BWD), VOLH)
+ call STRCAT(NAME, _zf)(out, in, u(1, 1, 1, e, 3), nn(1, e, 3, FWD), VOLH)
+ call STRCAT(NAME, _yb)(out, in, u(1, 1, 1, o, 2), nn(1, e, 2, BWD), VOLH)
+ call STRCAT(NAME, _yf)(out, in, u(1, 1, 1, e, 2), nn(1, e, 2, FWD), VOLH)
+ call STRCAT(NAME, _xb)(out, in, u(1, 1, 1, o, 1), nn(1, e, 1, BWD), VOLH)
+ call STRCAT(NAME, _xf)(out, in, u(1, 1, 1, e, 1), nn(1, e, 1, FWD), VOLH)
+
+ TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2.F90
new file mode 100644
index 0000000000000000000000000000000000000000..fe9f9fc5618e274f62ee710478b11d8a2145bb59
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2.F90
@@ -0,0 +1,52 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D2.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger)
+! (optimization for Hitachi SR8000)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine NAME(e, o, out, in, u)
+
+! out := NAME in
+!
+! NAME = d or d_dag
+!
+! out is of type "e" = EVEN or ODD
+! in is of type "o" = ODD or EVEN
+
+ use module_nn
+ use module_vol
+ implicit none
+
+ integer :: e, o
+ SPINCOL_FIELD :: out, in
+ GAUGE_FIELD :: u
+
+ TIMING_START(STRCAT(timing_bin_, NAME))
+
+ call xbound_sc_field(in)
+
+ call STRCAT(NAME, _t )(out, in, u(1, 1, 1, e, 4), u(1, 1, 1, o, 4), &
+ nn(1, e, 4, FWD), nn(1, e, 4, BWD), VOLH)
+ call STRCAT(NAME, _zf)(out, in, u(1, 1, 1, e, 3), u(1, 1, 1, o, 3), &
+ nn(1, e, 3, FWD), nn(1, e, 3, BWD), VOLH)
+ call STRCAT(NAME, _yf)(out, in, u(1, 1, 1, e, 2), u(1, 1, 1, o, 2), &
+ nn(1, e, 2, FWD), nn(1, e, 2, BWD), VOLH)
+ call STRCAT(NAME, _xf)(out, in, u(1, 1, 1, e, 1), u(1, 1, 1, o, 1), &
+ nn(1, e, 1, FWD), nn(1, e, 1, BWD), VOLH)
+
+ TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21.F90
new file mode 100644
index 0000000000000000000000000000000000000000..0ce1150ece84f5ace8a67a3871a369565b63d570
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21.F90
@@ -0,0 +1,173 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D21.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger)
+! projection onto 2 spincol components
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine NAME(e, o, out, in, u)
+
+! out := NAME in
+!
+! NAME = d or d_dag
+!
+! out is of type "e" = EVEN or ODD
+! in is of type "o" = ODD or EVEN
+
+ use module_d21
+ use module_nn
+ use module_vol
+ use module_p_interface
+ implicit none
+
+ integer :: e, o
+ SPINCOL_FIELD :: out, in
+ GAUGE_FIELD :: u
+
+
+ TIMING_START(STRCAT(timing_bin_, NAME))
+
+ ALLOCATE_SC2_FIELD(a)
+
+ call STRCAT(NAME, _projection)(a, in)
+
+!!call xbound_sc2_field(a)
+ call xbound_sc2_field_i(a)
+
+ call STRCAT(NAME, _t )(out, a(1, 1, 1, 4, FWD), a(1, 1, 1, 4, BWD), &
+ u(1, 1, 1, e, 4), u(1, 1, 1, o, 4), &
+ nn(1, e, 4, FWD), nn(1, e, 4, BWD), VOLH)
+
+ call STRCAT(NAME, _zf)(out, a(1, 1, 1, 3, FWD), a(1, 1, 1, 3, BWD), &
+ u(1, 1, 1, e, 3), u(1, 1, 1, o, 3), &
+ nn(1, e, 3, FWD), nn(1, e, 3, BWD), VOLH)
+
+ call STRCAT(NAME, _yf)(out, a(1, 1, 1, 2, FWD), a(1, 1, 1, 2, BWD), &
+ u(1, 1, 1, e, 2), u(1, 1, 1, o, 2), &
+ nn(1, e, 2, FWD), nn(1, e, 2, BWD), VOLH)
+
+ call STRCAT(NAME, _xf)(out, a(1, 1, 1, 1, FWD), a(1, 1, 1, 1, BWD), &
+ u(1, 1, 1, e, 1), u(1, 1, 1, o, 1), &
+ nn(1, e, 1, FWD), nn(1, e, 1, BWD), VOLH)
+
+ TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine STRCAT(NAME, _projection)(out, in)
+
+ use module_vol
+ implicit none
+
+ SC2_FIELD, intent(out) :: out
+ SPINCOL_FIELD, intent(in) :: in
+ integer :: i
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ TIMING_START(timing_bin_sc2_projection)
+
+#ifdef DAGGER
+# define PLUS -
+# define MINUS +
+# define D_T_ONE 3
+# define D_T_TWO 4
+# define D_T_THREE 1
+# define D_T_FOUR 2
+#else
+# define PLUS +
+# define MINUS -
+# define D_T_ONE 1
+# define D_T_TWO 2
+# define D_T_THREE 3
+# define D_T_FOUR 4
+#endif
+
+
+ !$omp parallel do
+ do i = 1, volh
+
+ out(1, 1, i, 1, FWD) = in(1, 1, i) MINUS i_times(in(4, 1, i))
+ out(2, 1, i, 1, FWD) = in(2, 1, i) MINUS i_times(in(3, 1, i))
+ out(1, 1, i, 1, BWD) = in(1, 1, i) PLUS i_times(in(4, 1, i))
+ out(2, 1, i, 1, BWD) = in(2, 1, i) PLUS i_times(in(3, 1, i))
+
+ out(1, 2, i, 1, FWD) = in(1, 2, i) MINUS i_times(in(4, 2, i))
+ out(2, 2, i, 1, FWD) = in(2, 2, i) MINUS i_times(in(3, 2, i))
+ out(1, 2, i, 1, BWD) = in(1, 2, i) PLUS i_times(in(4, 2, i))
+ out(2, 2, i, 1, BWD) = in(2, 2, i) PLUS i_times(in(3, 2, i))
+
+ out(1, 3, i, 1, FWD) = in(1, 3, i) MINUS i_times(in(4, 3, i))
+ out(2, 3, i, 1, FWD) = in(2, 3, i) MINUS i_times(in(3, 3, i))
+ out(1, 3, i, 1, BWD) = in(1, 3, i) PLUS i_times(in(4, 3, i))
+ out(2, 3, i, 1, BWD) = in(2, 3, i) PLUS i_times(in(3, 3, i))
+
+
+ out(1, 1, i, 2, FWD) = in(1, 1, i) MINUS in(4, 1, i)
+ out(2, 1, i, 2, FWD) = in(2, 1, i) PLUS in(3, 1, i)
+ out(1, 1, i, 2, BWD) = in(1, 1, i) PLUS in(4, 1, i)
+ out(2, 1, i, 2, BWD) = in(2, 1, i) MINUS in(3, 1, i)
+
+ out(1, 2, i, 2, FWD) = in(1, 2, i) MINUS in(4, 2, i)
+ out(2, 2, i, 2, FWD) = in(2, 2, i) PLUS in(3, 2, i)
+ out(1, 2, i, 2, BWD) = in(1, 2, i) PLUS in(4, 2, i)
+ out(2, 2, i, 2, BWD) = in(2, 2, i) MINUS in(3, 2, i)
+
+ out(1, 3, i, 2, FWD) = in(1, 3, i) MINUS in(4, 3, i)
+ out(2, 3, i, 2, FWD) = in(2, 3, i) PLUS in(3, 3, i)
+ out(1, 3, i, 2, BWD) = in(1, 3, i) PLUS in(4, 3, i)
+ out(2, 3, i, 2, BWD) = in(2, 3, i) MINUS in(3, 3, i)
+
+
+ out(1, 1, i, 3, FWD) = in(1, 1, i) MINUS i_times(in(3, 1, i))
+ out(2, 1, i, 3, FWD) = in(2, 1, i) PLUS i_times(in(4, 1, i))
+ out(1, 1, i, 3, BWD) = in(1, 1, i) PLUS i_times(in(3, 1, i))
+ out(2, 1, i, 3, BWD) = in(2, 1, i) MINUS i_times(in(4, 1, i))
+
+ out(1, 2, i, 3, FWD) = in(1, 2, i) MINUS i_times(in(3, 2, i))
+ out(2, 2, i, 3, FWD) = in(2, 2, i) PLUS i_times(in(4, 2, i))
+ out(1, 2, i, 3, BWD) = in(1, 2, i) PLUS i_times(in(3, 2, i))
+ out(2, 2, i, 3, BWD) = in(2, 2, i) MINUS i_times(in(4, 2, i))
+
+ out(1, 3, i, 3, FWD) = in(1, 3, i) MINUS i_times(in(3, 3, i))
+ out(2, 3, i, 3, FWD) = in(2, 3, i) PLUS i_times(in(4, 3, i))
+ out(1, 3, i, 3, BWD) = in(1, 3, i) PLUS i_times(in(3, 3, i))
+ out(2, 3, i, 3, BWD) = in(2, 3, i) MINUS i_times(in(4, 3, i))
+
+
+ out(1, 1, i, 4, FWD) = in(D_T_THREE, 1, i)
+ out(2, 1, i, 4, FWD) = in(D_T_FOUR, 1, i)
+ out(1, 1, i, 4, BWD) = in(D_T_ONE, 1, i)
+ out(2, 1, i, 4, BWD) = in(D_T_TWO, 1, i)
+
+ out(1, 2, i, 4, FWD) = in(D_T_THREE, 2, i)
+ out(2, 2, i, 4, FWD) = in(D_T_FOUR, 2, i)
+ out(1, 2, i, 4, BWD) = in(D_T_ONE, 2, i)
+ out(2, 2, i, 4, BWD) = in(D_T_TWO, 2, i)
+
+ out(1, 3, i, 4, FWD) = in(D_T_THREE, 3, i)
+ out(2, 3, i, 4, FWD) = in(D_T_FOUR, 3, i)
+ out(1, 3, i, 4, BWD) = in(D_T_ONE, 3, i)
+ out(2, 3, i, 4, BWD) = in(D_T_TWO, 3, i)
+
+ enddo
+
+ TIMING_STOP(timing_bin_sc2_projection)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21xyzt.F90
new file mode 100644
index 0000000000000000000000000000000000000000..db8ef9618b9be3f0f02b2d7ba2385734a231d43e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21xyzt.F90
@@ -0,0 +1,188 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D21xyzt.F90 - routines needed in D21.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+# define GAMMA_AB1(C) a_bwd(1, C, jb)
+# define GAMMA_AB2(C) a_bwd(2, C, jb)
+# define GAMMA_AF1(C) a_fwd(1, C, jf)
+# define GAMMA_AF2(C) a_fwd(2, C, jf)
+
+#ifdef DIR_T
+
+#ifdef DAGGER
+# define GAMMA_B1(C) bf1_ ## C
+# define GAMMA_B2(C) bf2_ ## C
+# define GAMMA_B3(C) bb1_ ## C
+# define GAMMA_B4(C) bb2_ ## C
+#else
+# define GAMMA_B1(C) bb1_ ## C
+# define GAMMA_B2(C) bb2_ ## C
+# define GAMMA_B3(C) bf1_ ## C
+# define GAMMA_B4(C) bf2_ ## C
+#endif
+
+# define UPDATE_B(S, C) b(S, C, i) = TWO * GAMMA_B ## S ## (C)
+
+#else
+
+#ifdef DAGGER
+# define PLUS -
+# define MINUS +
+#else
+# define PLUS +
+# define MINUS -
+#endif
+
+#ifdef DIR_X
+# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C)
+# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C)
+#endif
+
+#ifdef DIR_Y
+# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C
+# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C
+#endif
+
+#ifdef DIR_Z
+# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C)
+# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C)
+#endif
+
+# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C
+# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C
+
+# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C)
+
+#endif
+
+!-------------------------------------------------------------------------------
+subroutine NAME(b, a_fwd, a_bwd, u_e, u_o, nn_fwd, nn_bwd, volh)
+
+ implicit none
+
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b
+ COMPLEX, dimension (2, NCOL, *), intent(in) :: a_fwd, a_bwd
+ COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o
+ INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd
+ integer :: volh
+
+ integer :: i, jf, jb
+
+ COMPLEX :: ab1, ab2, af1, af2
+ COMPLEX :: bf1_1, bf2_1
+ COMPLEX :: bf1_2, bf2_2
+ COMPLEX :: bf1_3, bf2_3
+ COMPLEX :: bb1_1, bb2_1
+ COMPLEX :: bb1_2, bb2_2
+ COMPLEX :: bb1_3, bb2_3
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+
+ TIMING_START(STRCAT(timing_bin_, NAME))
+
+ !$omp parallel do private(jf, jb, ab1, ab2, af1, af2, &
+ !$omp bf1_1, bf2_1, bf1_2, bf2_2, bf1_3, bf2_3, &
+ !$omp bb1_1, bb2_1, bb1_2, bb2_2, bb1_3, bb2_3)
+ do i = 1, volh
+ jb = nn_bwd(i)
+
+ ab1 = GAMMA_AB1(1)
+ ab2 = GAMMA_AB2(1)
+
+ bb1_1 = ab1 * conjg(u_o(1, 1, jb))
+ bb2_1 = ab2 * conjg(u_o(1, 1, jb))
+ bb1_2 = ab1 * conjg(u_o(1, 2, jb))
+ bb2_2 = ab2 * conjg(u_o(1, 2, jb))
+ bb1_3 = ab1 * conjg(u_o(1, 3, jb))
+ bb2_3 = ab2 * conjg(u_o(1, 3, jb))
+
+ jf = nn_fwd(i)
+
+ af1 = GAMMA_AF1(1)
+ af2 = GAMMA_AF2(1)
+
+ bf1_1 = af1 * u_e(1, 1, i)
+ bf2_1 = af2 * u_e(1, 1, i)
+ bf1_2 = af1 * u_e(2, 1, i)
+ bf2_2 = af2 * u_e(2, 1, i)
+ bf1_3 = af1 * u_e(3, 1, i)
+ bf2_3 = af2 * u_e(3, 1, i)
+
+ ab1 = GAMMA_AB1(2)
+ ab2 = GAMMA_AB2(2)
+
+ bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb))
+ bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb))
+ bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb))
+ bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb))
+ bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb))
+ bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb))
+
+ af1 = GAMMA_AF1(2)
+ af2 = GAMMA_AF2(2)
+
+ bf1_1 = bf1_1 + af1 * u_e(1, 2, i)
+ bf2_1 = bf2_1 + af2 * u_e(1, 2, i)
+ bf1_2 = bf1_2 + af1 * u_e(2, 2, i)
+ bf2_2 = bf2_2 + af2 * u_e(2, 2, i)
+ bf1_3 = bf1_3 + af1 * u_e(3, 2, i)
+ bf2_3 = bf2_3 + af2 * u_e(3, 2, i)
+
+ ab1 = GAMMA_AB1(3)
+ ab2 = GAMMA_AB2(3)
+
+ bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb))
+ bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb))
+ bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb))
+ bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb))
+ bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb))
+ bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb))
+
+ af1 = GAMMA_AF1(3)
+ af2 = GAMMA_AF2(3)
+
+ bf1_1 = bf1_1 + af1 * u_e(1, 3, i)
+ bf2_1 = bf2_1 + af2 * u_e(1, 3, i)
+ bf1_2 = bf1_2 + af1 * u_e(2, 3, i)
+ bf2_2 = bf2_2 + af2 * u_e(2, 3, i)
+ bf1_3 = bf1_3 + af1 * u_e(3, 3, i)
+ bf2_3 = bf2_3 + af2 * u_e(3, 3, i)
+
+
+ UPDATE_B(1, 1)
+ UPDATE_B(2, 1)
+ UPDATE_B(3, 1)
+ UPDATE_B(4, 1)
+
+ UPDATE_B(1, 2)
+ UPDATE_B(2, 2)
+ UPDATE_B(3, 2)
+ UPDATE_B(4, 2)
+
+ UPDATE_B(1, 3)
+ UPDATE_B(2, 3)
+ UPDATE_B(3, 3)
+ UPDATE_B(4, 3)
+
+ enddo
+
+ TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2xyzt.F90
new file mode 100644
index 0000000000000000000000000000000000000000..01e0aa6e50b47ce1419a946d11e0ceb11fe02867
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2xyzt.F90
@@ -0,0 +1,202 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D2xyzt.F90 - routines needed in D2.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+#ifdef DIR_T
+
+#ifdef DAGGER
+# define GAMMA_AB1(C) a(3, C, jb)
+# define GAMMA_AB2(C) a(4, C, jb)
+# define GAMMA_AF1(C) a(1, C, jf)
+# define GAMMA_AF2(C) a(2, C, jf)
+# define GAMMA_B1(C) bf1_ ## C
+# define GAMMA_B2(C) bf2_ ## C
+# define GAMMA_B3(C) bb1_ ## C
+# define GAMMA_B4(C) bb2_ ## C
+#else
+# define GAMMA_AB1(C) a(1, C, jb)
+# define GAMMA_AB2(C) a(2, C, jb)
+# define GAMMA_AF1(C) a(3, C, jf)
+# define GAMMA_AF2(C) a(4, C, jf)
+# define GAMMA_B1(C) bb1_ ## C
+# define GAMMA_B2(C) bb2_ ## C
+# define GAMMA_B3(C) bf1_ ## C
+# define GAMMA_B4(C) bf2_ ## C
+#endif
+
+# define UPDATE_B(S, C) b(S, C, i) = TWO * GAMMA_B ## S ## (C)
+
+#else
+
+#ifdef DAGGER
+# define PLUS -
+# define MINUS +
+#else
+# define PLUS +
+# define MINUS -
+#endif
+
+#ifdef DIR_X
+# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(4, C, jb))
+# define GAMMA_AB2(C) a(2, C, jb) PLUS i_times(a(3, C, jb))
+# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(4, C, jf))
+# define GAMMA_AF2(C) a(2, C, jf) MINUS i_times(a(3, C, jf))
+# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C)
+# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C)
+#endif
+
+#ifdef DIR_Y
+# define GAMMA_AB1(C) a(1, C, jb) PLUS a(4, C, jb)
+# define GAMMA_AB2(C) a(2, C, jb) MINUS a(3, C, jb)
+# define GAMMA_AF1(C) a(1, C, jf) MINUS a(4, C, jf)
+# define GAMMA_AF2(C) a(2, C, jf) PLUS a(3, C, jf)
+# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C
+# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C
+#endif
+
+#ifdef DIR_Z
+# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(3, C, jb))
+# define GAMMA_AB2(C) a(2, C, jb) MINUS i_times(a(4, C, jb))
+# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(3, C, jf))
+# define GAMMA_AF2(C) a(2, C, jf) PLUS i_times(a(4, C, jf))
+# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C)
+# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C)
+#endif
+
+# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C
+# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C
+
+# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C)
+
+#endif
+
+!-------------------------------------------------------------------------------
+subroutine NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, volh)
+
+ implicit none
+
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a
+ COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o
+ INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd
+ integer :: volh
+
+ integer :: i, jf, jb
+
+ COMPLEX :: ab1, ab2, af1, af2
+ COMPLEX :: bf1_1, bf2_1
+ COMPLEX :: bf1_2, bf2_2
+ COMPLEX :: bf1_3, bf2_3
+ COMPLEX :: bb1_1, bb2_1
+ COMPLEX :: bb1_2, bb2_2
+ COMPLEX :: bb1_3, bb2_3
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ TIMING_START(STRCAT(timing_bin_, NAME))
+
+ !$omp parallel do private(jf, jb, ab1, ab2, af1, af2, &
+ !$omp bf1_1, bf2_1, bf1_2, bf2_2, bf1_3, bf2_3, &
+ !$omp bb1_1, bb2_1, bb1_2, bb2_2, bb1_3, bb2_3)
+ do i = 1, volh
+ jb = nn_bwd(i)
+
+ ab1 = GAMMA_AB1(1)
+ ab2 = GAMMA_AB2(1)
+
+ bb1_1 = ab1 * conjg(u_o(1, 1, jb))
+ bb2_1 = ab2 * conjg(u_o(1, 1, jb))
+ bb1_2 = ab1 * conjg(u_o(1, 2, jb))
+ bb2_2 = ab2 * conjg(u_o(1, 2, jb))
+ bb1_3 = ab1 * conjg(u_o(1, 3, jb))
+ bb2_3 = ab2 * conjg(u_o(1, 3, jb))
+
+ jf = nn_fwd(i)
+
+ af1 = GAMMA_AF1(1)
+ af2 = GAMMA_AF2(1)
+
+ bf1_1 = af1 * u_e(1, 1, i)
+ bf2_1 = af2 * u_e(1, 1, i)
+ bf1_2 = af1 * u_e(2, 1, i)
+ bf2_2 = af2 * u_e(2, 1, i)
+ bf1_3 = af1 * u_e(3, 1, i)
+ bf2_3 = af2 * u_e(3, 1, i)
+
+ ab1 = GAMMA_AB1(2)
+ ab2 = GAMMA_AB2(2)
+
+ bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb))
+ bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb))
+ bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb))
+ bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb))
+ bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb))
+ bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb))
+
+ af1 = GAMMA_AF1(2)
+ af2 = GAMMA_AF2(2)
+
+ bf1_1 = bf1_1 + af1 * u_e(1, 2, i)
+ bf2_1 = bf2_1 + af2 * u_e(1, 2, i)
+ bf1_2 = bf1_2 + af1 * u_e(2, 2, i)
+ bf2_2 = bf2_2 + af2 * u_e(2, 2, i)
+ bf1_3 = bf1_3 + af1 * u_e(3, 2, i)
+ bf2_3 = bf2_3 + af2 * u_e(3, 2, i)
+
+ ab1 = GAMMA_AB1(3)
+ ab2 = GAMMA_AB2(3)
+
+ bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb))
+ bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb))
+ bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb))
+ bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb))
+ bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb))
+ bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb))
+
+ af1 = GAMMA_AF1(3)
+ af2 = GAMMA_AF2(3)
+
+ bf1_1 = bf1_1 + af1 * u_e(1, 3, i)
+ bf2_1 = bf2_1 + af2 * u_e(1, 3, i)
+ bf1_2 = bf1_2 + af1 * u_e(2, 3, i)
+ bf2_2 = bf2_2 + af2 * u_e(2, 3, i)
+ bf1_3 = bf1_3 + af1 * u_e(3, 3, i)
+ bf2_3 = bf2_3 + af2 * u_e(3, 3, i)
+
+
+ UPDATE_B(1, 1)
+ UPDATE_B(2, 1)
+ UPDATE_B(3, 1)
+ UPDATE_B(4, 1)
+
+ UPDATE_B(1, 2)
+ UPDATE_B(2, 2)
+ UPDATE_B(3, 2)
+ UPDATE_B(4, 2)
+
+ UPDATE_B(1, 3)
+ UPDATE_B(2, 3)
+ UPDATE_B(3, 3)
+ UPDATE_B(4, 3)
+
+ enddo
+
+ TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f2916b5d3d041ca90c915f51913ecbfb90c60b1f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3.F90
@@ -0,0 +1,116 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D3.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger)
+! (optimization for Hitachi SR8000: hybrid programming model,
+! MPI + OpenMP + overlapping communication and computation)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine NAME(ee, oo, out, in, u)
+
+! out := NAME in
+!
+! NAME = d or d_dag
+!
+! out is of type "e" = EVEN or ODD
+! in is of type "o" = ODD or EVEN
+
+ use module_nn
+ use module_vol
+ use module_thread
+ implicit none
+
+ integer :: ee, oo
+ SPINCOL_FIELD :: out, in
+ GAUGE_FIELD :: u
+
+ integer :: thread, i1, i2, omp_get_thread_num, e, o
+
+ TIMING_START(STRCAT(timing_bin_, NAME))
+
+ call xbound_fill_buffer_y(in)
+ call xbound_fill_buffer_z(in)
+
+ !$omp parallel private(thread, i1, i2, e, o)
+
+ thread = omp_get_thread_num()
+ e = ee
+ o = oo
+
+ !$omp barrier
+
+ i1 = xyz_start(thread)
+ i2 = xyz_end(thread)
+
+ if (thread == 0) then
+ TIMING_START(timing_bin_d_xf)
+ call xbound_copy_buffer_y(in)
+ else
+ call STRCAT(NAME, _xf)(out, in, u(1, 1, 1, e, 1), &
+ u(1, 1, 1, o, 1), &
+ nn(1, e, 1, FWD), &
+ nn(1, e, 1, BWD), i1, i2)
+ endif
+
+ !$omp barrier
+
+ if (thread == 0) then
+ TIMING_STOP(timing_bin_d_xf)
+ TIMING_START(timing_bin_d_yf)
+ call xbound_copy_buffer_z(in)
+ !!call xbound_d3(in, 3)
+ else
+ call STRCAT(NAME, _yf)(out, in, u(1, 1, 1, e, 2), &
+ u(1, 1, 1, o, 2), &
+ nn(1, e, 2, FWD), &
+ nn(1, e, 2, BWD), i1, i2)
+ endif
+
+ !$omp barrier
+
+ if (thread == 0) then
+ TIMING_STOP(timing_bin_d_yf)
+ TIMING_START(timing_bin_d_zf)
+ call xbound_d3(in, 4)
+ else
+ call STRCAT(NAME, _zf)(out, in, u(1, 1, 1, e, 3), &
+ u(1, 1, 1, o, 3), &
+ nn(1, e, 3, FWD), &
+ nn(1, e, 3, BWD), i1, i2)
+ endif
+
+ !$omp barrier
+
+#ifdef TIMING
+ if (thread == 0) then
+ TIMING_STOP(timing_bin_d_zf)
+ TIMING_START(timing_bin_d_t)
+ endif
+#endif
+
+ i1 = t_start(thread)
+ i2 = t_end(thread)
+
+ call STRCAT(NAME, _t )(out, in, u(1, 1, 1, e, 4), &
+ u(1, 1, 1, o, 4), &
+ nn(1, e, 4, FWD), &
+ nn(1, e, 4, BWD), i1, i2)
+
+ !$omp end parallel
+
+ TIMING_STOP(timing_bin_d_t)
+ TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31.F90
new file mode 100644
index 0000000000000000000000000000000000000000..81da092c7102ce3c7954fe6c420110c38cdc368a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31.F90
@@ -0,0 +1,104 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D3.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger)
+! (optimization for Hitachi SR8000: hybrid programming model,
+! MPI + OpenMP + overlapping communication and computation)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine NAME(ee, oo, out, in, u)
+
+! out := NAME in
+!
+! NAME = d or d_dag
+!
+! out is of type "e" = EVEN or ODD
+! in is of type "o" = ODD or EVEN
+
+ use module_nn
+ use module_vol
+ use module_thread
+ implicit none
+
+ integer :: ee, oo
+ SPINCOL_FIELD :: out, in
+ GAUGE_FIELD :: u
+
+ integer :: thread, i1, i2, omp_get_thread_num, e, o
+
+ TIMING_START(STRCAT(timing_bin_, NAME))
+
+ call xbound_fill_buffer_y(in)
+ call xbound_fill_buffer_z(in)
+
+ !$omp parallel private(thread, i1, i2, e, o)
+
+ thread = omp_get_thread_num()
+ e = ee
+ o = oo
+
+ !$omp barrier
+
+ i1 = xyz_start(thread)
+ i2 = xyz_end(thread)
+
+ if (thread == 0) then
+ TIMING_START(timing_bin_d_xf)
+ call xbound_copy_buffer_y(in)
+ else
+ call STRCAT(NAME, _switch_0)(e, o, out, in, u, i1, i2, 1)
+ endif
+
+ !$omp barrier
+
+ if (thread == 0) then
+ TIMING_STOP(timing_bin_d_xf)
+ TIMING_START(timing_bin_d_yf)
+ call xbound_copy_buffer_z(in)
+ !!call xbound_d3(in, 3)
+ else
+ call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 2)
+ endif
+
+ !$omp barrier
+
+ if (thread == 0) then
+ TIMING_STOP(timing_bin_d_yf)
+ TIMING_START(timing_bin_d_zf)
+ call xbound_d3(in, 4)
+ else
+ call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 3)
+ endif
+
+ !$omp barrier
+
+#ifdef TIMING
+ if (thread == 0) then
+ TIMING_STOP(timing_bin_d_zf)
+ TIMING_START(timing_bin_d_t)
+ endif
+#endif
+
+ i1 = t_start(thread)
+ i2 = t_end(thread)
+
+ call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 4)
+
+ !$omp end parallel
+
+ TIMING_STOP(timing_bin_d_t)
+ TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31_switch.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31_switch.F90
new file mode 100644
index 0000000000000000000000000000000000000000..bf61467401f0815012d1aac01c2ff3ee15e2fc79
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31_switch.F90
@@ -0,0 +1,59 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D3_switch.F90 - switch layer for arbitrary choice of "gamma_index"
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+#ifdef INIT
+# define THE_NAME(a) STRCAT3(NAME, a, _0)
+#else
+# define THE_NAME(a) STRCAT(NAME, a)
+#endif
+
+! NAME = d or d_dag
+!-------------------------------------------------------------------------------
+subroutine THE_NAME(_switch)(e, o, out, in, u, i1, i2, mu)
+
+ use module_lattice
+ use module_nn
+ use module_vol
+ implicit none
+
+ integer :: e, o
+ SPINCOL_FIELD :: out, in
+ GAUGE_FIELD :: u
+ integer :: i1, i2, mu
+
+ select case (gamma_index(mu))
+ case (1)
+ call THE_NAME(_xf)(out, in, u(1, 1, 1, e, 1), &
+ u(1, 1, 1, o, 1), &
+ nn(1, e, 1, FWD), &
+ nn(1, e, 1, BWD), i1, i2)
+ case (2)
+ call THE_NAME(_yf)(out, in, u(1, 1, 1, e, 2), &
+ u(1, 1, 1, o, 2), &
+ nn(1, e, 2, FWD), &
+ nn(1, e, 2, BWD), i1, i2)
+ case (3)
+ call THE_NAME(_zf)(out, in, u(1, 1, 1, e, 3), &
+ u(1, 1, 1, o, 3), &
+ nn(1, e, 3, FWD), &
+ nn(1, e, 3, BWD), i1, i2)
+ case (4)
+ call THE_NAME(_t )(out, in, u(1, 1, 1, e, 4), &
+ u(1, 1, 1, o, 4), &
+ nn(1, e, 4, FWD), &
+ nn(1, e, 4, BWD), i1, i2)
+ end select
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31xyzt.F90
new file mode 100644
index 0000000000000000000000000000000000000000..534277d9eb8aba4c5de9085dea78aafc7f572640
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31xyzt.F90
@@ -0,0 +1,213 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D3xyzt.F90 - routines needed in D3.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+#ifdef DIR_T
+
+#ifdef DAGGER
+# define GAMMA_AB1(C) a(3, C, jb)
+# define GAMMA_AB2(C) a(4, C, jb)
+# define GAMMA_AF1(C) a(1, C, jf)
+# define GAMMA_AF2(C) a(2, C, jf)
+# define GAMMA_B1(C) bf1_ ## C
+# define GAMMA_B2(C) bf2_ ## C
+# define GAMMA_B3(C) bb1_ ## C
+# define GAMMA_B4(C) bb2_ ## C
+#else
+# define GAMMA_AB1(C) a(1, C, jb)
+# define GAMMA_AB2(C) a(2, C, jb)
+# define GAMMA_AF1(C) a(3, C, jf)
+# define GAMMA_AF2(C) a(4, C, jf)
+# define GAMMA_B1(C) bb1_ ## C
+# define GAMMA_B2(C) bb2_ ## C
+# define GAMMA_B3(C) bf1_ ## C
+# define GAMMA_B4(C) bf2_ ## C
+#endif
+
+#ifdef INIT
+# define UPDATE_B(S, C) b(S, C, i) = TWO * GAMMA_B ## S ## (C)
+#else
+# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) + TWO * GAMMA_B ## S ## (C)
+#endif
+
+#else
+
+#ifdef DAGGER
+# define PLUS -
+# define MINUS +
+#else
+# define PLUS +
+# define MINUS -
+#endif
+
+#ifdef DIR_X
+# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(4, C, jb))
+# define GAMMA_AB2(C) a(2, C, jb) PLUS i_times(a(3, C, jb))
+# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(4, C, jf))
+# define GAMMA_AF2(C) a(2, C, jf) MINUS i_times(a(3, C, jf))
+# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C)
+# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C)
+#endif
+
+#ifdef DIR_Y
+# define GAMMA_AB1(C) a(1, C, jb) PLUS a(4, C, jb)
+# define GAMMA_AB2(C) a(2, C, jb) MINUS a(3, C, jb)
+# define GAMMA_AF1(C) a(1, C, jf) MINUS a(4, C, jf)
+# define GAMMA_AF2(C) a(2, C, jf) PLUS a(3, C, jf)
+# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C
+# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C
+#endif
+
+#ifdef DIR_Z
+# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(3, C, jb))
+# define GAMMA_AB2(C) a(2, C, jb) MINUS i_times(a(4, C, jb))
+# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(3, C, jf))
+# define GAMMA_AF2(C) a(2, C, jf) PLUS i_times(a(4, C, jf))
+# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C)
+# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C)
+#endif
+
+# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C
+# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C
+
+#ifdef INIT
+# define UPDATE_B(S, C) b(S, C, i) = GAMMA_B ## S ## (C)
+#else
+# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C)
+#endif
+
+#endif
+
+#ifdef INIT
+# define THE_NAME STRCAT(NAME, _0)
+#else
+# define THE_NAME NAME
+#endif
+
+!-------------------------------------------------------------------------------
+subroutine THE_NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, i1, i2)
+
+ implicit none
+
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a
+ COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o
+ INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd
+ integer :: i1, i2
+
+ integer :: i, jf, jb
+
+ COMPLEX :: ab1, ab2, af1, af2
+ COMPLEX :: bf1_1, bf2_1
+ COMPLEX :: bf1_2, bf2_2
+ COMPLEX :: bf1_3, bf2_3
+ COMPLEX :: bb1_1, bb2_1
+ COMPLEX :: bb1_2, bb2_2
+ COMPLEX :: bb1_3, bb2_3
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ !!TIMING_START(STRCAT(timing_bin_, NAME))
+
+ do i = i1, i2
+ jb = nn_bwd(i)
+
+ ab1 = GAMMA_AB1(1)
+ ab2 = GAMMA_AB2(1)
+
+ bb1_1 = ab1 * conjg(u_o(1, 1, jb))
+ bb2_1 = ab2 * conjg(u_o(1, 1, jb))
+ bb1_2 = ab1 * conjg(u_o(1, 2, jb))
+ bb2_2 = ab2 * conjg(u_o(1, 2, jb))
+ bb1_3 = ab1 * conjg(u_o(1, 3, jb))
+ bb2_3 = ab2 * conjg(u_o(1, 3, jb))
+
+ jf = nn_fwd(i)
+
+ af1 = GAMMA_AF1(1)
+ af2 = GAMMA_AF2(1)
+
+ bf1_1 = af1 * u_e(1, 1, i)
+ bf2_1 = af2 * u_e(1, 1, i)
+ bf1_2 = af1 * u_e(2, 1, i)
+ bf2_2 = af2 * u_e(2, 1, i)
+ bf1_3 = af1 * u_e(3, 1, i)
+ bf2_3 = af2 * u_e(3, 1, i)
+
+ ab1 = GAMMA_AB1(2)
+ ab2 = GAMMA_AB2(2)
+
+ bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb))
+ bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb))
+ bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb))
+ bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb))
+ bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb))
+ bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb))
+
+ af1 = GAMMA_AF1(2)
+ af2 = GAMMA_AF2(2)
+
+ bf1_1 = bf1_1 + af1 * u_e(1, 2, i)
+ bf2_1 = bf2_1 + af2 * u_e(1, 2, i)
+ bf1_2 = bf1_2 + af1 * u_e(2, 2, i)
+ bf2_2 = bf2_2 + af2 * u_e(2, 2, i)
+ bf1_3 = bf1_3 + af1 * u_e(3, 2, i)
+ bf2_3 = bf2_3 + af2 * u_e(3, 2, i)
+
+ ab1 = GAMMA_AB1(3)
+ ab2 = GAMMA_AB2(3)
+
+ bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb))
+ bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb))
+ bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb))
+ bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb))
+ bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb))
+ bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb))
+
+ af1 = GAMMA_AF1(3)
+ af2 = GAMMA_AF2(3)
+
+ bf1_1 = bf1_1 + af1 * u_e(1, 3, i)
+ bf2_1 = bf2_1 + af2 * u_e(1, 3, i)
+ bf1_2 = bf1_2 + af1 * u_e(2, 3, i)
+ bf2_2 = bf2_2 + af2 * u_e(2, 3, i)
+ bf1_3 = bf1_3 + af1 * u_e(3, 3, i)
+ bf2_3 = bf2_3 + af2 * u_e(3, 3, i)
+
+
+ UPDATE_B(1, 1)
+ UPDATE_B(2, 1)
+ UPDATE_B(3, 1)
+ UPDATE_B(4, 1)
+
+ UPDATE_B(1, 2)
+ UPDATE_B(2, 2)
+ UPDATE_B(3, 2)
+ UPDATE_B(4, 2)
+
+ UPDATE_B(1, 3)
+ UPDATE_B(2, 3)
+ UPDATE_B(3, 3)
+ UPDATE_B(4, 3)
+
+ enddo
+
+ !!TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3xyzt.F90
new file mode 100644
index 0000000000000000000000000000000000000000..391e3189644197b1eeb0d29c3db87ed3d60d579f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3xyzt.F90
@@ -0,0 +1,203 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D3xyzt.F90 - routines needed in D3.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+#ifdef DIR_T
+
+#ifdef DAGGER
+# define GAMMA_AB1(C) a(3, C, jb)
+# define GAMMA_AB2(C) a(4, C, jb)
+# define GAMMA_AF1(C) a(1, C, jf)
+# define GAMMA_AF2(C) a(2, C, jf)
+# define GAMMA_B1(C) bf1_ ## C
+# define GAMMA_B2(C) bf2_ ## C
+# define GAMMA_B3(C) bb1_ ## C
+# define GAMMA_B4(C) bb2_ ## C
+#else
+# define GAMMA_AB1(C) a(1, C, jb)
+# define GAMMA_AB2(C) a(2, C, jb)
+# define GAMMA_AF1(C) a(3, C, jf)
+# define GAMMA_AF2(C) a(4, C, jf)
+# define GAMMA_B1(C) bb1_ ## C
+# define GAMMA_B2(C) bb2_ ## C
+# define GAMMA_B3(C) bf1_ ## C
+# define GAMMA_B4(C) bf2_ ## C
+#endif
+
+# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) + TWO * GAMMA_B ## S ## (C)
+
+#else
+
+#ifdef DAGGER
+# define PLUS -
+# define MINUS +
+#else
+# define PLUS +
+# define MINUS -
+#endif
+
+#ifdef DIR_X
+# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(4, C, jb))
+# define GAMMA_AB2(C) a(2, C, jb) PLUS i_times(a(3, C, jb))
+# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(4, C, jf))
+# define GAMMA_AF2(C) a(2, C, jf) MINUS i_times(a(3, C, jf))
+# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C)
+# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C)
+
+# define UPDATE_B(S, C) b(S, C, i) = GAMMA_B ## S ## (C)
+#endif
+
+#ifdef DIR_Y
+# define GAMMA_AB1(C) a(1, C, jb) PLUS a(4, C, jb)
+# define GAMMA_AB2(C) a(2, C, jb) MINUS a(3, C, jb)
+# define GAMMA_AF1(C) a(1, C, jf) MINUS a(4, C, jf)
+# define GAMMA_AF2(C) a(2, C, jf) PLUS a(3, C, jf)
+# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C
+# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C
+
+# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C)
+#endif
+
+#ifdef DIR_Z
+# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(3, C, jb))
+# define GAMMA_AB2(C) a(2, C, jb) MINUS i_times(a(4, C, jb))
+# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(3, C, jf))
+# define GAMMA_AF2(C) a(2, C, jf) PLUS i_times(a(4, C, jf))
+# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C)
+# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C)
+
+# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C)
+#endif
+
+# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C
+# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C
+
+#endif
+
+!-------------------------------------------------------------------------------
+subroutine NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, i1, i2)
+
+ implicit none
+
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a
+ COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o
+ INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd
+ integer :: i1, i2
+
+ integer :: i, jf, jb
+
+ COMPLEX :: ab1, ab2, af1, af2
+ COMPLEX :: bf1_1, bf2_1
+ COMPLEX :: bf1_2, bf2_2
+ COMPLEX :: bf1_3, bf2_3
+ COMPLEX :: bb1_1, bb2_1
+ COMPLEX :: bb1_2, bb2_2
+ COMPLEX :: bb1_3, bb2_3
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ !!TIMING_START(STRCAT(timing_bin_, NAME))
+
+ do i = i1, i2
+ jb = nn_bwd(i)
+
+ ab1 = GAMMA_AB1(1)
+ ab2 = GAMMA_AB2(1)
+
+ bb1_1 = ab1 * conjg(u_o(1, 1, jb))
+ bb2_1 = ab2 * conjg(u_o(1, 1, jb))
+ bb1_2 = ab1 * conjg(u_o(1, 2, jb))
+ bb2_2 = ab2 * conjg(u_o(1, 2, jb))
+ bb1_3 = ab1 * conjg(u_o(1, 3, jb))
+ bb2_3 = ab2 * conjg(u_o(1, 3, jb))
+
+ jf = nn_fwd(i)
+
+ af1 = GAMMA_AF1(1)
+ af2 = GAMMA_AF2(1)
+
+ bf1_1 = af1 * u_e(1, 1, i)
+ bf2_1 = af2 * u_e(1, 1, i)
+ bf1_2 = af1 * u_e(2, 1, i)
+ bf2_2 = af2 * u_e(2, 1, i)
+ bf1_3 = af1 * u_e(3, 1, i)
+ bf2_3 = af2 * u_e(3, 1, i)
+
+ ab1 = GAMMA_AB1(2)
+ ab2 = GAMMA_AB2(2)
+
+ bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb))
+ bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb))
+ bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb))
+ bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb))
+ bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb))
+ bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb))
+
+ af1 = GAMMA_AF1(2)
+ af2 = GAMMA_AF2(2)
+
+ bf1_1 = bf1_1 + af1 * u_e(1, 2, i)
+ bf2_1 = bf2_1 + af2 * u_e(1, 2, i)
+ bf1_2 = bf1_2 + af1 * u_e(2, 2, i)
+ bf2_2 = bf2_2 + af2 * u_e(2, 2, i)
+ bf1_3 = bf1_3 + af1 * u_e(3, 2, i)
+ bf2_3 = bf2_3 + af2 * u_e(3, 2, i)
+
+ ab1 = GAMMA_AB1(3)
+ ab2 = GAMMA_AB2(3)
+
+ bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb))
+ bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb))
+ bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb))
+ bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb))
+ bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb))
+ bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb))
+
+ af1 = GAMMA_AF1(3)
+ af2 = GAMMA_AF2(3)
+
+ bf1_1 = bf1_1 + af1 * u_e(1, 3, i)
+ bf2_1 = bf2_1 + af2 * u_e(1, 3, i)
+ bf1_2 = bf1_2 + af1 * u_e(2, 3, i)
+ bf2_2 = bf2_2 + af2 * u_e(2, 3, i)
+ bf1_3 = bf1_3 + af1 * u_e(3, 3, i)
+ bf2_3 = bf2_3 + af2 * u_e(3, 3, i)
+
+
+ UPDATE_B(1, 1)
+ UPDATE_B(2, 1)
+ UPDATE_B(3, 1)
+ UPDATE_B(4, 1)
+
+ UPDATE_B(1, 2)
+ UPDATE_B(2, 2)
+ UPDATE_B(3, 2)
+ UPDATE_B(4, 2)
+
+ UPDATE_B(1, 3)
+ UPDATE_B(2, 3)
+ UPDATE_B(3, 3)
+ UPDATE_B(4, 3)
+
+ enddo
+
+ !!TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/DSFxyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DSFxyzt.F90
new file mode 100644
index 0000000000000000000000000000000000000000..23b0d6fb9600d4b9f4411d95e119babf591e61ea
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DSFxyzt.F90
@@ -0,0 +1,108 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! DSFxyzt.F90 - routines (for standard Wilson fermions) needed in dsf.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+#ifdef DIR_X
+# define GAMMA_A1(C) a(1, C, J) minus i_times(a(4, C, J))
+# define GAMMA_A2(C) a(2, C, J) minus i_times(a(3, C, J))
+# define GAMMA_A3(C) plus i_times(a2)
+# define GAMMA_A4(C) plus i_times(a1)
+#endif
+
+#ifdef DIR_Y
+# define GAMMA_A1(C) a(1, C, J) minus a(4, C, J)
+# define GAMMA_A2(C) a(2, C, J) plus a(3, C, J)
+# define GAMMA_A3(C) plus a2
+# define GAMMA_A4(C) minus a1
+#endif
+
+#ifdef DIR_Z
+# define GAMMA_A1(C) a(1, C, J) minus i_times(a(3, C, J))
+# define GAMMA_A2(C) a(2, C, J) plus i_times(a(4, C, J))
+# define GAMMA_A3(C) plus i_times(a1)
+# define GAMMA_A4(C) minus i_times(a2)
+#endif
+
+#ifdef DIR_T
+#ifdef FORWARD
+# define GAMMA_A1(C) ZERO
+# define GAMMA_A2(C) ZERO
+# define GAMMA_A3(C) TWO * a(3, C, J)
+# define GAMMA_A4(C) TWO * a(4, C, J)
+#else
+# define GAMMA_A1(C) TWO * a(1, C, J)
+# define GAMMA_A2(C) TWO * a(2, C, J)
+# define GAMMA_A3(C) ZERO
+# define GAMMA_A4(C) ZERO
+#endif
+#endif
+
+#ifdef FORWARD
+# define UU(R, A, B) uu(R, A, B)
+# define plus +
+# define minus -
+# define I i
+# define J j
+#else
+# define UU(R, A, B) uud(R, B, A)
+# define plus -
+# define minus +
+# define I j
+# define J i
+#endif
+
+!-------------------------------------------------------------------------------
+subroutine NAME(p, b, a, s, u, nn, volh)
+
+ implicit none
+
+ REAL, dimension(NGEN, *), intent(inout) :: p
+ REAL, intent(in) :: s
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: b, a
+ COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u
+ INTEGER, intent(in) :: nn(*)
+ integer :: volh
+
+ integer :: i, j, ca, cb
+ COMPLEX :: a1, a2, a3, a4
+ SU3 :: v, w
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+
+ !$omp parallel do private(j,ca,cb,a1,a2,a3,a4,w,v)
+ do i = 1, volh
+ j = nn(i)
+ do ca = 1, NCOL
+ a1 = GAMMA_A1(ca)
+ a2 = GAMMA_A2(ca)
+ a3 = GAMMA_A3(ca)
+ a4 = GAMMA_A4(ca)
+ do cb = 1, NCOL
+ w(ca, cb) = a1 * conjg(b(1, cb, I)) &
+ + a2 * conjg(b(2, cb, I)) &
+ + a3 * conjg(b(3, cb, I)) &
+ + a4 * conjg(b(4, cb, I))
+ enddo
+ enddo
+ call UU(v, u(1, 1, i), w)
+ call im_tr_j(p(1, i), v, minus s)
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/DVersion.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DVersion.F90
new file mode 100644
index 0000000000000000000000000000000000000000..7748291d4c1e1e9861b486f687425a94c53a4ef2
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DVersion.F90
@@ -0,0 +1,20 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! DVersion.F90
+!
+!-------------------------------------------------------------------------------
+
+integer function version_of_d()
+ implicit none
+ version_of_d = VERSION
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dt.F90
new file mode 100644
index 0000000000000000000000000000000000000000..28f6232e7ad1083562c3dca27fac0b0c49445739
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dt.F90
@@ -0,0 +1,154 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! Dt.F90 - routines needed in D.F90 (t-direction)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+#ifdef DAGGER
+# define GAMMA_A1(C) a(3, C, jb)
+# define GAMMA_A2(C) a(4, C, jb)
+# define GAMMA_A3(C) a(1, C, jf)
+# define GAMMA_A4(C) a(2, C, jf)
+# define GAMMA_B1(C) f1_ ## C
+# define GAMMA_B2(C) f2_ ## C
+# define GAMMA_B3(C) b1_ ## C
+# define GAMMA_B4(C) b2_ ## C
+#else
+# define GAMMA_A1(C) a(1, C, jb)
+# define GAMMA_A2(C) a(2, C, jb)
+# define GAMMA_A3(C) a(3, C, jf)
+# define GAMMA_A4(C) a(4, C, jf)
+# define GAMMA_B1(C) b1_ ## C
+# define GAMMA_B2(C) b2_ ## C
+# define GAMMA_B3(C) f1_ ## C
+# define GAMMA_B4(C) f2_ ## C
+#endif
+
+!-------------------------------------------------------------------------------
+subroutine NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, volh)
+
+ implicit none
+
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a
+ COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o
+ INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd
+ integer :: volh
+
+ integer :: i, jf, jb
+
+ COMPLEX :: a1, a2, a3, a4
+ COMPLEX :: f1_1, f2_1
+ COMPLEX :: f1_2, f2_2
+ COMPLEX :: f1_3, f2_3
+ COMPLEX :: b1_1, b2_1
+ COMPLEX :: b1_2, b2_2
+ COMPLEX :: b1_3, b2_3
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ TIMING_START(STRCAT(timing_bin_, NAME))
+
+ !$omp parallel do private(jf, jb, a1, a2, a3, a4, &
+ !$omp f1_1, f2_1, f1_2, f2_2, f1_3, f2_3, &
+ !$omp b1_1, b2_1, b1_2, b2_2, b1_3, b2_3)
+ do i = 1, volh
+ jb = nn_bwd(i)
+
+ a1 = GAMMA_A1(1)
+ a2 = GAMMA_A2(1)
+
+ b1_1 = a1 * conjg(u_o(1, 1, jb))
+ b2_1 = a2 * conjg(u_o(1, 1, jb))
+ b1_2 = a1 * conjg(u_o(1, 2, jb))
+ b2_2 = a2 * conjg(u_o(1, 2, jb))
+ b1_3 = a1 * conjg(u_o(1, 3, jb))
+ b2_3 = a2 * conjg(u_o(1, 3, jb))
+
+ jf = nn_fwd(i)
+
+ a3 = GAMMA_A3(1)
+ a4 = GAMMA_A4(1)
+
+ f1_1 = a3 * u_e(1, 1, i)
+ f2_1 = a4 * u_e(1, 1, i)
+ f1_2 = a3 * u_e(2, 1, i)
+ f2_2 = a4 * u_e(2, 1, i)
+ f1_3 = a3 * u_e(3, 1, i)
+ f2_3 = a4 * u_e(3, 1, i)
+
+ a1 = GAMMA_A1(2)
+ a2 = GAMMA_A2(2)
+
+ b1_1 = b1_1 + a1 * conjg(u_o(2, 1, jb))
+ b2_1 = b2_1 + a2 * conjg(u_o(2, 1, jb))
+ b1_2 = b1_2 + a1 * conjg(u_o(2, 2, jb))
+ b2_2 = b2_2 + a2 * conjg(u_o(2, 2, jb))
+ b1_3 = b1_3 + a1 * conjg(u_o(2, 3, jb))
+ b2_3 = b2_3 + a2 * conjg(u_o(2, 3, jb))
+
+ a3 = GAMMA_A3(2)
+ a4 = GAMMA_A4(2)
+
+ f1_1 = f1_1 + a3 * u_e(1, 2, i)
+ f2_1 = f2_1 + a4 * u_e(1, 2, i)
+ f1_2 = f1_2 + a3 * u_e(2, 2, i)
+ f2_2 = f2_2 + a4 * u_e(2, 2, i)
+ f1_3 = f1_3 + a3 * u_e(3, 2, i)
+ f2_3 = f2_3 + a4 * u_e(3, 2, i)
+
+ a1 = GAMMA_A1(3)
+ a2 = GAMMA_A2(3)
+
+ b1_1 = b1_1 + a1 * conjg(u_o(3, 1, jb))
+ b2_1 = b2_1 + a2 * conjg(u_o(3, 1, jb))
+ b1_2 = b1_2 + a1 * conjg(u_o(3, 2, jb))
+ b2_2 = b2_2 + a2 * conjg(u_o(3, 2, jb))
+ b1_3 = b1_3 + a1 * conjg(u_o(3, 3, jb))
+ b2_3 = b2_3 + a2 * conjg(u_o(3, 3, jb))
+
+ a3 = GAMMA_A3(3)
+ a4 = GAMMA_A4(3)
+
+ f1_1 = f1_1 + a3 * u_e(1, 3, i)
+ f2_1 = f2_1 + a4 * u_e(1, 3, i)
+ f1_2 = f1_2 + a3 * u_e(2, 3, i)
+ f2_2 = f2_2 + a4 * u_e(2, 3, i)
+ f1_3 = f1_3 + a3 * u_e(3, 3, i)
+ f2_3 = f2_3 + a4 * u_e(3, 3, i)
+
+
+ b(1, 1, i) = TWO * GAMMA_B1(1)
+ b(2, 1, i) = TWO * GAMMA_B2(1)
+ b(3, 1, i) = TWO * GAMMA_B3(1)
+ b(4, 1, i) = TWO * GAMMA_B4(1)
+
+ b(1, 2, i) = TWO * GAMMA_B1(2)
+ b(2, 2, i) = TWO * GAMMA_B2(2)
+ b(3, 2, i) = TWO * GAMMA_B3(2)
+ b(4, 2, i) = TWO * GAMMA_B4(2)
+
+ b(1, 3, i) = TWO * GAMMA_B1(3)
+ b(2, 3, i) = TWO * GAMMA_B2(3)
+ b(3, 3, i) = TWO * GAMMA_B3(3)
+ b(4, 3, i) = TWO * GAMMA_B4(3)
+
+ enddo
+
+ TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dxyz.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dxyz.F90
new file mode 100644
index 0000000000000000000000000000000000000000..a60d023a9327832b816e1a5eb2b0b7f6b282acd7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dxyz.F90
@@ -0,0 +1,137 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! Dxyz.F90 - routines needed in D.F90 (x/y/z-directions)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+#ifdef DIR_X
+# define GAMMA_A1(C) a(1, C, j) minus i_times(a(4, C, j))
+# define GAMMA_A2(C) a(2, C, j) minus i_times(a(3, C, j))
+# define GAMMA_B1(C) b(3, C, i) plus i_times(b2_ ## C)
+# define GAMMA_B2(C) b(4, C, i) plus i_times(b1_ ## C)
+#endif
+
+#ifdef DIR_Y
+# define GAMMA_A1(C) a(1, C, j) minus a(4, C, j)
+# define GAMMA_A2(C) a(2, C, j) plus a(3, C, j)
+# define GAMMA_B1(C) b(3, C, i) plus b2_ ## C
+# define GAMMA_B2(C) b(4, C, i) minus b1_ ## C
+#endif
+
+#ifdef DIR_Z
+# define GAMMA_A1(C) a(1, C, j) minus i_times(a(3, C, j))
+# define GAMMA_A2(C) a(2, C, j) plus i_times(a(4, C, j))
+# define GAMMA_B1(C) b(3, C, i) plus i_times(b1_ ## C)
+# define GAMMA_B2(C) b(4, C, i) minus i_times(b2_ ## C)
+#endif
+
+#ifdef FORWARD
+# define U(A, B) u(A, B, i)
+# define minus MINUS
+# define plus PLUS
+#else
+# define U(A, B) conjg(u(B, A, j))
+# define minus PLUS
+# define plus MINUS
+#endif
+
+#ifdef DAGGER
+# define PLUS -
+# define MINUS +
+#else
+# define PLUS +
+# define MINUS -
+#endif
+
+!-------------------------------------------------------------------------------
+subroutine NAME(b, a, u, nn, volh)
+
+ implicit none
+
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b
+ COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a
+ COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u
+ INTEGER, dimension (*), intent(in) :: nn
+ integer :: volh
+
+ integer :: i, j
+
+ COMPLEX :: a1, a2
+ COMPLEX :: b1_1, b2_1
+ COMPLEX :: b1_2, b2_2
+ COMPLEX :: b1_3, b2_3
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ TIMING_START(STRCAT(timing_bin_, NAME))
+
+ !$omp parallel do private(j, a1, a2, b1_1, b2_1, b1_2, b2_2, b1_3, b2_3)
+ do i = 1, volh
+ j = nn(i)
+
+ a1 = GAMMA_A1(1)
+ a2 = GAMMA_A2(1)
+
+ b1_1 = a1 * U(1, 1)
+ b2_1 = a2 * U(1, 1)
+ b1_2 = a1 * U(2, 1)
+ b2_2 = a2 * U(2, 1)
+ b1_3 = a1 * U(3, 1)
+ b2_3 = a2 * U(3, 1)
+
+ a1 = GAMMA_A1(2)
+ a2 = GAMMA_A2(2)
+
+ b1_1 = b1_1 + a1 * U(1, 2)
+ b2_1 = b2_1 + a2 * U(1, 2)
+ b1_2 = b1_2 + a1 * U(2, 2)
+ b2_2 = b2_2 + a2 * U(2, 2)
+ b1_3 = b1_3 + a1 * U(3, 2)
+ b2_3 = b2_3 + a2 * U(3, 2)
+
+ a1 = GAMMA_A1(3)
+ a2 = GAMMA_A2(3)
+
+ b1_1 = b1_1 + a1 * U(1, 3)
+ b2_1 = b2_1 + a2 * U(1, 3)
+
+ b(1, 1, i) = b(1, 1, i) + b1_1
+ b(2, 1, i) = b(2, 1, i) + b2_1
+ b(3, 1, i) = GAMMA_B1(1)
+ b(4, 1, i) = GAMMA_B2(1)
+
+ b1_2 = b1_2 + a1 * U(2, 3)
+ b2_2 = b2_2 + a2 * U(2, 3)
+
+ b(1, 2, i) = b(1, 2, i) + b1_2
+ b(2, 2, i) = b(2, 2, i) + b2_2
+ b(3, 2, i) = GAMMA_B1(2)
+ b(4, 2, i) = GAMMA_B2(2)
+
+ b1_3 = b1_3 + a1 * U(3, 3)
+ b2_3 = b2_3 + a2 * U(3, 3)
+
+ b(1, 3, i) = b(1, 3, i) + b1_3
+ b(2, 3, i) = b(2, 3, i) + b2_3
+ b(3, 3, i) = GAMMA_B1(3)
+ b(4, 3, i) = GAMMA_B2(3)
+
+ enddo
+
+ TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..64a19ab5e5d6b7c1cd34db7eab15e53845cb71be
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Makefile
@@ -0,0 +1,335 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# d/Makefile
+#
+#===============================================================================
+
+include ../Makefile.defs
+
+fpp = $(FPP) -I.. $(FPPFLAGS)
+
+MODULES_DIR = ../modules
+
+.SUFFIXES:
+.SUFFIXES: .a .o .f90 .F90
+
+.f90.o:
+ $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $<
+
+OBJS_D = \
+ d.o \
+ d_t.o \
+ d_zb.o \
+ d_zf.o \
+ d_yb.o \
+ d_yf.o \
+ d_xb.o \
+ d_xf.o \
+ d_dag.o \
+ d_dag_t.o \
+ d_dag_zb.o \
+ d_dag_zf.o \
+ d_dag_yb.o \
+ d_dag_yf.o \
+ d_dag_xb.o \
+ d_dag_xf.o \
+ d_version.o
+
+OBJS_D2 = \
+ d2.o \
+ d2_t.o \
+ d2_zf.o \
+ d2_yf.o \
+ d2_xf.o \
+ d2_dag.o \
+ d2_dag_t.o \
+ d2_dag_zf.o \
+ d2_dag_yf.o \
+ d2_dag_xf.o \
+ d2_version.o
+
+OBJS_D21 = \
+ d21.o \
+ d21_t.o \
+ d21_zf.o \
+ d21_yf.o \
+ d21_xf.o \
+ d21_dag.o \
+ d21_dag_t.o \
+ d21_dag_zf.o \
+ d21_dag_yf.o \
+ d21_dag_xf.o \
+ d21_version.o
+
+OBJS_D3 = \
+ d3.o \
+ d3_t.o \
+ d3_zf.o \
+ d3_yf.o \
+ d3_xf.o \
+ d3_dag.o \
+ d3_dag_t.o \
+ d3_dag_zf.o \
+ d3_dag_yf.o \
+ d3_dag_xf.o \
+ d3_version.o
+
+OBJS_D31 = \
+ d31.o \
+ d31_switch.o \
+ d31_switch_0.o \
+ d31_t.o \
+ d31_t_0.o \
+ d31_zf.o \
+ d31_zf_0.o \
+ d31_yf.o \
+ d31_yf_0.o \
+ d31_xf.o \
+ d31_xf_0.o \
+ d31_dag.o \
+ d31_dag_switch.o \
+ d31_dag_switch_0.o \
+ d31_dag_t.o \
+ d31_dag_t_0.o \
+ d31_dag_zf.o \
+ d31_dag_zf_0.o \
+ d31_dag_yf.o \
+ d31_dag_yf_0.o \
+ d31_dag_xf.o \
+ d31_dag_xf_0.o \
+ d31_version.o
+
+OBJS_DSF = \
+ dsf_xf.o \
+ dsf_xb.o \
+ dsf_yf.o \
+ dsf_yb.o \
+ dsf_zf.o \
+ dsf_zb.o \
+ dsf_tf.o \
+ dsf_tb.o
+
+
+#-------------------------------------------------------------------------------
+$(LIBD):
+
+libd.a: $(OBJS_D) $(OBJS_DSF)
+ $(AR) $(ARFLAGS) $@ $(OBJS_D) $(OBJS_DSF)
+ $(RANLIB) $@
+
+libd2.a: $(OBJS_D2) $(OBJS_DSF)
+ $(AR) $(ARFLAGS) $@ $(OBJS_D2) $(OBJS_DSF)
+ $(RANLIB) $@
+
+libd21.a: $(OBJS_D21) $(OBJS_DSF)
+ $(AR) $(ARFLAGS) $@ $(OBJS_D21) $(OBJS_DSF)
+ $(RANLIB) $@
+
+libd3.a: $(OBJS_D3) $(OBJS_DSF)
+ $(AR) $(ARFLAGS) $@ $(OBJS_D3) $(OBJS_DSF)
+ $(RANLIB) $@
+
+libd31.a: $(OBJS_D31) $(OBJS_DSF)
+ $(AR) $(ARFLAGS) $@ $(OBJS_D31) $(OBJS_DSF)
+ $(RANLIB) $@
+
+fast:
+ $(FAST_MAKE)
+
+#-------------------------------------------------------------------------------
+d.f90: D.F90 $(DEPENDENCIES_D)
+ $(fpp) -DNAME=d D.F90 > $@
+
+d_dag.f90: D.F90 $(DEPENDENCIES_D)
+ $(fpp) -DNAME=d_dag D.F90 > $@
+
+d_xf.f90: Dxyz.F90
+ $(fpp) -DNAME=d_xf -DDIR_X -DFORWARD -UDAGGER Dxyz.F90 > $@
+
+d_yf.f90: Dxyz.F90
+ $(fpp) -DNAME=d_yf -DDIR_Y -DFORWARD -UDAGGER Dxyz.F90 > $@
+
+d_zf.f90: Dxyz.F90
+ $(fpp) -DNAME=d_zf -DDIR_Z -DFORWARD -UDAGGER Dxyz.F90 > $@
+
+d_dag_xf.f90: Dxyz.F90
+ $(fpp) -DNAME=d_dag_xf -DDIR_X -DFORWARD -DDAGGER Dxyz.F90 > $@
+
+d_dag_yf.f90: Dxyz.F90
+ $(fpp) -DNAME=d_dag_yf -DDIR_Y -DFORWARD -DDAGGER Dxyz.F90 > $@
+
+d_dag_zf.f90: Dxyz.F90
+ $(fpp) -DNAME=d_dag_zf -DDIR_Z -DFORWARD -DDAGGER Dxyz.F90 > $@
+
+d_xb.f90: Dxyz.F90
+ $(fpp) -DNAME=d_xb -DDIR_X -UFORWARD -UDAGGER Dxyz.F90 > $@
+
+d_yb.f90: Dxyz.F90
+ $(fpp) -DNAME=d_yb -DDIR_Y -UFORWARD -UDAGGER Dxyz.F90 > $@
+
+d_zb.f90: Dxyz.F90
+ $(fpp) -DNAME=d_zb -DDIR_Z -UFORWARD -UDAGGER Dxyz.F90 > $@
+
+d_dag_xb.f90: Dxyz.F90
+ $(fpp) -DNAME=d_dag_xb -DDIR_X -UFORWARD -DDAGGER Dxyz.F90 > $@
+
+d_dag_yb.f90: Dxyz.F90
+ $(fpp) -DNAME=d_dag_yb -DDIR_Y -UFORWARD -DDAGGER Dxyz.F90 > $@
+
+d_dag_zb.f90: Dxyz.F90
+ $(fpp) -DNAME=d_dag_zb -DDIR_Z -UFORWARD -DDAGGER Dxyz.F90 > $@
+
+d_t.f90: Dt.F90
+ $(fpp) -DNAME=d_t -UDAGGER Dt.F90 > $@
+
+d_dag_t.f90: Dt.F90
+ $(fpp) -DNAME=d_dag_t -DDAGGER Dt.F90 > $@
+
+d_version.f90: DVersion.F90
+ $(fpp) -DVERSION=1 DVersion.F90 > $@
+
+
+#-------------------------------------------------------------------------------
+d2.f90: D2.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d D2.F90 > $@
+
+d2_dag.f90: D2.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag D2.F90 > $@
+
+d2_xf.f90: D2xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER D2xyzt.F90 > $@
+
+d2_yf.f90: D2xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER D2xyzt.F90 > $@
+
+d2_zf.f90: D2xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER D2xyzt.F90 > $@
+
+d2_dag_xf.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER D2xyzt.F90 > $@
+
+d2_dag_yf.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER D2xyzt.F90 > $@
+
+d2_dag_zf.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER D2xyzt.F90 > $@
+
+d2_t.f90: D2xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER D2xyzt.F90 > $@
+
+d2_dag_t.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER D2xyzt.F90 > $@
+
+d2_version.f90: DVersion.F90; $(fpp) -DVERSION=2 DVersion.F90 > $@
+
+
+#-------------------------------------------------------------------------------
+d21.f90: D21.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d -UDAGGER D21.F90 > $@
+
+d21_dag.f90: D21.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag -DDAGGER D21.F90 > $@
+
+d21_xf.f90: D21xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER D21xyzt.F90 > $@
+
+d21_yf.f90: D21xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER D21xyzt.F90 > $@
+
+d21_zf.f90: D21xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER D21xyzt.F90 > $@
+
+d21_dag_xf.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER D21xyzt.F90 > $@
+
+d21_dag_yf.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER D21xyzt.F90 > $@
+
+d21_dag_zf.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER D21xyzt.F90 > $@
+
+d21_t.f90: D21xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER D21xyzt.F90 > $@
+
+d21_dag_t.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER D21xyzt.F90 > $@
+
+d21_version.f90: DVersion.F90; $(fpp) -DVERSION=21 DVersion.F90 > $@
+
+
+#-------------------------------------------------------------------------------
+d3.f90: D3.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d D3.F90 > $@
+
+d3_dag.f90: D3.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag D3.F90 > $@
+
+d3_xf.f90: D3xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER D3xyzt.F90 > $@
+
+d3_yf.f90: D3xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER D3xyzt.F90 > $@
+
+d3_zf.f90: D3xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER D3xyzt.F90 > $@
+
+d3_dag_xf.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER D3xyzt.F90 > $@
+
+d3_dag_yf.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER D3xyzt.F90 > $@
+
+d3_dag_zf.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER D3xyzt.F90 > $@
+
+d3_t.f90: D3xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER D3xyzt.F90 > $@
+
+d3_dag_t.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER D3xyzt.F90 > $@
+
+d3_version.f90: DVersion.F90; $(fpp) -DVERSION=3 DVersion.F90 > $@
+
+
+#-------------------------------------------------------------------------------
+d31.f90: D31.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d D31.F90 > $@
+
+d31_dag.f90: D31.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag D31.F90 > $@
+
+d31_xf.f90: D31xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER -UINIT D31xyzt.F90 > $@
+d31_xf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER -DINIT D31xyzt.F90 > $@
+
+d31_yf.f90: D31xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER -UINIT D31xyzt.F90 > $@
+d31_yf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER -DINIT D31xyzt.F90 > $@
+
+d31_zf.f90: D31xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER -UINIT D31xyzt.F90 > $@
+d31_zf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER -DINIT D31xyzt.F90 > $@
+
+d31_dag_xf.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER -UINIT D31xyzt.F90 > $@
+d31_dag_xf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER -DINIT D31xyzt.F90 > $@
+
+d31_dag_yf.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER -UINIT D31xyzt.F90 > $@
+d31_dag_yf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER -DINIT D31xyzt.F90 > $@
+
+d31_dag_zf.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER -UINIT D31xyzt.F90 > $@
+d31_dag_zf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER -DINIT D31xyzt.F90 > $@
+
+d31_t.f90: D31xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER -UINIT D31xyzt.F90 > $@
+d31_t_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER -DINIT D31xyzt.F90 > $@
+
+d31_dag_t.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER -UINIT D31xyzt.F90 > $@
+d31_dag_t_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER -DINIT D31xyzt.F90 > $@
+
+d31_version.f90: DVersion.F90; $(fpp) -DVERSION=31 DVersion.F90 > $@
+
+d31_switch.f90: D31_switch.F90; $(fpp) -DNAME=d -UINIT D31_switch.F90 > $@
+d31_switch_0.f90: D31_switch.F90; $(fpp) -DNAME=d -DINIT D31_switch.F90 > $@
+
+d31_dag_switch.f90: D31_switch.F90; $(fpp) -DNAME=d_dag -UINIT D31_switch.F90 > $@
+d31_dag_switch_0.f90: D31_switch.F90; $(fpp) -DNAME=d_dag -DINIT D31_switch.F90 > $@
+
+
+#-------------------------------------------------------------------------------
+dsf.f90: dsf.F90 $(DEPENDENCIES_DSF); $(fpp) dsf.F90 > $@
+
+dsf_xf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_xf -DDIR_X -DFORWARD DSFxyzt.F90 > $@
+
+dsf_yf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_yf -DDIR_Y -DFORWARD DSFxyzt.F90 > $@
+
+dsf_zf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_zf -DDIR_Z -DFORWARD DSFxyzt.F90 > $@
+
+dsf_tf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_tf -DDIR_T -DFORWARD DSFxyzt.F90 > $@
+
+dsf_xb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_xb -DDIR_X -UFORWARD DSFxyzt.F90 > $@
+
+dsf_yb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_yb -DDIR_Y -UFORWARD DSFxyzt.F90 > $@
+
+dsf_zb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_zb -DDIR_Z -UFORWARD DSFxyzt.F90 > $@
+
+dsf_tb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_tb -DDIR_T -UFORWARD DSFxyzt.F90 > $@
+
+
+#-------------------------------------------------------------------------------
+clean:
+ rm -f *.[Tiod] *.f90 *.mod work.pc work.pcl
+
+clobber: clean
+ rm -f libd.a libd2.a libd21.a libd3.a libd31.a
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/d/bqcd.pcl
new file mode 100644
index 0000000000000000000000000000000000000000..906244500b31700684482c3dcfd32f6cec4279db
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/bqcd.pcl
@@ -0,0 +1,2 @@
+work.pc
+../modules/work.pc
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/defs.h b/qcd/part_cpu/applications/QCD/src/kernel_A/defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..05654d6e3e338d85e7aa8fc50ccbf893b7e1d222
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/defs.h
@@ -0,0 +1,213 @@
+#ifndef BQCD_DEFS_H
+#define BQCD_DEFS_H
+
+# define MAX_TEMPER 50
+
+# define RKIND 8
+
+# define BQCD_REAL mpi_real8
+
+# define BQCD_CHECK_SUM mpi_integer8
+# define BQCD_SEED mpi_integer8
+
+# define CHECK_SUM integer(8)
+# define SEED integer(8)
+# define SECONDS real(8)
+# define COMM_METHOD character(40)
+
+#ifdef INTEL
+# define RECL_UNIT 4
+#else
+# define RECL_UNIT 1
+#endif
+
+# define DIM 4
+# define NCOL 3
+# define NDIRAC 4
+# define NGEN 8
+# define EVEN 0
+# define ODD 1
+# define FWD 0
+# define BWD 1
+
+# define SIZE_COMPLEX 2
+
+# define REAL real(RKIND)
+# define INTEGER integer(4)
+# define COMPLEX complex(RKIND)
+
+# define SU3 COMPLEX, dimension (NCOL, NCOL)
+# define GENERATOR REAL, dimension (NGEN)
+
+# define GAUGE_FIELD_IO COMPLEX, dimension(NCOL, NCOL-1, DIM, 0:NX-1, 0:NY-1, 0:NZ-1, 0:NT-1)
+# define SPINCOL_FIELD_IO COMPLEX, dimension(NDIRAC, NCOL, 0:NXH-1, 0:NY-1, 0:NZ-1, 0:NT-1)
+
+# define SU3_FIELD COMPLEX, dimension (NCOL, NCOL, volh_tot)
+# define GAUGE_FIELD COMPLEX, dimension (NCOL, NCOL, volh_tot, EVEN:ODD, DIM)
+# define GENERATOR_FIELD REAL, dimension (NGEN, volh_tot, EVEN:ODD, DIM)
+# define SPINCOL_FIELD COMPLEX, dimension (NDIRAC, NCOL, volh_tot)
+# define SC2_FIELD COMPLEX, dimension(2, NCOL, volh_tot, DIM, FWD:BWD)
+# define CLOVER_FIELD_A type(type_clover_a), dimension(2, volh, EVEN:ODD)
+# define CLOVER_FIELD_B type(type_clover_b), dimension(2, volh, EVEN:ODD)
+# define CLOVER_FIELD_C COMPLEX, dimension(NDIRAC, NCOL, NDIRAC, NCOL, volh)
+
+# define P_GAUGE_FIELD COMPLEX, dimension(:, :, :, :, :), pointer
+# define P_GAUGE_FIELD_IO COMPLEX, dimension(:, :, :, :, :, :, :), pointer
+# define P_GENERATOR_FIELD REAL, dimension(:, :, :, :), pointer
+# define P_SPINCOL_FIELD COMPLEX, dimension(:, :, :), pointer
+# define P_SPINCOL_FIELD_IO COMPLEX, dimension(:, :, :, :, :, :), pointer
+# define P_SC2_FIELD COMPLEX, dimension(:, :, :, :, :), pointer
+# define P_CLOVER_FIELD_A type(type_clover_a), dimension(:, :, :), pointer
+# define P_CLOVER_FIELD_B type(type_clover_b), dimension(:, :, :), pointer
+
+# define SPINCOL_OVERINDEXED REAL, dimension(SIZE_COMPLEX*NDIRAC*NCOL*volh_tot)
+# define P_SPINCOL_OVERINDEXED REAL, dimension(:), pointer
+
+# define FILENAME character(len=80)
+# define FILENAME_FORMAT character(len=80)
+
+# define Re(z) real(z)
+# define Im(z) aimag(z)
+
+# define CAT(A, B) A ## B
+# define STRCAT(A, B) CAT(A, B)
+# define STRCAT3(A, B, C) STRCAT(STRCAT(A, B), C)
+
+# define PI STRCAT(3.1415926535897931_, RKIND)
+# define TWOPI STRCAT(6.2831853071795862_, RKIND)
+# define SQRT3 STRCAT(1.7320508075688772_, RKIND)
+
+# define ZERO STRCAT(0.0_, RKIND)
+# define ONE STRCAT(1.0_, RKIND)
+# define TWO STRCAT(2.0_, RKIND)
+# define THREE STRCAT(3.0_, RKIND)
+# define FOUR STRCAT(4.0_, RKIND)
+# define SIX STRCAT(6.0_, RKIND)
+# define EIGHT STRCAT(8.0_, RKIND)
+
+# define HALF STRCAT(0.5_, RKIND)
+# define EIGHTH STRCAT(0.125_, RKIND)
+
+# define timing_bin_d_xf 1
+# define timing_bin_d_xb 2
+# define timing_bin_d_yf 3
+# define timing_bin_d_yb 4
+# define timing_bin_d_zf 5
+# define timing_bin_d_zb 6
+# define timing_bin_d_t 7
+# define timing_bin_d 8
+# define timing_bin_mtdagmt 9
+# define timing_bin_global_sum 10
+# define timing_bin_global_sum_vec 11
+# define timing_bin_sc_zero 12
+# define timing_bin_sc_copy 13
+# define timing_bin_sc_scale 14
+# define timing_bin_sc_norm2 15
+# define timing_bin_sc_dot 16
+# define timing_bin_sc_axpy 17
+# define timing_bin_sc_xpby 18
+# define timing_bin_sc_axpby 19
+# define timing_bin_sc_cdotc 20
+# define timing_bin_sc_caxpy 21
+# define timing_bin_sc_caxpy2 22
+# define timing_bin_sc_cax2 23
+# define timing_bin_cg 24
+# define timing_bin_hmc_init_p 25
+# define timing_bin_hmc_u 26
+# define timing_bin_dsg 27
+# define timing_bin_dsf 28
+# define timing_bin_clover_init 29
+# define timing_bin_clover_mult_a 30
+# define timing_bin_clover_mult_ao 31
+# define timing_bin_clover_mult_b 32
+# define timing_bin_clover_dsd 33
+# define timing_bin_clover_dsf 34
+# define timing_bin_hmc 35
+# define timing_bin_plaq 36
+# define timing_bin_cooling 37
+# define timing_bin_u_read 38
+# define timing_bin_u_write 39
+# define timing_bin_total 40
+
+# define timing_bin_hmc_init 41
+# define timing_bin_hmc_momenta 42
+# define timing_bin_hmc_init_phi 43
+# define timing_bin_hmc_h_old 44
+# define timing_bin_hmc_backup 45
+# define timing_bin_hmc_half_step0 46
+# define timing_bin_hmc_half_step1 47
+# define timing_bin_hmc_xbound_g 48
+# define timing_bin_hmc_steps 49
+# define timing_bin_hmc_h_new 50
+# define timing_bin_hmc_rest 51
+
+# define timing_bin_h_mult_a 52
+# define timing_bin_h_mult_b 53
+# define timing_bin_h_mult_c 54
+
+# define timing_bin_sc2_projection 55
+
+# define timing_bin_d_dag_xf timing_bin_d_xf
+# define timing_bin_d_dag_xb timing_bin_d_xb
+# define timing_bin_d_dag_yf timing_bin_d_yf
+# define timing_bin_d_dag_yb timing_bin_d_yb
+# define timing_bin_d_dag_zf timing_bin_d_zf
+# define timing_bin_d_dag_zb timing_bin_d_zb
+# define timing_bin_d_dag_t timing_bin_d_t
+# define timing_bin_d_dag timing_bin_d
+
+#ifdef TIMING
+
+# define TIMING_START(bin) call timing_start(bin)
+# define TIMING_STOP(bin) call timing_stop(bin)
+# define TIMING_WRITE(unit) call timing_write(unit)
+
+#else
+
+# define TIMING_START(bin)
+# define TIMING_STOP(bin)
+# define TIMING_WRITE(unit)
+
+#endif
+
+
+# define STDERR 0
+# define UINPUT 1
+# define UCONF 2
+# define URAN 3
+# define UCOUNT 4
+# define UREC 6
+# define UINFO 7
+# define ULIST 8
+# define UDIAG 99
+
+# define START_HOT 0
+# define START_COLD 1
+# define START_CONT 2
+# define START_FILE 3
+
+# define SWAP_DOWN -1
+# define SWAP_RANDOM 0
+# define SWAP_UP 1
+
+# define HMC_TEST_FORWARDS 1
+# define HMC_TEST_NONE 0
+# define HMC_TEST_BACKWARDS -1
+
+# define PUTSTR(unit, str) if (my_pe() == 0) write(unit,*) str
+# define PUTVAL(unit, val) if (my_pe() == 0) write(unit,*) #val, ": ", val
+
+# define DIAGSTR(str) write(UDIAG,*) str
+# define DIAGVAL(val) write(UDIAG,*) #val, ": ", val
+
+# define ALLOCATE_G_FIELD(x) if (.not. associated(x)) call allocate_g_field(x)
+# define ALLOCATE_G_FIELD_IO(x) if (.not. associated(x)) call allocate_g_field_io(x)
+# define ALLOCATE_GEN_FIELD(x) if (.not. associated(x)) call allocate_gen_field(x)
+# define ALLOCATE_SC_FIELD(x) if (.not. associated(x)) call allocate_sc_field(x)
+# define ALLOCATE_SC_FIELD_IO(x) if (.not. associated(x)) call allocate_sc_field_io(x)
+# define ALLOCATE_SC_OVERINDEXED(x) if (.not. associated(x)) call allocate_sc_overindexed(x)
+# define ALLOCATE_SC2_FIELD(x) if (.not. associated(x)) call allocate_sc2_field(x)
+
+# define ASSERT(condition) if (.not. (condition)) call assertion_failed(__FILE__, __LINE__, #condition)
+
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsd.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsd.F90
new file mode 100644
index 0000000000000000000000000000000000000000..0a5b9069a3cc78918cbbc1e6307cf0a25f94731c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsd.F90
@@ -0,0 +1,40 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dsd.F90 --- p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_det
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine dsd(p, conf, step, para)
+
+ use typedef_hmc
+ use module_hmc_forces
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+ GENERATOR_FIELD, intent(inout) :: p
+ REAL, intent(in) :: step
+ REAL :: s
+
+ s = -step * TWO * (para%csw_kappa / EIGHT)
+
+ if (s /= ZERO) then
+ call hmc_forces_old(p)
+ call clover_dsd(ODD, p, conf%b, s, conf%u)
+ call hmc_forces_new(p, step, i_sd)
+ endif
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsf.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f06ff31df11deaa345d7c5a8895e3eb1fa9f8378
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf.F90
@@ -0,0 +1,101 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dsf.F90 - kernel of: p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_{f 1|2}
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine dsf(p, conf, step, para, a, b)
+
+ use typedef_hmc
+ use module_nn
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+ GENERATOR_FIELD, intent(inout) :: p
+ REAL, intent(in) :: step
+ SPINCOL_FIELD, intent(in) :: a
+ SPINCOL_FIELD, intent(in) :: b
+
+ P_GAUGE_FIELD :: u
+ P_SPINCOL_FIELD, save :: at, bt
+ REAL :: s, s1, s2
+
+
+ !! call flip_bc(u) <- done in calling routine
+
+ ALLOCATE_SC_FIELD(at)
+ ALLOCATE_SC_FIELD(bt)
+
+ u => conf%u
+
+ if (para%kappa /= ZERO) then
+ call d(ODD, EVEN, at, a, u) ! A~ = Doe A
+ call d_dag(ODD, EVEN, bt, b, u) ! B~ = Deo+ B
+
+ if (para%csw_kappa /= ZERO) then
+ call clover_mult_b(conf%b(1,1,ODD), at, volh) ! A~ = inv(Too) A~
+ call clover_mult_b(conf%b(1,1,ODD), bt, volh) ! B~ = inv(Too) B~
+ endif
+
+ if (para%h /= ZERO) then
+ call h_mult_b(-para%h, at, volh) ! A~ ~ inv(H) A~
+ call h_mult_b( para%h, bt, volh) ! B~ ~ inv(H+) B~
+ endif
+
+ call xbound_sc_field(a)
+ call xbound_sc_field(b)
+ call xbound_sc_field(at)
+ call xbound_sc_field(bt)
+ endif
+
+ TIMING_START(timing_bin_dsf)
+
+ s = -step * TWO * para%kappa**2 / (ONE + para%h**2)
+
+ if (s /= ZERO) then
+ call dsf_xf(p(1,1,EVEN,1), b, at, s, u(1,1,1,EVEN,1), nn(1,EVEN,1,FWD), VOLH)
+ call dsf_xf(p(1,1,ODD ,1), bt, a, s, u(1,1,1,ODD ,1), nn(1,ODD ,1,FWD), VOLH)
+ call dsf_xb(p(1,1,EVEN,1), bt, a, s, u(1,1,1,EVEN,1), nn(1,EVEN,1,FWD), VOLH)
+ call dsf_xb(p(1,1,ODD ,1), b, at, s, u(1,1,1,ODD ,1), nn(1,ODD ,1,FWD), VOLH)
+
+ call dsf_yf(p(1,1,EVEN,2), b, at, s, u(1,1,1,EVEN,2), nn(1,EVEN,2,FWD), VOLH)
+ call dsf_yf(p(1,1,ODD ,2), bt, a, s, u(1,1,1,ODD ,2), nn(1,ODD ,2,FWD), VOLH)
+ call dsf_yb(p(1,1,EVEN,2), bt, a, s, u(1,1,1,EVEN,2), nn(1,EVEN,2,FWD), VOLH)
+ call dsf_yb(p(1,1,ODD ,2), b, at, s, u(1,1,1,ODD ,2), nn(1,ODD ,2,FWD), VOLH)
+
+ call dsf_zf(p(1,1,EVEN,3), b, at, s, u(1,1,1,EVEN,3), nn(1,EVEN,3,FWD), VOLH)
+ call dsf_zf(p(1,1,ODD ,3), bt, a, s, u(1,1,1,ODD ,3), nn(1,ODD ,3,FWD), VOLH)
+ call dsf_zb(p(1,1,EVEN,3), bt, a, s, u(1,1,1,EVEN,3), nn(1,EVEN,3,FWD), VOLH)
+ call dsf_zb(p(1,1,ODD ,3), b, at, s, u(1,1,1,ODD ,3), nn(1,ODD ,3,FWD), VOLH)
+
+ call dsf_tf(p(1,1,EVEN,4), b, at, s, u(1,1,1,EVEN,4), nn(1,EVEN,4,FWD), VOLH)
+ call dsf_tf(p(1,1,ODD ,4), bt, a, s, u(1,1,1,ODD ,4), nn(1,ODD ,4,FWD), VOLH)
+ call dsf_tb(p(1,1,EVEN,4), bt, a, s, u(1,1,1,EVEN,4), nn(1,EVEN,4,FWD), VOLH)
+ call dsf_tb(p(1,1,ODD ,4), b, at, s, u(1,1,1,ODD ,4), nn(1,ODD ,4,FWD), VOLH)
+ endif
+
+ TIMING_STOP(timing_bin_dsf)
+
+ call flip_bc(u)
+
+ s1 = -step * TWO * (para%csw_kappa / EIGHT)
+ s2 = -step * TWO * (para%csw_kappa / EIGHT) * para%kappa**2
+
+ if (s1 /= ZERO) call clover_dsf(EVEN, p, b, a, s1, u)
+ if (s2 /= ZERO) call clover_dsf(ODD, p, bt, at, s2, u)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsf1.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf1.F90
new file mode 100644
index 0000000000000000000000000000000000000000..9796d065359348494c2d70385a5c3f5caa63ae69
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf1.F90
@@ -0,0 +1,66 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dsf1.F90 -- p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_{f1}
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine dsf1(p, conf, step, calc_sf, sf, para)
+
+ use typedef_hmc
+ use module_hmc_forces
+ use module_mre
+ use module_p_interface
+ use module_switches
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(inout) :: p
+ type(hmc_conf), intent(in) :: conf
+ type(hmc_para), intent(in) :: para
+ integer, intent(in) :: calc_sf
+ REAL, intent(in) :: step
+ REAL, intent(out) :: sf
+
+ type(type_mre), save :: solutions
+ P_SPINCOL_FIELD, save :: a, b
+ REAL, external :: dotprod
+ integer :: iterations
+ external :: w_mult
+ external :: w_dagger_w
+
+ sf = ZERO
+
+ if (switches%quenched) return
+
+ ALLOCATE_SC_FIELD(a)
+ ALLOCATE_SC_FIELD(b)
+
+ call flip_bc(conf%u)
+
+ call mre_get(solutions, w_mult, a, conf%phi, para, conf)
+ call cg(w_dagger_w, a, conf%phi, para, conf, iterations) ! A = inv(W+ W~) Phi
+ call mre_put(solutions, a, calc_sf) ! calc_sf <=> reset
+ call w_mult(b, a, para, conf) ! B = W~ A
+
+ if (calc_sf /= 0) sf = dotprod(b, b, SIZE_SC_FIELD)
+
+ call hmc_forces_old(p)
+ call dsf(p, conf, step, para, a, b)
+ call hmc_forces_new(p, step, i_sf1)
+
+ !! call flip_bc(conf%u) <- done in dsf()
+
+ call iteration_count_f1(iterations)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsf2.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf2.F90
new file mode 100644
index 0000000000000000000000000000000000000000..9cf66fe4195ef2b0b213821c96f32bdfa7fbe211
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf2.F90
@@ -0,0 +1,67 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dsf2.F90 -- p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_{f2}
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine dsf2(p, conf, step, calc_sf, sf, para)
+
+ use typedef_hmc
+ use module_hmc_forces
+ use module_mre
+ use module_p_interface
+ use module_switches
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(inout) :: p
+ type(hmc_conf), intent(in) :: conf
+ type(hmc_para), intent(in) :: para
+ integer, intent(in) :: calc_sf
+ REAL, intent(in) :: step
+ REAL, intent(out) :: sf
+
+ type(type_mre), save :: solutions
+ P_SPINCOL_FIELD, save :: a, b
+ REAL, external :: dotprod
+ integer :: iterations
+ external :: mtil
+ external :: mtdagmt
+
+ sf = ZERO
+
+ if (.not. switches%hasenbusch) return
+
+ ALLOCATE_SC_FIELD(a)
+ ALLOCATE_SC_FIELD(b)
+
+ call flip_bc(conf%u)
+
+ call w_mult_dag(b, conf%phi2, para, conf) ! B = W+ phi2
+ call mre_get(solutions, mtil, a, b, para, conf)
+ call cg(mtdagmt, a, b, para, conf, iterations) ! A = inv(M~+ M~) W+ phi2
+ call mre_put(solutions, a, calc_sf) ! calc_sf <=> reset
+ call mtil(b, a, para, conf) ! B = M~ A
+ if (calc_sf /= 0) sf = dotprod(b, b, SIZE_SC_FIELD)
+ call sc_axpy(b, conf%phi2, -ONE) ! B = B - phi2
+
+ call hmc_forces_old(p)
+ call dsf(p, conf, step, para, a, b)
+ call hmc_forces_new(p, step, i_sf2)
+
+ !! call flip_bc(conf%u) <- done in dsf()
+
+ call iteration_count_f2(iterations)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsg.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsg.F90
new file mode 100644
index 0000000000000000000000000000000000000000..d15f70a21492382124813aa1e00ec6bea32549a5
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsg.F90
@@ -0,0 +1,54 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dsg.F90 - p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_g
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine dsg(p, u, step, beta)
+
+ use module_hmc_forces
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(inout) :: p
+ GAUGE_FIELD, intent(in) :: u
+ REAL, intent(in) :: step, beta
+ REAL :: s
+ SU3 :: uuu, w
+ integer :: mu, eo, i
+
+ if (beta == ZERO) return
+
+ TIMING_START(timing_bin_dsg)
+
+ call hmc_forces_old(p)
+
+ s = -step * beta / THREE
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ !$omp parallel do private(uuu, w)
+ do i = 1, volh
+ call staple(uuu, u, i, eo, mu)
+ call uu(w, u(1, 1, i, eo, mu), uuu)
+ call im_tr_j(p(1, i, eo, mu), w, s)
+ enddo
+ enddo
+ enddo
+
+ call hmc_forces_new(p, step, i_sg)
+
+ TIMING_STOP(timing_bin_dsg)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/files.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/files.F90
new file mode 100644
index 0000000000000000000000000000000000000000..55da217cf65bd496a9389e6916728ca5e3a5c4ab
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/files.F90
@@ -0,0 +1,325 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! files.F90
+!
+!-------------------------------------------------------------------------------
+!
+! restart_file: progname.run.{res|count|ran|stop}
+! restart_conf_file: progname.run.s.time.{u|phi}
+! info_file: progname.run.s.info
+!
+! conf_info_file: progname.run.s1.s2.traj.info
+! conf_file: progname.run.s1.s2.traj.time.u
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_files ! formats of file name strings
+
+ implicit none
+
+ ! lengths of formats of file name strings
+
+ integer, parameter :: l_name = 2
+ integer, parameter :: l_ext = 2
+ integer, parameter :: l_sep = 5
+ integer, parameter :: l_num = 4
+
+ integer, parameter :: l_base = l_name + l_sep + l_num
+ integer, parameter :: l_conf = l_base + 3 * (l_sep + l_num)
+
+ ! formats of file name strings
+
+ character(len = l_name), save :: fmt_name = "(a"
+ character(len = l_name), save :: fmt_ext = "a)"
+ character(len = l_sep), save :: fmt_sep = ",'.',"
+ character(len = l_num), save :: fmt_run = "i3.3"
+ character(len = l_num), save :: fmt_ensemble = "i1.1"
+ character(len = l_num), save :: fmt_traj = "i5.5"
+ character(len = l_num), save :: fmt_time = "i3.3"
+
+ character(len = *), parameter :: ext_count = "count"
+ character(len = *), parameter :: ext_ran = "ran"
+ character(len = *), parameter :: ext_res = "res"
+ character(len = *), parameter :: ext_u = "u"
+ character(len = *), parameter :: ext_phi = "phi"
+ character(len = *), parameter :: ext_info = "info"
+ character(len = *), parameter :: ext_stop = "STOP"
+
+CONTAINS
+
+ character(len=l_base) function fmt_base()
+ fmt_base = fmt_name // fmt_sep // fmt_run
+ end function fmt_base
+
+ character(len=l_conf) function fmt_conf()
+ fmt_conf = fmt_base() // fmt_sep // fmt_ensemble &
+ // fmt_sep // fmt_ensemble &
+ // fmt_sep // fmt_traj
+ end function fmt_conf
+
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function count_file()
+
+ use module_files
+ implicit none
+ FILENAME :: restart_file
+
+ count_file = restart_file(ext_count)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function ran_file()
+
+ use module_files
+ implicit none
+ FILENAME :: restart_file
+
+ ran_file = restart_file(ext_ran)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function res_file()
+
+ use module_files
+ implicit none
+ FILENAME :: restart_file
+
+ res_file = restart_file(ext_res)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function stop_file()
+
+ use module_files
+ implicit none
+ FILENAME :: restart_file
+
+ stop_file = restart_file(ext_stop)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function u_file(i_ensemble, time)
+
+ use module_files
+ implicit none
+ integer, intent(in) :: i_ensemble, time
+ FILENAME :: restart_conf_file
+
+ u_file = restart_conf_file(i_ensemble, time, ext_u)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function phi_file(i_ensemble, time)
+
+ use module_files
+ implicit none
+ integer, intent(in) :: i_ensemble, time
+ FILENAME :: restart_conf_file
+
+ phi_file = restart_conf_file(i_ensemble, time, ext_phi)
+end
+
+
+!-------------------------------------------------------------------------------
+FILENAME function restart_file(ext)
+
+ use module_bqcd
+ use module_counter
+ use module_files
+ implicit none
+
+ character(len = *), intent(in) :: ext
+ FILENAME_FORMAT :: fmt
+
+ fmt = fmt_base() // fmt_sep // fmt_ext
+
+ write(restart_file, fmt) prog_name, counter%run, ext
+
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function restart_conf_file(i_ensemble, time, ext)
+
+ use module_bqcd
+ use module_counter
+ use module_files
+ implicit none
+
+ integer, intent(in) :: i_ensemble, time
+ character(len = *), intent(in) :: ext
+ FILENAME_FORMAT :: fmt
+
+ fmt = fmt_base() // fmt_sep // fmt_ensemble &
+ // fmt_sep // fmt_time &
+ // fmt_sep // fmt_ext
+
+ write(restart_conf_file, fmt) prog_name, counter%run, i_ensemble, time, ext
+
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function info_file(i_ensemble)
+
+ use module_bqcd
+ use module_counter
+ use module_files
+ implicit none
+
+ integer, intent(in) :: i_ensemble
+ FILENAME_FORMAT :: fmt
+
+ fmt = fmt_base() // fmt_sep // fmt_ensemble // fmt_sep // fmt_ext
+
+ write(info_file, fmt) prog_name, counter%run, i_ensemble, ext_info
+
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function conf_info_file(i_ensemble1, i_ensemble2)
+
+ use module_bqcd
+ use module_counter
+ use module_files
+ implicit none
+
+ integer, intent(in) :: i_ensemble1, i_ensemble2
+ FILENAME_FORMAT :: fmt
+
+ fmt = fmt_conf() // fmt_sep // fmt_ext
+
+ write(conf_info_file, fmt) &
+ prog_name, counter%run, i_ensemble1, i_ensemble2, counter%traj, ext_info
+
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function conf_file(i_ensemble1, i_ensemble2, time)
+
+ use module_bqcd
+ use module_counter
+ use module_files
+ implicit none
+
+ integer, intent(in) :: i_ensemble1, i_ensemble2, time
+ FILENAME_FORMAT :: fmt
+
+ fmt = fmt_conf() // fmt_sep // fmt_time // fmt_sep // fmt_ext
+
+ write(conf_file, fmt) &
+ prog_name, counter%run, i_ensemble1, i_ensemble2, counter%traj, time, ext_u
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine check_fmt(run, max_temper, max_traj, max_time)
+
+ use module_files
+ implicit none
+ integer :: run, max_temper, max_traj, max_time
+
+ call check_len(run, fmt_run, "RUN")
+ call check_len(max_temper, fmt_ensemble, "TEMPER")
+ call check_len(max_traj, fmt_traj, "TRAJ")
+ call check_len(max_time, fmt_time, "TIME")
+
+CONTAINS
+
+ subroutine check_len(counter, counter_fmt, counter_name)
+
+ implicit none
+ integer :: i, len, counter
+ character(len = *) :: counter_fmt, counter_name
+
+ i = index(counter_fmt, "i")
+
+ if (i == 0) then
+ call die("check_fmt(): unable to check fmt for " // counter_name)
+ endif
+
+ read(counter_fmt(i+1:i+1), *) len
+
+ if (counter < 0 .or. counter >= 10**len) then
+ call die("check_fmt(): file name format unsuitable for " // counter_name)
+ endif
+
+ end subroutine check_len
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine set_fmt_ensemble(N_temper)
+
+ use module_files
+ implicit none
+
+ integer, intent(in) :: N_temper
+
+ if (N_temper < 10) then
+ fmt_ensemble = "i1.1"
+ else if (N_temper < 100) then
+ fmt_ensemble = "i2.2"
+ else
+ call die ("set_fmt_ensemble(): N_temper >= 100 ???")
+ endif
+
+end
+
+
+!-------------------------------------------------------------------------------
+function format_ensemble()
+
+ use module_files
+ implicit none
+ character(len = l_num) :: format_ensemble
+
+ format_ensemble = fmt_ensemble
+end
+
+!-------------------------------------------------------------------------------
+subroutine filename_test()
+
+ use module_function_decl
+ implicit none
+ integer i
+
+ FILENAME, external :: count_file
+ FILENAME, external :: ran_file
+ FILENAME, external :: res_file
+ FILENAME, external :: stop_file
+ FILENAME, external :: u_file
+ FILENAME, external :: phi_file
+ FILENAME, external :: restart_conf_file
+ FILENAME, external :: info_file
+ FILENAME, external :: conf_info_file
+ FILENAME, external :: conf_file
+
+ do i = 1, 11, 10
+ call set_fmt_ensemble(i)
+
+ PUTVAL(6, count_file())
+ PUTVAL(6, ran_file())
+ PUTVAL(6, res_file())
+ PUTVAL(6, stop_file())
+ PUTVAL(6, u_file(3, 4))
+ PUTVAL(6, phi_file(5, 6))
+ PUTVAL(6, restart_conf_file(1, 2, 'conf'))
+ PUTVAL(6, info_file(7))
+ PUTVAL(6, conf_info_file(7, 8))
+ PUTVAL(6, conf_file(3, 4, 5))
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/fixLouhiPP.sh b/qcd/part_cpu/applications/QCD/src/kernel_A/fixLouhiPP.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b6e1537ba25946b650663060d03a61e95895e258
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/fixLouhiPP.sh
@@ -0,0 +1,8 @@
+sed 's/% ##/%/g' modules/module_input.F90 > tmp.dat; cp tmp.dat modules/module_input.F90; rm -f tmp.dat
+
+sed 's/## )/ )/g' clover/clover_init.F90 > tmp.dat; cp tmp.dat clover/clover_init.F90; rm -f tmp.dat
+
+sed 's/## (/ (/g' d/D21xyzt.F90 > tmp.dat; cp tmp.dat d/D21xyzt.F90; rm -f tmp.dat
+sed 's/## (/ (/g' d/D2xyzt.F90 > tmp.dat; cp tmp.dat d/D2xyzt.F90; rm -f tmp.dat
+sed 's/## (/ (/g' d/D31xyzt.F90 > tmp.dat; cp tmp.dat d/D31xyzt.F90; rm -f tmp.dat
+sed 's/## (/ (/g' d/D3xyzt.F90 > tmp.dat; cp tmp.dat d/D3xyzt.F90; rm -f tmp.dat
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/flip_bc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/flip_bc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f8cb9fbdb940fccfaa155b2712969f83fa5b5b51
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/flip_bc.F90
@@ -0,0 +1,112 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! flip_bc.F90 - flip fermionic boundary conditions
+! (ie multiplication of corresponding links with -1)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_flip_bc
+
+ INTEGER, dimension(:, :, :), pointer, save :: flip_bc_list
+ integer, dimension(DIM), save :: flip_bc_len
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_flip_bc()
+
+ use module_flip_bc
+ use module_function_decl
+ use module_lattice
+ use module_vol
+ implicit none
+
+ integer, dimension(DIM) :: i0, i1, i_pe, j
+ integer :: me, mu, x, y, z, t, i, eo, count(EVEN:ODD)
+ integer, external :: xyzt2i, e_o
+
+ allocate(flip_bc_list(volh_tot, EVEN:ODD, DIM))
+
+ me = my_pe()
+ call unlex(me, DIM, i_pe, NPE)
+
+ do mu = 1, DIM
+ count = 0
+ if (bc_fermions(mu) < 0) then
+ if (i_pe(mu) == (NPE(mu) - 1) .or. i_pe(mu) == 0) then
+ i0 = 0
+ i1 = N - 1
+
+ if (i_pe(mu) == (NPE(mu) - 1)) then
+ i0(mu) = N(mu) - 1
+ else
+ i0(mu) = -1
+ endif
+
+ i1(mu) = i0(mu)
+
+ do t = i0(4), i1(4)
+ do z = i0(3), i1(3)
+ do y = i0(2), i1(2)
+ do x = i0(1), i1(1)
+ j = (/x, y, z, t/)
+ i = xyzt2i(j)
+ eo = e_o(j)
+
+ count(eo) = count(eo) + 1
+ flip_bc_list(count(eo), eo, mu) = i
+ enddo
+ enddo
+ enddo
+ enddo
+
+ endif
+ endif
+
+ if (count(EVEN) /= count(ODD)) then
+ call die ("init_flip_bc(): count(EVEN) /= count(ODD)")
+ else
+ flip_bc_len(mu) = count(EVEN)
+ endif
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine flip_bc(u)
+
+ use module_flip_bc
+ use module_lattice
+ use module_vol
+ implicit none
+
+ GAUGE_FIELD, intent(inout) :: u
+ integer :: mu, nu, count, i, eo, c1, c2
+
+ do mu = 1, DIM
+ nu = gamma_index(mu)
+ do eo = EVEN,ODD
+ do count = 1, flip_bc_len(mu)
+ i = flip_bc_list(count, eo, mu)
+ do c2 = 1, NCOL
+ do c1 = 1, NCOL
+ u(c1, c2, i, eo, nu) = -u(c1, c2, i, eo, nu)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/h_mult.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/h_mult.F90
new file mode 100644
index 0000000000000000000000000000000000000000..67af646983001c34b430fb7402e5b962216ffedf
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/h_mult.F90
@@ -0,0 +1,107 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2000-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! h_mult.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine h_mult_a(out, h, in, volh) ! out := out + i h gamma_5 in
+
+ implicit none
+ COMPLEX, dimension (NDIRAC, *) :: out, in
+ REAL :: h
+ integer :: volh
+
+ integer :: i
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ TIMING_START(timing_bin_h_mult_a)
+
+ !$omp parallel do
+ do i = 1, NCOL * volh
+ out(1, i) = out(1, i) + h * i_times(in(3, i))
+ out(2, i) = out(2, i) + h * i_times(in(4, i))
+ out(3, i) = out(3, i) + h * i_times(in(1, i))
+ out(4, i) = out(4, i) + h * i_times(in(2, i))
+ enddo
+
+ TIMING_STOP(timing_bin_h_mult_a)
+end
+
+!-------------------------------------------------------------------------------
+subroutine h_mult_b(h, x, volh) ! x := (1 + i h gamma_5) x
+
+ implicit none
+ REAL :: h
+ COMPLEX, dimension (NDIRAC, *) :: x
+ integer :: volh
+
+ integer :: i
+ COMPLEX :: x1, x2, x3, x4
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+
+ TIMING_START(timing_bin_h_mult_b)
+
+ !$omp parallel do private(x1, x2, x3, x4)
+ do i = 1, NCOL * volh
+ x1 = x(1, i)
+ x2 = x(2, i)
+ x3 = x(3, i)
+ x4 = x(4, i)
+
+ x(1, i) = x(1, i) + h * i_times(x3)
+ x(2, i) = x(2, i) + h * i_times(x4)
+ x(3, i) = x(3, i) + h * i_times(x1)
+ x(4, i) = x(4, i) + h * i_times(x2)
+ enddo
+
+ TIMING_STOP(timing_bin_h_mult_b)
+end
+
+!-------------------------------------------------------------------------------
+subroutine h_mult_c(out, h, in, volh) ! out = (1 + i h gamma_5) in
+
+ implicit none
+ COMPLEX, dimension (NDIRAC, *) :: out, in
+ REAL :: h
+ integer :: volh
+
+ integer :: i
+
+ ! statement function:
+
+ COMPLEX :: i_times, c
+ i_times(c) = cmplx(-aimag(c), real(c))
+
+ TIMING_START(timing_bin_h_mult_c)
+
+ !$omp parallel do
+ do i = 1, NCOL * volh
+ out(1, i) = in(1, i) + h * i_times(in(3, i))
+ out(2, i) = in(2, i) + h * i_times(in(4, i))
+ out(3, i) = in(3, i) + h * i_times(in(1, i))
+ out(4, i) = in(4, i) + h * i_times(in(2, i))
+ enddo
+
+ TIMING_STOP(timing_bin_h_mult_c)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..ddfed12989cc9eee43d773e8fce66013cbb78bae
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc.F90
@@ -0,0 +1,207 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc.F90 - one Hybrid Monte Carlo step including Hasenbusch's and Bakeyev's
+! accelerations
+!
+!// IR scale: tau
+!// IR steps: ntau
+!// UV scale: tau / m_scale
+!// UV steps: m_scale
+!//
+!// models / splits of actions:
+!//
+!// model A:
+!// S_UV = S_g
+!// S_IR = S_det + S_f1
+!//
+!// model B:
+!// S_UV = S_g
+!// S_IR = S_det + S_f1 + S_f2
+!//
+!// model C:
+!// S_UV = S_g + S_det + S_f1
+!// S_IR = S_f2
+!//
+!// S_f1 = phi1+ inv(W+ W) phi1
+!// S_f2 = phi2+ W inv(M~+ M~) W+ phi2
+!//
+!// W = M~ + rho
+!//
+!// => ir_steps = 1 and rho = 0 corresponds exactly to the previous verions
+!// (rho = 0 is treated as S_f2 = 0),
+!// and especially tau and ntau have the same meaning as before
+!//
+!// (In the whole program phi and phi2 are treated asymmetrically.
+!// The reason for this is upward compatibility with the mode
+!// "standard Wilson fermions + parallel tempering".
+!// phi is needed for the tempering decisions.)
+!//
+!// Flags / switches:
+!//
+!// force_accept: Force acceptance after, eg, a hot or cold start.
+!// test: For testing reversibility by forward/backward integration.
+!// If (test /= HMC_TEST_NONE) force_accept has to be .true.
+!// If (test == HMC_TEST_BACKWARDS) para%tau has to be reversed.
+!//
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine hmc(para, conf, out, force_accept, test)
+
+ use typedef_hmc
+ use module_function_decl
+ use module_p_interface
+ use module_switches
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(inout) :: conf
+ type(hmc_out), intent(out) :: out
+ integer, intent(in) :: force_accept
+ integer, intent(in) :: test
+
+ P_GAUGE_FIELD, save :: u_bck
+ P_SPINCOL_FIELD, save :: phi_bck
+ P_CLOVER_FIELD_A, save :: a_bck, i_bck
+ P_CLOVER_FIELD_B, save :: b_bck
+ P_GENERATOR_FIELD, save :: p
+
+ REAL :: sd_old, sd_new
+ REAL :: sf1_old, sf1_new
+ REAL :: sf2_old, sf2_new
+ REAL :: sg_old, sg_new
+ REAL :: sp_old, sp_new
+ REAL :: hg_old, hg_new
+ REAL :: h_old, h_new
+ REAL :: sf1, sf2
+ REAL, external :: sp, sg, clover_action
+
+
+ TIMING_START(timing_bin_hmc)
+
+ TIMING_START(timing_bin_hmc_init)
+
+ if (.not. associated(u_bck)) then
+ ALLOCATE_G_FIELD(u_bck)
+ ALLOCATE_GEN_FIELD(p)
+ ALLOCATE_SC_FIELD(phi_bck)
+ if (switches%clover) call allocate_clover_field_a(a_bck)
+ if (switches%clover) call allocate_clover_field_a(i_bck)
+ if (switches%clover) call allocate_clover_field_b(b_bck)
+ endif
+
+ sd_old = ZERO; sd_new = ZERO
+ sf1_old = ZERO; sf1_new = ZERO
+ sf2_old = ZERO; sf2_new = ZERO
+ sg_old = ZERO; sg_new = ZERO
+ sp_old = ZERO; sp_new = ZERO
+ sf1 = ZERO; sf2 = ZERO
+
+ call init_cg_stat()
+ TIMING_STOP(timing_bin_hmc_init)
+
+ if (test /= HMC_TEST_BACKWARDS) then ! ie normally do:
+
+ ! backups:
+
+ TIMING_START(timing_bin_hmc_backup)
+
+ call swap_p_sc_field(phi_bck, conf%phi)
+
+ u_bck = conf%u
+ if (switches%clover) a_bck = conf%a
+ if (switches%clover) i_bck = conf%i
+ if (switches%clover) b_bck = conf%b
+
+ TIMING_STOP(timing_bin_hmc_backup)
+
+ ! initialize momenta p, phi, phi2 and old action:
+
+ call hmc_init_p(p)
+ call hmc_init_phi(conf, para, sf1_old, sf2_old)
+
+ TIMING_START(timing_bin_hmc_h_old)
+ if (switches%clover) sd_old = clover_action(conf%b(1,1,ODD))
+ sg_old = sg(conf%u)
+ sp_old = sp(p)
+ hg_old = sg_old * para%beta
+
+ h_old = sd_old + sp_old + hg_old + sf1_old + sf2_old
+ TIMING_STOP(timing_bin_hmc_h_old)
+ endif
+
+ if (test == HMC_TEST_FORWARDS) then
+ call hmc_test_report(test, p, conf%u, &
+ sp_old, hg_old, sf1_old, sf2_old, sd_old)
+ endif
+
+! leap frog integration:
+
+ call hmc_leap_frog(p, para, conf, sf1, sf2)
+
+! calculate Hamiltonian:
+
+ TIMING_START(timing_bin_hmc_h_new)
+ if (switches%clover) sd_new = clover_action(conf%b(1,1,ODD))
+ sf1_new = sf1
+ sf2_new = sf2
+ sg_new = sg(conf%u)
+ sp_new = sp(p)
+ hg_new = sg_new * para%beta
+
+ h_new = sd_new + sp_new + hg_new + sf1_new + sf2_new
+ TIMING_STOP(timing_bin_hmc_h_new)
+
+! accept new U ? :
+
+ TIMING_START(timing_bin_hmc_rest)
+ out%exp_dh = exp(h_old - h_new)
+
+ if (force_accept /= 0) then
+ out%accepted = 1
+ else
+ if (ranf() < out%exp_dh) then
+ out%accepted = 1
+ else
+ out%accepted = 0
+ endif
+ endif
+
+ if (out%accepted == 1) then
+ out%sg = sg_new
+ out%sf = sf1_new
+ else
+ call swap_p_sc_field(conf%phi, phi_bck)
+ call swap_p_g_field(conf%u, u_bck)
+ call swap_p_clover_field_a(conf%a, a_bck)
+ call swap_p_clover_field_a(conf%i, i_bck)
+ call swap_p_clover_field_b(conf%b, b_bck)
+ endif
+
+ call get_cg_stat(out%cg_ncall, out%cg_niter_max, out%cg_niter_tot)
+
+ call iteration_count_write(UREC)
+ call hmc_forces_write(UREC)
+
+ if (test == HMC_TEST_BACKWARDS) then
+ call hmc_test_report(test, p, conf%u, &
+ sp_new, hg_new,sf1_new, sf2_new, sd_new)
+ endif
+
+ TIMING_STOP(timing_bin_hmc_rest)
+
+ TIMING_STOP(timing_bin_hmc)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_check.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_check.F90
new file mode 100644
index 0000000000000000000000000000000000000000..b0793399a0058613b5e32f9a498a75306db5bafe
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_check.F90
@@ -0,0 +1,122 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_check.F90 - check by forward/backward leap frog integration
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine hmc_check(check, p, u, hp, hg, hf1, hf2, hd)
+
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ integer, intent(in) :: check
+ GENERATOR_FIELD, intent(in) :: p
+ GAUGE_FIELD, intent(in) :: u
+ REAL, intent(in) :: hp, hg, hf1, hf2, hd
+
+ P_GENERATOR_FIELD, save :: p_start
+ P_GAUGE_FIELD, save :: u_start
+ REAL, save :: hp_start
+ REAL, save :: hg_start
+ REAL, save :: hf1_start
+ REAL, save :: hf2_start
+ REAL, save :: hd_start
+
+ REAL :: diff_p, diff_u
+ integer :: i, eo, mu, j, c1, c2
+
+ if (.not. associated(p_start)) then
+ allocate(p_start(NGEN, volh_tot, EVEN:ODD, DIM))
+ allocate(u_start(NCOL, NCOL, volh_tot, EVEN:ODD, DIM))
+ endif
+
+ if (check == HMC_CHECK_FORWARDS) then
+
+ p_start = p
+ u_start = u
+ hp_start = hp
+ hg_start = hg
+ hf1_start = hf1
+ hf2_start = hf2
+ hd_start = hd
+
+ else if (check == HMC_CHECK_BACKWARDS) then
+
+ diff_p = ZERO
+ diff_u = ZERO
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ do i = 1, volh
+ do j = 1, NGEN
+ diff_p = max(diff_p, abs(p_start(j,i,eo,mu) - p(j,i,eo,mu)))
+ enddo
+ do c2 = 1, NCOL
+ do c1 = 1, NCOL
+ diff_u = max(diff_u, &
+ abs(relative_change(Re(u_start(c1,c2,i,eo,mu)), &
+ Re(u(c1,c2,i,eo,mu)))))
+ diff_u = max(diff_u, &
+ abs(relative_change(Im(u_start(c1,c2,i,eo,mu)), &
+ Im(u(c1,c2,i,eo,mu)))))
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ if (my_pe() == 0) then
+ call begin(UREC, "HMC-check")
+ write(UREC, *)
+ write(UREC,400) "Configuration changes (maximal abs. relative changes):"
+ write(UREC, *)
+ write(UREC,410) "Generator field:", diff_p
+ write(UREC,410) "Gauge field: ", diff_u
+ write(UREC, *)
+ write(UREC, *)
+ write(UREC,400) "Energy changes:"
+ write(UREC, *)
+ write(UREC,420) "Energy ", "old value", "rel.change"
+ write(UREC, *)
+ write(UREC,430) "H_generator", hp_start, relative_change(hp_start, hp)
+ write(UREC,430) "H_gauge ", hg_start, relative_change(hg_start, hg)
+ write(UREC,430) "H_fermion_1", hf1_start, relative_change(hf1_start, hf1)
+ write(UREC,430) "H_fermion_2", hf2_start, relative_change(hf2_start, hf2)
+ write(UREC,430) "H_det ", hd_start, relative_change(hd_start, hd)
+ write(UREC, *)
+ call end_A(UREC, "HMC-check")
+ endif
+
+400 format (1x, a)
+410 format (1x, a, e8.1)
+420 format (1x, a, a20, a12)
+430 format (1x, a, e20.10, e12.1)
+
+ else
+ call die("hmc_check(): unknown check flag.")
+ endif
+
+contains
+
+ REAL function relative_change(old, new)
+
+ implicit none
+ REAL, intent(in) :: old, new
+
+ relative_change = old / (new - old)
+
+ end function relative_change
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_forces.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_forces.F90
new file mode 100644
index 0000000000000000000000000000000000000000..4f7e3985ee39a367b3299c23e03474b41bbfc710
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_forces.F90
@@ -0,0 +1,155 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_forces.F90 - calculation of HMC forces in Hasenbusch improvement,
+! does not work with tempering
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-----------------------------------------------------------------------------
+subroutine hmc_forces_old(p)
+
+ use module_hmc_forces
+ use module_p_interface
+ use module_switches
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(in) :: p
+ integer :: mu, eo, i
+
+ if (.not. switches%hasenbusch) return
+
+ if (.not. associated(p_old)) then
+ call allocate_gen_field(p_old)
+ f_count = ZERO
+ f_avg = ZERO
+ f_max = ZERO
+ endif
+
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ !$omp parallel do
+ do i = 1, volh
+ p_old(1, i, eo, mu) = p(1, i, eo, mu)
+ p_old(2, i, eo, mu) = p(2, i, eo, mu)
+ p_old(3, i, eo, mu) = p(3, i, eo, mu)
+ p_old(4, i, eo, mu) = p(4, i, eo, mu)
+ p_old(5, i, eo, mu) = p(5, i, eo, mu)
+ p_old(6, i, eo, mu) = p(6, i, eo, mu)
+ p_old(7, i, eo, mu) = p(7, i, eo, mu)
+ p_old(8, i, eo, mu) = p(8, i, eo, mu)
+ enddo
+ enddo
+ enddo
+
+end
+
+!-----------------------------------------------------------------------------
+subroutine hmc_forces_new(p, step, which)
+
+ use module_hmc_forces
+ use module_function_decl
+ use module_switches
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(in) :: p
+ REAL, intent(in) :: step
+ integer, intent(in) :: which
+
+ integer :: mu, eo, i
+ REAL :: force
+
+ if (.not. switches%hasenbusch) return
+
+ force = ZERO
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ !$omp parallel do
+ do i = 1, volh
+ force = force &
+ + (p_old(1, i, eo, mu) - p(1, i, eo, mu))**2 &
+ + (p_old(2, i, eo, mu) - p(2, i, eo, mu))**2 &
+ + (p_old(3, i, eo, mu) - p(3, i, eo, mu))**2 &
+ + (p_old(4, i, eo, mu) - p(4, i, eo, mu))**2 &
+ + (p_old(5, i, eo, mu) - p(5, i, eo, mu))**2 &
+ + (p_old(6, i, eo, mu) - p(6, i, eo, mu))**2 &
+ + (p_old(7, i, eo, mu) - p(7, i, eo, mu))**2 &
+ + (p_old(8, i, eo, mu) - p(8, i, eo, mu))**2
+ enddo
+ enddo
+ enddo
+
+ force = global_sum(force) / (NGEN * volume * DIM)
+ force = sqrt(force) / abs(step)
+
+ f_count(which) = f_count(which) + ONE
+ f_avg(which) = f_avg(which) + force
+ f_max(which) = max(f_max(which), force)
+
+end
+
+
+!-----------------------------------------------------------------------------
+subroutine hmc_forces_write(unit)
+
+ use module_hmc_forces
+ use module_counter
+ use module_function_decl
+ use module_switches
+
+ implicit none
+
+ integer, intent(in) :: unit
+ integer, save :: written = 0
+ integer :: i
+
+ character(*), parameter :: key_avg = "%Favg"
+ character(*), parameter :: key_max = "%Fmax"
+ character(*), parameter :: fmt_h = "(1x, 2a, a6, 4a)"
+ character(*), parameter :: fmt_b = "(1x, a6, i6, 4g20.10)"
+
+ character(20), dimension(n_force) :: f_name
+
+ if (.not. switches%hasenbusch) return
+
+ f_name(i_sg) = " F_gauge"
+ f_name(i_sd) = " F_det"
+ f_name(i_sf1) = " F_F1"
+ f_name(i_sf2) = " F_F2"
+
+ do i = 1, n_force
+ if (f_count(i) /= ZERO) then
+ f_avg(i) = f_avg(i) / f_count(i)
+ endif
+ enddo
+
+ if (written == 0 .and. my_pe() == 0) then
+ write(unit, fmt_h) "T", key_avg, "traj", (f_name(i), i = 1, n_force)
+ write(unit, fmt_h) "T", key_max, "traj", (f_name(i), i = 1, n_force)
+ endif
+
+ if (my_pe() == 0) then
+ write(unit, fmt_b) key_avg, counter%traj, (f_avg(i), i = 1, n_force)
+ write(unit, fmt_b) key_max, counter%traj, (f_max(i), i = 1, n_force)
+ endif
+
+
+ written = written + 1
+ f_count = ZERO
+ f_avg = ZERO
+ f_max = ZERO
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_p.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_p.F90
new file mode 100644
index 0000000000000000000000000000000000000000..4d40dbf9d606cd8849f4b112daa0467526ae3252
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_p.F90
@@ -0,0 +1,36 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_init_p.F90 - initialization of momenta
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine hmc_init_p(p)
+
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(out) :: p
+ integer :: mu, eo
+
+ TIMING_START(timing_bin_hmc_init_p)
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ call ran_gauss_volh(NGEN/2, p(1,1,eo,mu), ONE, eo)
+ enddo
+ enddo
+
+ TIMING_STOP(timing_bin_hmc_init_p)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_phi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_phi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3e42f7ff0d01f2b7ee365e2b987dcfd256e45996
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_phi.F90
@@ -0,0 +1,66 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_init_phi.F90 - initialises phi, phi2 and calculates actions
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine hmc_init_phi(conf, para, sf1, sf2)
+
+ use typedef_hmc
+ use module_function_decl
+ use module_p_interface
+ use module_switches
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(inout) :: conf
+ REAL, intent(out) :: sf1
+ REAL, intent(out) :: sf2
+
+ P_SPINCOL_FIELD, save :: tmp
+
+ integer :: iterations
+ external :: w_dagger_w
+
+ TIMING_START(timing_bin_hmc_init_phi)
+
+ sf1 = ZERO
+ sf2 = ZERO
+
+ if (.not. switches%dynamical) return
+
+ ALLOCATE_SC_FIELD(tmp)
+
+ call flip_bc(conf%u)
+
+
+ call ran_gauss_volh(NDIRAC * NCOL, tmp, HALF, EVEN) ! tmp = noise
+ sf1 = dotprod(tmp, tmp, SIZE_SC_FIELD)
+ call w_mult_dag(conf%phi, tmp, para, conf) ! phi = W+ noise
+
+ if (switches%hasenbusch) then
+ call ran_gauss_volh(NDIRAC * NCOL, tmp, HALF, EVEN) ! tmp = noise
+ sf2 = dotprod(tmp, tmp, SIZE_SC_FIELD)
+ call mtil_dag(conf%phi2, tmp, para, conf) ! phi2 = M~+ noise
+ call cg(w_dagger_w, tmp, conf%phi2, para, conf, iterations)
+ ! tmp = inv(W+ W) M~+ noise
+ call w_mult(conf%phi2, tmp, para, conf) ! phi2 = inv(W+) M~+ noise
+ endif
+
+ call flip_bc(conf%u)
+
+ TIMING_STOP(timing_bin_hmc_init_phi)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_integrator.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_integrator.F90
new file mode 100644
index 0000000000000000000000000000000000000000..609a126116e2d27014680aeecc8e8f97eec5fd7b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_integrator.F90
@@ -0,0 +1,94 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_integrator.F90 - integrators for the different models
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine hmc_integrator_p_ir(p, para, conf, step, calc_sf, sf1, sf2)
+
+ use typedef_hmc
+ use module_switches
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(inout) :: p
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+ REAL, intent(in) :: step
+ integer, intent(in) :: calc_sf
+ REAL, intent(out) :: sf1, sf2
+
+ select case (para%model)
+ case ("A")
+ call dsf1(p, conf, step, calc_sf, sf1, para)
+ call dsd(p, conf, step, para)
+ case ("B")
+ call dsf1(p, conf, step, calc_sf, sf1, para)
+ call dsf2(p, conf, step, calc_sf, sf2, para)
+ call dsd(p, conf, step, para)
+ case ("C")
+ call dsf2(p, conf, step, calc_sf, sf2, para)
+ case default
+ call die("hmc_integrator_p_ir: " // para%model // ": unknown model")
+ end select
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine hmc_integrator_p_uv(p, para, conf, step, calc_sf, sf1, sf2)
+
+ use typedef_hmc
+ use module_switches
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(inout) :: p
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+ REAL, intent(in) :: step
+ integer, intent(in) :: calc_sf
+ REAL, intent(out) :: sf1, sf2
+
+ select case (para%model)
+ case ("A")
+ call dsg(p, conf%u, step, para%beta)
+ case ("B")
+ call dsg(p, conf%u, step, para%beta)
+ case ("C")
+ call dsg(p, conf%u, step, para%beta)
+ call dsf1(p, conf, step, calc_sf, sf1, para)
+ call dsd(p, conf, step, para)
+ case default
+ call die("hmc_integrator_p_uv: " // para%model // ": unknown model")
+ end select
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine hmc_integrator_q(p, para, conf, step)
+
+ use typedef_hmc
+ use module_switches
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(in) :: p
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(inout) :: conf
+ REAL, intent(in) :: step
+
+ call hmc_u(p, conf, step, para)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_leap_frog.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_leap_frog.F90
new file mode 100644
index 0000000000000000000000000000000000000000..246e5b31d409006998ccb2f5b3e252de02c97a5e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_leap_frog.F90
@@ -0,0 +1,70 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_leap_frog.F90 - two time scale leap frog integration
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine hmc_leap_frog(p, para, conf, sf1, sf2)
+
+ use typedef_hmc
+ use module_switches
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(inout) :: p
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(inout) :: conf
+ REAL, intent(out) :: sf1, sf2
+
+ REAL :: step_ir, step_uv
+ integer :: calc_sf, itau, i_scale
+
+ calc_sf = 0
+
+! first half ir and uv step
+
+ step_ir = HALF * para%tau
+ step_uv = HALF * para%tau / para%m_scale
+
+ TIMING_START(timing_bin_hmc_half_step0)
+ call hmc_integrator_p_ir(p, para, conf, step_ir, calc_sf, sf1, sf2)
+ call hmc_integrator_p_uv(p, para, conf, step_uv, calc_sf, sf1, sf2)
+ TIMING_STOP(timing_bin_hmc_half_step0)
+
+ step_ir = para%tau
+ step_uv = para%tau / para%m_scale
+
+ do itau = 1, para%ntau
+ do i_scale = 1, para%m_scale
+
+ call hmc_integrator_q(p, para, conf, step_uv)
+
+ if (itau == para%ntau .and. i_scale == para%m_scale) then
+ step_ir = step_ir * HALF ! final half steps
+ step_uv = step_uv * HALF
+ calc_sf = 1 ! calculate new S_f
+ endif
+
+ TIMING_START(timing_bin_hmc_steps)
+ call hmc_integrator_p_uv(p, para, conf, step_uv, calc_sf, sf1, sf2)
+ TIMING_STOP(timing_bin_hmc_steps)
+ enddo
+
+ TIMING_START(timing_bin_hmc_steps)
+ call hmc_integrator_p_ir(p, para, conf, step_ir, calc_sf, sf1, sf2)
+ TIMING_STOP(timing_bin_hmc_steps)
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_test.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_test.F90
new file mode 100644
index 0000000000000000000000000000000000000000..c7233bf8d5cf916c16b903035804d0d50d6ce740
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_test.F90
@@ -0,0 +1,177 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_test.F90 - forward/backward leap frog integration
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine hmc_test(para, conf)
+
+ use typedef_hmc
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(inout) :: para
+ type(hmc_conf), intent(inout) :: conf
+ type(hmc_out) :: out
+
+ call begin(UREC, "HMCtest")
+
+ call hmc(para, conf, out, .true., HMC_TEST_FORWARDS)
+
+ call write_out("forward ")
+
+ para%tau = -para%tau
+
+ call hmc(para, conf, out, .true., HMC_TEST_BACKWARDS)
+
+ para%tau = -para%tau
+
+ call write_out("backward ")
+
+ call end_A(UREC, "HMCtest")
+
+
+contains
+
+ subroutine write_out(direction)
+
+ character(*) :: direction
+ REAL :: plaq
+
+ if (my_pe() == 0) then
+ plaq = out%sg / (6 * volume)
+ write(UREC, *)
+ write(UREC, 400) "Direction", "PlaqEnergy", &
+ "exp(-Delta_H)", "CGcalls", "CGitTot", "CGitMax"
+ write(UREC, 410) direction, plaq, out%exp_dh, &
+ out%cg_ncall, out%cg_niter_tot, out%cg_niter_max
+ write(UREC, *)
+ endif
+
+400 format (1x, a, 2a15, 3a8)
+410 format (1x, a, 2f15.10, 3i8)
+
+ end subroutine write_out
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine hmc_test_report(test, p, u, hp, hg, hf1, hf2, hd)
+
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ integer, intent(in) :: test
+ GENERATOR_FIELD, intent(in) :: p
+ GAUGE_FIELD, intent(in) :: u
+ REAL, intent(in) :: hp, hg, hf1, hf2, hd
+
+ P_GENERATOR_FIELD, save :: p_start
+ P_GAUGE_FIELD, save :: u_start
+ REAL, save :: hp_start
+ REAL, save :: hg_start
+ REAL, save :: hf1_start
+ REAL, save :: hf2_start
+ REAL, save :: hd_start
+
+ REAL :: diff_p, diff_u
+ integer :: i, eo, mu, j, c1, c2
+
+ if (.not. associated(p_start)) then
+ allocate(p_start(NGEN, volh_tot, EVEN:ODD, DIM))
+ allocate(u_start(NCOL, NCOL, volh_tot, EVEN:ODD, DIM))
+ endif
+
+ if (test == HMC_TEST_FORWARDS) then
+
+ p_start = p
+ u_start = u
+ hp_start = hp
+ hg_start = hg
+ hf1_start = hf1
+ hf2_start = hf2
+ hd_start = hd
+
+ else if (test == HMC_TEST_BACKWARDS) then
+
+ diff_p = ZERO
+ diff_u = ZERO
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ do i = 1, volh
+ do j = 1, NGEN
+ diff_p = max(diff_p, abs(p_start(j,i,eo,mu) - p(j,i,eo,mu)))
+ enddo
+ do c2 = 1, NCOL
+ do c1 = 1, NCOL
+ diff_u = max(diff_u, &
+ abs(relative_change(Re(u_start(c1,c2,i,eo,mu)), &
+ Re(u(c1,c2,i,eo,mu)))))
+ diff_u = max(diff_u, &
+ abs(relative_change(Im(u_start(c1,c2,i,eo,mu)), &
+ Im(u(c1,c2,i,eo,mu)))))
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ if (my_pe() == 0) then
+ write(UREC, *)
+ write(UREC,400) "Configuration changes (maximal abs. relative changes):"
+ write(UREC, *)
+ write(UREC,410) "Generator field:", diff_p
+ write(UREC,410) "Gauge field: ", diff_u
+ write(UREC, *)
+ write(UREC, *)
+ write(UREC,400) "Energy changes:"
+ write(UREC, *)
+ write(UREC,420) "Energy ", "old value", "rel.change"
+ write(UREC, *)
+ write(UREC,430) "H_generator", hp_start, relative_change(hp_start, hp)
+ write(UREC,430) "H_gauge ", hg_start, relative_change(hg_start, hg)
+ write(UREC,430) "H_fermion_1", hf1_start, relative_change(hf1_start, hf1)
+ write(UREC,430) "H_fermion_2", hf2_start, relative_change(hf2_start, hf2)
+ write(UREC,430) "H_det ", hd_start, relative_change(hd_start, hd)
+ write(UREC, *)
+ endif
+
+400 format (1x, a)
+410 format (1x, a, e8.1)
+420 format (1x, a, a20, a12)
+430 format (1x, a, e20.10, e12.1)
+
+ else
+ call die("hmc_test_report(): illegal test flag.")
+ endif
+
+contains
+
+ REAL function relative_change(old, new)
+
+ implicit none
+ REAL, intent(in) :: old, new
+
+ if (old == ZERO) then
+ relative_change = ZERO
+ else
+ relative_change = (new - old) / old
+ endif
+
+ end function relative_change
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_u.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_u.F90
new file mode 100644
index 0000000000000000000000000000000000000000..fa3ad5724f480953ad4c36b0f830178a80951b32
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_u.F90
@@ -0,0 +1,60 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_u.F90 - U := exp(i * lambda_j * P_j * step) * U
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine hmc_u(p, conf, step, para)
+
+ use typedef_hmc
+ use module_switches
+ use module_vol
+ implicit none
+
+ GENERATOR_FIELD, intent(in) :: p
+ type(hmc_conf), intent(inout) :: conf
+ REAL, intent(in) :: step
+ type(hmc_para), intent(in) :: para
+
+ GENERATOR :: q
+ SU3 :: v
+ integer :: i, mu, eo, j
+
+ TIMING_START(timing_bin_hmc_u)
+
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ !$omp parallel do private(j, q, v)
+ do i = 1, VOLH
+ do j = 1, NGEN
+ q(j) = p(j, i, eo, mu) * step
+ enddo
+ call gen2u(v, q)
+ call u_update(conf%u(1, 1, i, eo, mu), v) ! u = v * u
+ call u_normalize(conf%u(1, 1, i, eo, mu))
+ enddo
+ enddo
+ enddo
+
+ TIMING_STOP(timing_bin_hmc_u)
+
+ TIMING_START(timing_bin_hmc_xbound_g)
+ call xbound_g_field(conf%u)
+ TIMING_STOP(timing_bin_hmc_xbound_g)
+
+ if (switches%clover) then
+ call clover_init(conf%a, conf%i, conf%b, conf%u, para%csw_kappa)
+ endif
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/index.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/index.F90
new file mode 100644
index 0000000000000000000000000000000000000000..57a35648262b1bd93ee1b786b7c6680e071f10aa
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/index.F90
@@ -0,0 +1,144 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! index.F90 - functions for index calculations
+! these functions work "stand alone"
+!
+!-------------------------------------------------------------------------------
+ integer function i_e_o(dim, i) ! returns EVEN or ODD (0 or 1)
+
+ implicit none
+ integer dim, i(dim), d
+
+ i_e_o = 0
+ do d = 1, dim
+ i_e_o = i_e_o + i(d)
+ enddo
+
+ i_e_o = mod(abs(i_e_o), 2)
+ end
+
+!-------------------------------------------------------------------------------
+ integer function i_global(i_local, N_local, i_pe)
+
+ implicit none
+ integer i_local, N_local, i_pe
+
+ i_global = i_pe * N_local + i_local
+
+ end
+
+!-------------------------------------------------------------------------------
+ integer function i_local(i_global, N_local, i_pe)
+
+ implicit none
+ integer i_global, N_local, i_pe
+
+ i_local = i_global - i_pe * N_local
+
+ end
+
+!-------------------------------------------------------------------------------
+ integer function i_periodic(i, n)
+
+ implicit none
+ integer i, n
+
+ if (i .ge. 0) then
+ if (i .lt. n) then
+ i_periodic = i
+ elseif (i .lt. 2 * n) then
+ i_periodic = i - n
+ else
+ i_periodic = mod(i, n)
+ endif
+ else
+ if (i .ge. -n) then
+ i_periodic = i + n
+ else
+ i_periodic = i + (1 - (i + 1) / n) * n
+ endif
+ endif
+ end
+
+!-------------------------------------------------------------------------------
+ integer function ieo(dim, i, n)
+
+ implicit none
+ integer dim, i(dim), n(dim), d, ilex
+
+ ieo = ilex(dim, i, n) / 2
+ end
+
+!-------------------------------------------------------------------------------
+ integer function ilex(dim, i, n)
+
+ implicit none
+ integer dim, i(dim), n(dim), d
+
+ ilex = i(dim)
+ do d = dim - 1, 1, -1
+ ilex = ilex * n(d) + i(d)
+ enddo
+
+ end
+
+!-------------------------------------------------------------------------------
+ integer function n_sites(dim, direction, n, npe)
+
+! returns number of sites of local grid and boundaries
+! n_sites(dim, (/0, 0, ..., 0/), n, npe) is the (local) grid volume
+
+ implicit none
+ integer dim, direction(dim), n(dim), npe(dim), d
+
+ n_sites = 1
+ do d = 1, dim
+ if (direction(d) .eq. 0) then
+ n_sites = n_sites * n(d)
+ else
+ if (npe(d) .eq. 1) then ! grid not partitioned in d-direction
+ n_sites = 0
+ return
+ endif
+ endif
+ enddo
+
+ end
+
+!-------------------------------------------------------------------------------
+!! subroutine uneo(ieo, eo, dim, i, n) ! returns i for given (ieo, eo)
+!!
+!! implicit none
+!! integer ieo, eo, dim, i(dim), n(dim), e_o
+!!
+!! call unlex(2 * ieo, dim, i, n)
+!! i(1) = i(1) + ieor(e_o(dim-1, i(2)), eo)
+!!
+!! end
+!!
+!-------------------------------------------------------------------------------
+ subroutine unlex(ilex, dim, i, n)
+
+ ! remember the range of ilex: 0 <= ilex < (n(1) * ... * n(dim))
+
+ implicit none
+ integer ilex, dim, i(dim), n(dim), j, d
+
+ j = ilex
+ do d = 1, dim - 1
+ i(d) = mod(j, n(d))
+ j = j / n(d) ! integer division
+ enddo
+ i(dim) = j
+
+ end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/index2.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/index2.F90
new file mode 100644
index 0000000000000000000000000000000000000000..1f1c3aa3dfabd03c13c5f921514cfe17b35cb6c2
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/index2.F90
@@ -0,0 +1,55 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! index2.F90 - more functions for index calculations
+! these functions use modules
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine local2global(home, local, global)
+
+ use module_lattice
+ implicit none
+
+ integer, intent(in) :: home ! home process of "local"
+ integer, intent(in), dimension(DIM) :: local ! local coordinates
+ integer, intent(out),dimension(DIM) :: global ! global coordinates
+
+ integer, external :: i_global, i_periodic
+ integer :: coord_home(DIM), i
+
+
+ call unlex(home, DIM, coord_home, npe)
+
+ do i = 1, DIM
+ global(i) = i_global(local(i), N(i), coord_home(i))
+ global(i) = i_periodic(global(i), L(i))
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+integer function e_o(local) !// returns EVEN or ODD (0 or 1)
+
+ use module_function_decl
+ implicit none
+
+ integer, intent(in), dimension(DIM) :: local ! local coordinates
+ integer, dimension(DIM) :: global ! global coordinates
+ integer, external :: i_e_o
+
+ call local2global(my_pe(), local, global)
+
+ e_o = i_e_o(DIM, global)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/init_common.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/init_common.F90
new file mode 100644
index 0000000000000000000000000000000000000000..61a6cf938f34632a4e0c85981dc320e616ec3f0a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/init_common.F90
@@ -0,0 +1,396 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! init_common.F90 - initialize common blocks (now: mostly modules)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine init_common(para)
+
+ use typedef_para
+ implicit none
+ type(type_para), intent(in) :: para
+
+ call init_common_lattice(para%L, para%NPE, para%bc_fermions, para%gamma_index)
+ call init_common_vol(para%L, para%NPE)
+ call init_common_nnpe(para%NPE)
+ call init_common_offset()
+ call init_common_nn()
+ call init_common_thread()
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_common_lattice(ll, pe, bc_f, gamma_i)
+
+ use module_function_decl
+ use module_lattice
+ implicit none
+
+ integer, dimension(DIM), intent(in) :: ll, pe, bc_f, gamma_i
+
+ integer :: i, count(DIM)
+
+ L = ll
+ NPE = pe
+ N = ll / pe
+ NH = N
+
+ NH(1) = N(1) / 2
+
+ bc_fermions = bc_f
+ gamma_index = gamma_i
+
+ if (mod(L(1), 2) /= 0) call die("init_common_lattice(): LX must be even")
+ if (mod(L(2), 2) /= 0) call die("init_common_lattice(): LY must be even")
+ if (mod(L(3), 2) /= 0) call die("init_common_lattice(): LZ must be even")
+ if (mod(L(4), 2) /= 0) call die("init_common_lattice(): LT must be even")
+
+ if (mod(N(1), 2) /= 0) call die("init_common_lattice(): NX must be even")
+
+ if (NPE(1) * NPE(2) * NPE(3) * NPE(4) /= num_pes()) then
+ call die("init_common_lattice(): N_PEs wrong")
+ endif
+
+if (N(1)*NPE(1)/=L(1)) call die("init_common_lattice(): NPEX not divider of LX")
+if (N(2)*NPE(2)/=L(2)) call die("init_common_lattice(): NPEY not divider of LY")
+if (N(3)*NPE(3)/=L(3)) call die("init_common_lattice(): NPEZ not divider of LZ")
+if (N(4)*NPE(4)/=L(4)) call die("init_common_lattice(): NPET not divider of LT")
+
+ count = 0
+ do i = 1, DIM
+ if (gamma_index(i) < 1 .or. gamma_index(i) > DIM) then
+ call die("init_common_lattice(): gamma_index: out of range")
+ endif
+ count(gamma_index(i)) = count(gamma_index(i)) + 1
+ enddo
+
+ do i = 1, DIM
+ if (count(i) /= 1) then
+ call die("init_common_lattice(): gamma_index: inconsistent")
+ endif
+ enddo
+
+ select case (version_of_d())
+ case(3,4,22)
+ do i = 1, DIM
+ if (gamma_index(i) /= i) then
+ call die( &
+ "init_common_lattice(): gamma_index: not changeable for this version of D()")
+ endif
+ enddo
+ end select
+
+ do i = 1, DIM
+ decomp_direction(gamma_index(i)) = i
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_common_nn()
+
+ use module_function_decl
+ use module_lattice
+ use module_vol
+ use module_nn
+ implicit none
+
+ integer :: x, y, z, t, i, eo, mu, fb, dir, tmp
+ integer, dimension (DIM) :: j, start, end
+ integer, external :: e_o, xyzt2i, i_periodic
+ integer, parameter :: out_of_range = 2000000000
+
+ allocate(nn(volh_tot, EVEN:ODD, DIM, FWD:BWD))
+
+ do fb = FWD, BWD
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ !$omp parallel do
+ do i = 1, volh_tot
+ nn(i, eo, mu, fb) = out_of_range
+ enddo
+ enddo
+ enddo
+ enddo
+
+ do mu = 1, DIM
+ if (NPE(mu) == 1) then
+ start(mu) = 0
+ end(mu) = N(mu) - 1
+ else
+ start(mu) = -1
+ end(mu) = N(mu)
+ endif
+ enddo
+
+ do t = start(4), end(4)
+ do z = start(3), end(3)
+ do y = start(2), end(2)
+ do x = start(1), end(1)
+
+ j = (/x,y,z,t/)
+
+ i = xyzt2i(j)
+ eo = e_o(j)
+
+ do fb = FWD, BWD
+ if (fb == FWD) then
+ dir = +1
+ else
+ dir = -1
+ endif
+ do mu = 1, DIM
+ j = (/x, y, z, t/)
+ j(mu) = j(mu) + dir
+
+ if (NPE(mu) == 1) j(mu) = i_periodic(j(mu), L(mu))
+
+ if (j(mu) < -1 .or. j(mu) > N(mu)) then
+ nn(i, eo, gamma_index(mu), fb) = out_of_range
+ else
+ nn(i, eo, gamma_index(mu), fb) = xyzt2i(j)
+ endif
+ enddo
+ enddo
+
+ enddo
+ enddo
+ enddo
+ enddo
+
+ do fb = FWD, BWD
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ do i = 1, volh
+ tmp = nn(i, eo, mu, fb)
+ if (tmp < 1 .or. tmp > volh_tot) call die("init_common_nn(): error1")
+ if (num_pes() == 1 .and. tmp > volh) call die("init_common_nn(): error2")
+ enddo
+ enddo
+ enddo
+ enddo
+
+ do fb = FWD, BWD
+ do mu = 1, DIM
+ do eo = EVEN, ODD
+ do i = 1, volh
+ tmp = nn(i, eo, mu, fb)
+ if (nn(tmp, EVEN + ODD - eo, mu, FWD + BWD - fb) /= i) &
+ call die("init_common_nn(): error3")
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_common_nnpe(NPE)
+
+ use module_function_decl
+ use module_nnpe
+ implicit none
+
+ integer, dimension (DIM) :: NPE, i, j, start, end
+ integer, external :: i_periodic, ilex
+ integer :: me, pe, x, y, z, t, mu
+
+ me = my_pe()
+ nnpe(:, :, :, :) = me
+
+ call unlex(me, DIM, i, NPE)
+
+ do mu = 1, DIM
+ if (NPE(mu) == 1) then
+ start(mu) = 0
+ end(mu) = 0
+ else
+ start(mu) = -1
+ end(mu) = 1
+ endif
+ enddo
+
+ do t = start(4), end(4)
+ do z = start(3), end(3)
+ do y = start(2), end(2)
+ do x = start(1), end(1)
+
+ j(1) = i_periodic(i(1) + x, NPE(1))
+ j(2) = i_periodic(i(2) + y, NPE(2))
+ j(3) = i_periodic(i(3) + z, NPE(3))
+ j(4) = i_periodic(i(4) + t, NPE(4))
+
+ pe = ilex(DIM, j, NPE)
+
+ if (pe < 0 .or. pe >= num_pes()) then
+ call die ("init_common_pe(): pe out of range")
+ endif
+
+ nnpe(x, y, z, t) = pe
+
+ enddo
+ enddo
+ enddo
+ enddo
+
+ ASSERT(nnpe(0,0,0,0) == my_pe())
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_common_offset()
+
+ use module_lattice
+ use module_vol
+ use module_offset
+ implicit none
+
+ integer, external :: n_sites
+ integer :: x, y, z, t, off, off2, mu
+ integer :: start(DIM), end(DIM)
+
+
+ !!ASSERT(n_sites(DIM, (/0,0,0,0/), N, NPE) == vol)
+ !!ASSERT(n_sites(DIM, (/0,0,0,0/), NH, NPE) == volh)
+
+ do mu = 1, DIM
+ if (NPE(mu) == 1) then
+ start(mu) = 0
+ end(mu) = 0
+ else
+ start(mu) = -1
+ end(mu) = 1
+ endif
+ enddo
+
+ off = n_sites(DIM,(/0,0,0,0/), NH, NPE) ! volh
+
+ do t = start(4), end(4)
+ do z = start(3), end(3)
+ do y = start(2), end(2)
+ do x = start(1), end(1)
+
+ if (x == 0 .and. y == 0 .and. z == 0 .and. t == 0) then
+ offset(x,y,z,t) = 0
+ else
+ offset(x,y,z,t) = off
+ off = off + n_sites(DIM, (/x,y,z,t/), NH, NPE)
+ endif
+
+ enddo
+ enddo
+ enddo
+ enddo
+
+ off2 = 1
+ do mu = 1, DIM
+ if (NPE(mu) == 1) then
+ off2 = off2 * NH(mu)
+ else
+ off2 = off2 * (NH(mu) + 2)
+ endif
+ enddo
+
+ ASSERT(off == off2)
+ ASSERT(off <= volh_tot)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_common_vol(L, NPE)
+
+ use module_vol
+ implicit none
+
+ integer, dimension(DIM), intent(in) :: L, NPE
+ integer, dimension(DIM) :: N
+
+ N = L / NPE
+
+ volume = L(1) * L(2) * L(3) * L(4)
+ vol = N(1) * N(2) * N(3) * N(4)
+ volh = vol / 2
+ volh_tot = (N(1)/2 + 2) * (N(2) + 2) * (N(3) + 2) * (N(4) + 2)
+
+ size_sc_field = SIZE_COMPLEX * NDIRAC * NCOL * volh
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_common_thread()
+
+ use module_function_decl
+ use module_thread
+ use module_vol
+ implicit none
+
+ integer :: i, size
+
+#ifdef _OPENMP
+ integer :: omp_get_max_threads
+ n_thread = omp_get_max_threads()
+#else
+ n_thread = 1
+#endif
+
+ if (version_of_d() /= 21 .and. &
+ version_of_d() /= 22 .and. &
+ version_of_d() > 2) then
+ if (n_thread < 2) then
+ call die("init_common_thread(): " // &
+ "need at least 2 threads for compiled version of D()")
+ endif
+ endif
+
+ allocate(xyz_start(0:n_thread-1))
+ allocate(xyz_end(0:n_thread-1))
+ allocate(t_start(0:n_thread-1))
+ allocate(t_end(0:n_thread-1))
+
+ if (n_thread == 1) then
+ xyz_start(0) = 1
+ xyz_end(0) = volh
+ t_start(0) = 1
+ t_end(0) = volh
+ else
+ xyz_start(0) = 0
+ xyz_end(0) = -1
+ call init_common_thread_split(n_thread - 1, volh, xyz_start(1), xyz_end(1))
+ call init_common_thread_split(n_thread, volh, t_start, t_end)
+ endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_common_thread_split(n, size, start, end)
+
+ implicit none
+ integer, intent(in) :: n, size
+ integer, intent(out) :: start(n), end(n)
+ integer :: chunk, rest, i
+
+ chunk = size / n
+ rest = size - (chunk * n)
+
+ start(1) = 1
+ end(1) = chunk
+ if (rest >= 1) end(1) = end(1) + 1
+
+ do i = 2, n
+ start(i) = end(i - 1) + 1
+ end(i) = end(i - 1) + chunk
+ if (rest >= i) end(i) = end(i) + 1
+ enddo
+
+ ASSERT(end(n) == size)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/init_modules.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/init_modules.F90
new file mode 100644
index 0000000000000000000000000000000000000000..ba4de3f936678abd83f1217655a1b81ed40c2f1e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/init_modules.F90
@@ -0,0 +1,119 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! init_modules.F90 - initialise (some) modules
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine init_modules()
+
+ call init_module_decomp()
+ call init_module_lattice_io()
+ call init_module_sc_size()
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_module_decomp()
+
+ use module_decomp
+ use module_function_decl
+ use module_lattice
+ use module_vol
+ implicit none
+
+ integer :: i, j, me, i_pe(DIM)
+ integer :: x, y, z, t, eo
+ integer :: x_std(DIM), i_std
+ integer :: x_act(DIM), i_act
+ integer :: me_act, me_std
+
+ integer, external :: e_o
+ integer, external :: ieo
+
+
+ allocate(decomp%std%i(volh, EVEN:ODD))
+ allocate(decomp%act%i(volh, EVEN:ODD))
+
+ me = my_pe()
+ call unlex(me, DIM, i_pe, NPE)
+
+ decomp%act%L = L
+ decomp%act%NPE = NPE
+ decomp%act%N = N
+ decomp%act%NH = NH
+ decomp%act%i_pe = i_pe
+ decomp%act%bc_fermions = bc_fermions
+ decomp%gamma_index = gamma_index
+ decomp%direction = decomp_direction
+
+ do j = 1, DIM
+ i = gamma_index(j)
+
+ decomp%std%L(i) = L(j)
+ decomp%std%NPE(i) = NPE(j)
+ decomp%std%N(i) = N(j)
+ decomp%std%i_pe(i) = i_pe(j)
+ decomp%std%bc_fermions(i) = bc_fermions(j)
+ enddo
+
+ decomp%std%NH(1) = decomp%std%N(1) / 2
+ decomp%std%NH(2) = decomp%std%N(2)
+ decomp%std%NH(3) = decomp%std%N(3)
+ decomp%std%NH(4) = decomp%std%N(4)
+
+
+ decomp%std%i = 0
+ decomp%act%i = 0
+
+ do t = 0, decomp%act%N(4) - 1
+ do z = 0, decomp%act%N(3) - 1
+ do y = 0, decomp%act%N(2) - 1
+ do x = 0, decomp%act%N(1) - 1
+
+ x_act(1) = x
+ x_act(2) = y
+ x_act(3) = z
+ x_act(4) = t
+
+ x_std(gamma_index(1)) = x_act(1)
+ x_std(gamma_index(2)) = x_act(2)
+ x_std(gamma_index(3)) = x_act(3)
+ x_std(gamma_index(4)) = x_act(4)
+
+ i_std = ieo(DIM, x_std, decomp%std%N) + 1
+ i_act = ieo(DIM, x_act, decomp%act%N) + 1
+ eo = e_o(x_act)
+
+ decomp%std%i(i_act, eo) = i_std
+ decomp%act%i(i_std, eo) = i_act
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_module_lattice_io()
+
+ use module_decomp
+ use module_lattice_io
+ implicit none
+
+ L = decomp%std%L
+ N = decomp%std%N
+ NH = decomp%std%NH
+ NPE = decomp%std%NPE
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/iteration_count.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/iteration_count.F90
new file mode 100644
index 0000000000000000000000000000000000000000..da324c4e50424ad482a3bb5384164393552af0c1
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/iteration_count.F90
@@ -0,0 +1,72 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! iteration_count.F90 - counts iterations in Hasenbusch improvement,
+! does not work with tempering
+!
+!-------------------------------------------------------------------------------
+module module_iteration_count
+
+ integer, save :: it_f1 = 0 ! iterations in dsf1()
+ integer, save :: it_f2 = 0 ! iterations in dsf2()
+end
+
+!-----------------------------------------------------------------------------
+subroutine iteration_count_f1(iter)
+
+ use module_iteration_count
+ implicit none
+ integer :: iter
+
+ it_f1 = it_f1 + iter
+end
+
+!-----------------------------------------------------------------------------
+subroutine iteration_count_f2(iter)
+
+ use module_iteration_count
+ implicit none
+ integer :: iter
+
+ it_f2 = it_f2 + iter
+end
+
+!-----------------------------------------------------------------------------
+subroutine iteration_count_write(unit)
+
+ use module_counter
+ use module_function_decl
+ use module_iteration_count
+ use module_switches
+
+ implicit none
+
+ integer, intent(in) :: unit
+ integer, save :: written = 0
+
+ character(*), parameter :: key = "%it"
+ character(*), parameter :: fmt_h = "(1x, 2a, a6, 2a16)"
+ character(*), parameter :: fmt_b = "(1x, a4, i6, 2i16)"
+
+ if (switches%hasenbusch) then
+
+ if (written == 0 .and. my_pe() == 0) then
+ write(unit, fmt_h) "T", key, "traj", "iterations(F1)", "iterations(F2)"
+ endif
+
+ if (my_pe() == 0) write(unit, fmt_b) key, counter%traj, it_f1, it_f2
+
+ endif
+
+ written = written + 1
+ it_f1 = 0
+ it_f2 = 0
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/m_tilde.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/m_tilde.F90
new file mode 100644
index 0000000000000000000000000000000000000000..9782c4ede8d1d0226bfa97cf24df93a1a00ee280
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/m_tilde.F90
@@ -0,0 +1,107 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! m_tilde.F90 - matrix multiplications involving :
+!
+! M~ := \tilde{M}
+! M~+ := \tilde{M}^\dagger
+!
+! M~ = 1 - kappa^2 Deo Doe (Wilson fermions)
+! M~ = Tee - kappa^2 Deo Inv(Too) Doe (Wilson fermions + clover)
+! M~ = H - kappa^2 Deo Inv(H) Doe (Wilson fermions + external h)
+!
+! and
+!
+! \tilde{M}^\dagger =: M~+ = 1 - kappa^2 Doe+ Deo+
+!
+! subroutine mtil: out = M~ in
+! subroutine mtil_dag: out = M~+ in
+! subroutine mtdagmt: out = (M~+ M~) in --> mtdagmt.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine mtil(out, in, para, conf)
+
+ use typedef_hmc
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+
+ SPINCOL_FIELD, intent(out) :: out
+ SPINCOL_FIELD, intent(in) :: in
+ P_SPINCOL_FIELD, save :: tmp
+ REAL :: b
+
+
+ ALLOCATE_SC_FIELD(tmp)
+
+ if (para%kappa /= ZERO) then
+ call d(ODD, EVEN, tmp, in, conf%u)
+ if (para%csw_kappa /= ZERO) call clover_mult_ao(conf%i(1,1,ODD), tmp, volh)
+ if (para%h /= ZERO) call h_mult_b(-para%h, tmp, volh)
+ call d(EVEN, ODD, out, tmp, conf%u)
+ endif
+
+ b = -para%kappa**2 / (ONE + para%h**2)
+
+ if (para%csw_kappa /= ZERO) then
+ call clover_mult_a(tmp, conf%a(1,1,EVEN), in, volh)
+ call sc_xpby(out, tmp, b) ! out = tmp - kappa**2 * out
+ else
+ call sc_xpby(out, in, b) ! out = in - kappa**2 * out
+ if (para%h /= ZERO) call h_mult_a(out, para%h, in, volh)
+ endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine mtil_dag(out, in, para, conf)
+
+ use typedef_hmc
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+
+ SPINCOL_FIELD, intent(out) :: out
+ SPINCOL_FIELD, intent(in) :: in
+ P_SPINCOL_FIELD, save :: tmp
+ REAL :: b
+
+
+ ALLOCATE_SC_FIELD(tmp)
+
+ if (para%kappa /= ZERO) then
+ call d_dag(ODD, EVEN, tmp, in, conf%u)
+ if (para%csw_kappa /= ZERO) call clover_mult_ao(conf%i(1,1,ODD), tmp, volh)
+ if (para%h /= ZERO) call h_mult_b(para%h, tmp, volh)
+ call d_dag(EVEN, ODD, out, tmp, conf%u)
+ endif
+
+ b = -para%kappa**2 / (ONE + para%h**2)
+
+ if (para%csw_kappa /= ZERO) then
+ call clover_mult_a(tmp, conf%a(1,1,EVEN), in, volh)
+ call sc_xpby(out, tmp, b) ! out = tmp - kappa**2 * out
+ else
+ call sc_xpby(out, in, b) ! out = in - kappa**2 * out
+ if (para%h /= ZERO) call h_mult_a(out, -para%h, in, volh)
+ endif
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/mc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/mc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..b63a456278c9dedbb0242dcdda866ad5ab444b29
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/mc.F90
@@ -0,0 +1,299 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! mc.F90 - Monte Carlo loop (including tempering)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module typedef_mc_temper
+
+ type temper_out
+ REAL :: delta_h
+ integer :: pair
+ integer :: swapped
+ end type temper_out
+
+end module typedef_mc_temper
+
+!-------------------------------------------------------------------------------
+subroutine mc(para, conf)
+
+ use typedef_mc_temper
+ use typedef_para
+ use module_counter
+ use module_function_decl
+ use module_switches
+ use module_vol
+ implicit none
+
+ type(type_para), intent(in) :: para
+ type(hmc_conf), dimension(MAX_TEMPER), intent(inout) :: conf
+
+ type(hmc_out), dimension(MAX_TEMPER) :: out
+ type(temper_out), dimension(MAX_TEMPER) :: tmpr
+ integer :: i, j
+ integer :: iforce, itraj
+ integer :: force_accept = 0
+ character(len = *), parameter :: key_fa = "%fa"
+ character(len = *), parameter :: key_mc = "%mc"
+ character(len = *), parameter :: key_swap = "%sw"
+ REAL :: plaq
+ REAL, external :: sf, sg
+
+ if (switches%hmc_test) then
+ if (para%n_temper /= 1) call die("mc(): HMC-test: n_ensemble has to be 1")
+ call hmc_test(para%hmc(1), conf(1))
+ return
+ endif
+
+ force_accept = 1
+ iforce = 0
+ do while (counter%traj < 0 .and. counter%j_traj < para%ntraj)
+
+ iforce = iforce + 1
+ counter%traj = counter%traj + 1
+ counter%j_traj = counter%j_traj + 1
+
+ if (iforce == 1 .and. my_pe() == 0) then
+ call begin(UREC, "ForceAcceptance")
+ write(UREC, 405) "T", key_fa, "i_fa", "e", &
+ "PlaqEnergy", "exp(-Delta_H)", "CGcalls", "CGitTot", "CGitMax"
+ endif
+
+ do i = 1, para%n_temper
+ call hmc(para%hmc(i), conf(i), out(i), force_accept, HMC_TEST_NONE)
+
+ plaq = out(i)%sg / (SIX * volume)
+
+ if (my_pe() == 0) then
+ write(UREC, 415) key_fa, counter%traj, i, plaq, out(i)%exp_dh, &
+ out(i)%cg_ncall, out(i)%cg_niter_tot, out(i)%cg_niter_max
+ endif
+ enddo
+ enddo
+
+ if (iforce > 0) then
+ call end_A(UREC, "ForceAcceptance")
+ endif
+
+405 format (1x, 2a, 2a5, 2a15, 3a8)
+415 format (1x, a4, 2i5, 2f15.10, 3i8)
+
+
+ force_accept = 0
+ itraj = 0
+ do while (counter%traj >= 0 .and. counter%traj < para%maxtraj &
+ .and. counter%j_traj < para%ntraj)
+
+ itraj = itraj + 1
+ counter%traj = counter%traj + 1
+ counter%j_traj = counter%j_traj + 1
+
+ if (itraj == 1 .and. my_pe() == 0) then
+ call begin(UREC, "MC")
+
+ if (para%n_temper > 1) write(UREC, 450) &
+ "T", key_swap, "traj", "ie", "e1", "e2", "Delta_H", "Acc"
+
+ write(UREC, 400) "T", key_mc, "traj", "e", "f", &
+ "PlaqEnergy", "exp(-Delta_H)", "Acc", &
+ "CGcalls", "CGitTot", "CGitMax"
+ endif
+
+ do i = 1, para%n_temper
+ if (itraj == 1) then
+ if (switches%tempering .and. switches%dynamical) then
+ out(i)%sf = sf(para%hmc(i), conf(i))
+ else
+ out(i)%sf = ZERO
+ endif
+ out(i)%sg = sg(conf(i)%u)
+ endif
+
+ call hmc(para%hmc(i), conf(i), out(i), force_accept, HMC_TEST_NONE)
+ enddo
+
+ if (para%n_temper > 1 .and. counter%traj > para%nstd) then
+ call temper(para%n_temper, para%swap_seq, para%hmc, conf, out, tmpr)
+ do i = 1, para%n_temper - 1
+ if (my_pe() == 0) write(UREC, 460) key_swap, counter%traj, i, &
+ tmpr(i)%pair, tmpr(i)%pair + 1, tmpr(i)%delta_h, tmpr(i)%swapped
+ enddo
+ endif
+
+ do i = 1, para%n_temper
+ j = conf(i)%former
+
+ if (my_pe() == 0) write(UREC,410) key_mc, counter%traj, i, j, &
+ out(i)%sg / (SIX * volume), out(i)%exp_dh, out(i)%accepted, &
+ out(i)%cg_ncall, out(i)%cg_niter_tot, out(i)%cg_niter_max
+
+ if (switches%measure_traces) &
+ call traces(para%hmc(i), conf(i), counter%traj, i, j)
+
+ if (switches%measure_polyakov_loop) &
+ call polyakov_loop(conf(i), counter%traj, i, j)
+
+ call cooling(conf(i)%u, counter%traj, i, j)
+ enddo
+
+ if (para%nsave > 0) then
+ if (mod(counter%traj, para%nsave) == 0) then
+ call conf_write(.false., para, conf)
+ endif
+ endif
+
+ enddo
+
+ if (itraj > 0) then
+ call end_A(UREC, "MC")
+ endif
+
+400 format (1x, 2a, a6, 2a3, 2a15, a4, 3a8)
+410 format (1x, a4, i6, 2i3, 2f15.10, i4, 3i8)
+
+450 format (1x, 2a, a6, 3a3, a18, a4)
+460 format (1x, a4, i6, 3i3, f18.10, i4)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine temper(n_temper, swap_seq, para, conf, action, tmpr)
+
+ use typedef_hmc
+ use typedef_mc_temper
+ use module_function_decl
+ use module_p_interface
+ implicit none
+
+ integer, intent(in) :: n_temper, swap_seq
+ type(hmc_para), dimension(n_temper), intent(in) :: para
+ type(hmc_conf), dimension(n_temper), intent(inout) :: conf
+ type(hmc_out), dimension(n_temper), intent(inout) :: action
+ type(temper_out), dimension(n_temper), intent(out) :: tmpr
+
+ integer, dimension(n_temper) :: pair
+ integer :: i_pair, n_pair, i, j
+ integer :: swapped
+ REAL, dimension(2, 2) :: hf, hg
+ REAL, external :: sf, sg
+ REAL :: h_old, h_new, delta_h
+ REAL :: random
+
+ if (n_temper == 1) return
+
+ do i = 1, n_temper
+ conf(i)%former = i
+ enddo
+
+ n_pair = n_temper - 1
+
+ call swap_sequence(swap_seq, pair, n_pair)
+
+ do i_pair = 1, n_pair
+ i = pair(i_pair)
+ j = pair(i_pair) + 1
+
+ hg(1, 1) = para(i)%beta * action(i)%sg
+ hg(2, 2) = para(j)%beta * action(j)%sg
+
+ hg(1, 2) = para(i)%beta * action(j)%sg
+ hg(2, 1) = para(j)%beta * action(i)%sg
+
+ hf(1, 1) = action(i)%sf
+ hf(2, 2) = action(j)%sf
+ hf(1, 2) = sf(para(i), conf(j))
+ hf(2, 1) = sf(para(j), conf(i))
+
+ if (para(i)%kappa == ZERO) then
+ if (hf(1, 1) /= ZERO) call die("temper(): hf(1, 1) /= 0 ")
+ if (hf(2, 1) /= ZERO) call die("temper(): hf(2, 1) /= 0 ")
+ if (hf(1, 2) /= ZERO) call die("temper(): hf(1, 2) /= 0 ")
+ if (hf(2, 2) /= ZERO) call die("temper(): hf(2, 2) /= 0 ")
+ endif
+
+ h_old = hg(1, 1) + hf(1, 1) + hg(2, 2) + hf(2, 2)
+ h_new = hg(1, 2) + hf(1, 2) + hg(2, 1) + hf(2, 1)
+
+ delta_h = h_new - h_old
+
+ if (ranf() < exp(-delta_h)) then
+ swapped = 1
+ else
+ swapped = 0
+ endif
+
+ if (swapped /= 0) then
+ call swap_p_g_field(conf(i)%u, conf(j)%u)
+ call swap_p_sc_field(conf(i)%phi, conf(j)%phi)
+ call swap_integer(conf(i)%former, conf(j)%former)
+ call swap_real(action(i)%sg, action(j)%sg)
+ action(i)%sf = hf(1, 2)
+ action(j)%sf = hf(2, 1)
+ endif
+
+ tmpr(i_pair)%pair = i
+ tmpr(i_pair)%swapped = swapped
+ tmpr(i_pair)%delta_h = delta_h
+ enddo
+
+ call check_former(n_temper, conf)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine check_former(n_temper, conf)
+
+ use typedef_hmc
+ use module_function_decl
+ implicit none
+
+ integer, intent(in) :: n_temper
+ type(hmc_conf), intent(inout) :: conf(n_temper)
+ integer :: count(n_temper), i
+
+ count = 0
+ do i = 1, n_temper
+ count(conf(i)%former) = count(conf(i)%former) + 1
+ enddo
+
+ do i = 1, n_temper
+ if (count(i) /= 1) call die("check_former(): error")
+ enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine swap_sequence(type, s, n)
+
+ implicit none
+ integer, intent(in) :: type, n
+ integer, intent(out) :: s(n)
+ integer :: i
+
+ select case (type)
+ case (SWAP_UP)
+ do i = 1, n
+ s(i) = i
+ enddo
+ case (SWAP_DOWN)
+ do i = 1, n
+ s(i) = n - i + 1
+ enddo
+ case (SWAP_RANDOM)
+ call random_sequence(s, n)
+ case default
+ call die("swap_sequence(): don't know how to build the sequence")
+ end select
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/misc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/misc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..8bfc895a5a6841503e93170fdfa2ca5d63366f24
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/misc.F90
@@ -0,0 +1,225 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! misc.F90 - miscellaneous (service routines)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine die(msg) ! write "msg" to stderr and abort
+
+ implicit none
+ character(len = *) msg
+
+ write(STDERR,*) msg
+ call abbruch()
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine warn(msg) ! write "msg" to stderr and unit UREC
+ implicit none
+ character(len = *) msg
+
+ write(STDERR,*) "WARNING: ", msg, " !!!"
+ write(UREC,*) "WARNING: ", msg, " !!!"
+end
+
+!-------------------------------------------------------------------------------
+subroutine assertion_failed(file, line, condition) !// used by ASSERT makro
+ implicit none
+ character(len = *) file, condition
+ integer line
+
+ character(len = 8) aline
+
+ write(aline, *) line
+
+ call die(file // " (" // trim(aline) // "): assertion failed: " // condition)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine begin(unit, str) ! write "begin marker"
+
+ use module_function_decl
+ implicit none
+ integer :: unit
+ character(len = *) :: str
+
+ if (my_pe() == 0) write(unit, "(1x,2a)") ">Begin", str
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine end_A(unit, str) ! write "end marker"
+
+ use module_function_decl
+ implicit none
+ integer :: unit
+ character(len = *) :: str
+
+ if (my_pe() == 0) write(unit, "(1x,2a)") ">End", str
+
+end
+
+!-------------------------------------------------------------------------------
+function datum() ! returns date as: YYYY-MM-DD
+
+ implicit none
+ character(len = 10) :: datum
+
+ call date_and_time(date = datum)
+
+ datum(9:10) = datum(7:8)
+ datum(6:7) = datum(5:6)
+
+ datum(5:5) = "-"
+ datum(8:8) = "-"
+
+end
+
+!-------------------------------------------------------------------------------
+function uhrzeit() ! returns time as: hh:mm:ss.sss
+
+ implicit none
+ character(len = 12) :: uhrzeit
+
+ call date_and_time(time = uhrzeit)
+
+ uhrzeit(7:12) = uhrzeit(5:10)
+ uhrzeit(4:5) = uhrzeit(3:4)
+
+ uhrzeit(3:3) = ":"
+ uhrzeit(6:6) = ":"
+
+end
+
+!-------------------------------------------------------------------------------
+function f_exist(file) ! check in file exists
+
+ implicit none
+ logical :: f_exist
+ character(len = *) :: file
+
+ inquire(file = file, exist = f_exist)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine open_diag() ! open debug file on each process
+
+ use module_function_decl
+ implicit none
+ FILENAME :: name
+
+ write(name, '(i4.4)') my_pe()
+ name = 'diag.' // name
+
+ open(UDIAG, file = name)
+
+ write(UDIAG,*) 'Output from PE ', my_pe()
+ write(UDIAG,*) '~~~~~~~~~~~~~~~~~~~'
+ write(UDIAG,*)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine pos_keyword(unit, keyword) ! positions unit at keyword
+
+ implicit none
+ integer, intent(in) :: unit
+ character(len = *), intent(in) :: keyword
+ integer :: iostat
+ character(len = len(keyword) + 1) :: word
+
+ iostat = 0
+ do while (iostat == 0)
+ read(unit, *, iostat = iostat) word
+ if (word == keyword) then
+ backspace(unit)
+ return
+ endif
+ enddo
+
+ call die("pos_keyword(): " // keyword // ": not found")
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine read_keyword_int(unit, keyword, int, dim) ! read integer(s) at keyw.
+
+ implicit none
+ integer, intent(in) :: unit, dim
+ character(len = *), intent(in) :: keyword
+ integer, intent(out) :: int(dim)
+ character :: c
+
+ call pos_keyword(unit, keyword)
+ read(unit, *) c, int
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine read_keyword_REAL(unit, keyword, x, dim) ! read float(s) at keyword
+
+ implicit none
+ integer, intent(in) :: unit, dim
+ character(len = *), intent(in) :: keyword
+ REAL, intent(out) :: x(dim)
+ character :: c
+
+ call pos_keyword(unit, keyword)
+ read(unit, *) c, x
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine swap_endian8(n, a)
+
+ implicit none
+ integer, parameter :: i8 = 8
+ integer, intent(in) :: n
+ integer :: i
+ integer(i8), intent(inout) :: a(n)
+ integer(i8), save :: mask1, mask2, mask3, mask4, &
+ mask5, mask6, mask7, mask8
+ integer(i8) :: tmp
+ logical, external :: is_big_endian
+
+ data mask1 /z'00000000000000FF'/
+ data mask2 /z'000000000000FF00'/
+ data mask3 /z'0000000000FF0000'/
+ data mask4 /z'00000000FF000000'/
+ data mask5 /z'000000FF00000000'/
+ data mask6 /z'0000FF0000000000'/
+ data mask7 /z'00FF000000000000'/
+ data mask8 /z'FF00000000000000'/
+
+
+ if (is_big_endian()) return
+
+ do i = 1, n
+ tmp = 0_i8
+ tmp = ior(tmp, ishft(iand(a(i), mask1), 56_i8))
+ tmp = ior(tmp, ishft(iand(a(i), mask2), 40_i8))
+ tmp = ior(tmp, ishft(iand(a(i), mask3), 24_i8))
+ tmp = ior(tmp, ishft(iand(a(i), mask4), 8_i8))
+ tmp = ior(tmp, ishft(iand(a(i), mask5), -8_i8))
+ tmp = ior(tmp, ishft(iand(a(i), mask6),-24_i8))
+ tmp = ior(tmp, ishft(iand(a(i), mask7),-40_i8))
+ tmp = ior(tmp, ishft(iand(a(i), mask8),-56_i8))
+ a(i) = tmp
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..884239c63b5f8b05e93c5aed91fcf2ffae14bb05
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/Makefile
@@ -0,0 +1,59 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# modules/Makefile
+#
+#===============================================================================
+
+include ../Makefile.defs
+
+.SUFFIXES:
+.SUFFIXES: .a .o .F90
+
+.F90.o:
+ $(FPP) -I.. $(FPPFLAGS) $< > $*.f90
+ $(F90) -c $(FFLAGS) $*.f90
+
+MODULES_DIR = .
+
+MODULES = \
+ typedef_cksum.o \
+ typedef_clover.o \
+ typedef_flags.o \
+ typedef_hmc.o \
+ typedef_para.o \
+ module_bqcd.o \
+ module_counter.o \
+ module_d21.o \
+ module_decomp.o \
+ module_hmc_forces.o \
+ module_lattice.o \
+ module_lattice_io.o \
+ module_nn.o \
+ module_nnpe.o \
+ module_offset.o \
+ module_thread.o \
+ module_vol.o \
+ module_conf_info.o \
+ module_function_decl.o \
+ module_input.o \
+ module_switches.o \
+ module_p_interface.o \
+ module_mre.o
+
+modules: $(MODULES)
+
+fast:
+ $(MAKE)
+
+clean:
+ rm -f *.[Tiod] *.f90 *.mod work.pc work.pcl
+
+clobber: clean
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/README b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/README
new file mode 100644
index 0000000000000000000000000000000000000000..2edcd731ef7bc59540583b4442f0463e6cd88ff9
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/README
@@ -0,0 +1,23 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! modules/README
+!
+!-------------------------------------------------------------------------------
+
+This directory contain "modules" that are needed in more than one
+source files.
+
+"modules" that are needed only in one file are kept in that file.
+
+For historical reasons some information is stored in more than one module.
+Many "modules" were "common blocks" in older versions of the programme.
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/bqcd.pcl
new file mode 100644
index 0000000000000000000000000000000000000000..7f6c06e225907b3e12cb0daddf00f6b753a7ab89
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/bqcd.pcl
@@ -0,0 +1 @@
+work.pc
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_bqcd.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_bqcd.F90
new file mode 100644
index 0000000000000000000000000000000000000000..ad4b746f7d9aac4a9008bbd0818d0ddd36d65710
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_bqcd.F90
@@ -0,0 +1,22 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_bqcd.F90
+!
+!-------------------------------------------------------------------------------
+module module_bqcd
+
+ character(len = *), parameter :: prog_name = "bqcd"
+ character(len = *), parameter :: prog_version = "benchmark2"
+ integer, parameter :: input_version = 4
+ integer, parameter :: conf_info_version = 3
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_conf_info.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_conf_info.F90
new file mode 100644
index 0000000000000000000000000000000000000000..81f8b1e214c3ca217eeb5037f6459fdf69d7f785
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_conf_info.F90
@@ -0,0 +1,47 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_conf_info.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_conf_info
+
+ type type_conf_info
+ REAL, dimension(2) :: beta, kappa, csw, csw_kappa, h
+ REAL :: plaq
+ integer, dimension(DIM) :: L, bc_fermions
+ integer :: rkind
+ integer, dimension(2) :: ensemble
+ end type type_conf_info
+
+ character(len = *), parameter :: k_format = "Format"
+ character(len = *), parameter :: k_prog = "Program"
+ character(len = *), parameter :: k_run = "Run"
+ character(len = *), parameter :: k_traj = "Traj"
+ character(len = *), parameter :: k_host = "Host"
+ character(len = *), parameter :: k_date = "Date"
+ character(len = *), parameter :: k_L = "L"
+ character(len = *), parameter :: k_bc = "bc_fermions"
+ character(len = *), parameter :: k_rkind = "REAL_kind"
+ character(len = *), parameter :: k_plaq = "PlaqEnergy"
+
+ character(len = *), parameter, dimension(2) :: &
+ k_ensemble = (/ "ensemble ", "former_ensemble " /), &
+ k_beta = (/ "beta ", "former_beta " /), &
+ k_kappa = (/ "kappa ", "former_kappa " /), &
+ k_csw = (/ "csw ", "former_csw " /), &
+ k_csw_kappa = (/ "csw_kappa ", "former_csw_kappa" /), &
+ k_h = (/ "h ", "former_h " /)
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_counter.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_counter.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f539c06f9a7af052abcdc1e5acba841021ebea9a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_counter.F90
@@ -0,0 +1,30 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_counter.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_counter
+
+ type type_counter
+ sequence
+ integer :: run
+ integer :: job
+ integer :: traj ! overall trajectory counter
+ integer :: j_traj ! job trajectory counter
+ end type type_counter
+
+ type(type_counter), save :: counter
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_d21.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_d21.F90
new file mode 100644
index 0000000000000000000000000000000000000000..d35a29d17f3dd7396bae9caeff19d5947c76a54e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_d21.F90
@@ -0,0 +1,21 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_d21.F90 - Two component spincol field used in d/D21.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_d21
+ P_SC2_FIELD, save :: a
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_decomp.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_decomp.F90
new file mode 100644
index 0000000000000000000000000000000000000000..677e085e15f2e80d18840a0a86471bc7ae810657
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_decomp.F90
@@ -0,0 +1,36 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_decomp.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_decomp
+
+ type type_decomp1
+ integer, dimension(DIM) :: L, N, NH, NPE
+ integer, dimension(DIM) :: i_pe
+ integer, dimension(DIM) :: bc_fermions
+ INTEGER, dimension(:, :), pointer :: i
+ end type type_decomp1
+
+ type type_decomp2
+ type(type_decomp1) :: std ! "standard"
+ type(type_decomp1) :: act ! "actual" (essentially module_lattice)
+ integer, dimension(DIM) :: gamma_index
+ integer, dimension(DIM) :: direction
+ end type type_decomp2
+
+ type(type_decomp2), save :: decomp
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_function_decl.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_function_decl.F90
new file mode 100644
index 0000000000000000000000000000000000000000..4e4afc9bc67f8d465b82de35223322c32795dc96
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_function_decl.F90
@@ -0,0 +1,67 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_function_decl.F90 - declaration of functions
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_function_decl
+
+ ! functions in misc.F90:
+
+ character(len=10), external :: datum
+ character(len=12), external :: uhrzeit
+ character(len=20), external :: rechner
+ logical, external :: f_exist
+
+ ! processes (-> comm/pes*.F90):
+
+ integer, external :: num_pes
+
+#ifdef CRAY
+ integer, intrinsic :: my_pe ! gives the same id as MPI_Rank in MPI_COMM_WORLD
+#else
+ integer, external :: my_pe
+#endif
+
+ ! global reduction (-> comm/reduction_*.F90)
+
+ REAL, external :: dotprod
+ REAL, external :: global_sum
+ real, external :: global_min
+ real, external :: global_max
+
+ ! sc-field (-> sc.F90)
+
+ REAL, external :: sc_norm2
+ REAL, external :: sc_dot
+ COMPLEX, external :: sc_cdotc
+
+ ! ranf:
+
+#ifdef CRAY
+ real(8), intrinsic :: ranf
+#else
+ real(8), external :: ranf
+#endif
+
+ ! identification of D (-> d/DVersion.F90)
+
+ integer, external :: version_of_d
+ integer, external :: get_d3_buffer_vol
+
+ ! communication method (-> comm/comm_*.F90)
+
+ COMM_METHOD, external :: comm_method
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_hmc_forces.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_hmc_forces.F90
new file mode 100644
index 0000000000000000000000000000000000000000..c4cd3dbabe5abd6a14c6392d166c44a6de32ac40
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_hmc_forces.F90
@@ -0,0 +1,33 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_hmc_forces.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_hmc_forces
+
+ P_GENERATOR_FIELD, save :: p_old
+
+ integer, parameter :: n_force = 4
+
+ integer, parameter :: i_sg = 1
+ integer, parameter :: i_sd = 2
+ integer, parameter :: i_sf1 = 3
+ integer, parameter :: i_sf2 = 4
+
+ REAL, save :: f_count(n_force)
+ REAL, save :: f_avg(n_force)
+ REAL, save :: f_max(n_force)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.F90
new file mode 100644
index 0000000000000000000000000000000000000000..96aede4988f919ae9b63c9745e0535a8b4a0a74c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.F90
@@ -0,0 +1,174 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_input.F90
+!
+! Simple module to read "keyword value" formatted input. The keywords
+! are defined in "module_input.h" together with a type definition and a
+! default value. The input is stored in the structure "input". The
+! components are called "input%".
+!
+! New input/keywords can be added or modified in "module_input.h". It
+! suffices to modify "module_input.h" execpt for input of type
+! INPUT_ARRAY_ENSEMBLES for which (re)allocation and initialisation
+! routines have to be provided in this file (in subroutines "input_allocate"
+! and "input_reallocate").
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+# define INPUT_ARRAY_DIM integer, dimension(DIM)
+# define INPUT_ARRAY_ENSEMBLES character(para_len), dimension(:), pointer
+# define INPUT_DEFAULT_ENSEMBLES 1
+
+!-------------------------------------------------------------------------------
+module module_input
+
+ implicit none
+
+ !-----------------------------------------------------------------------------
+ public :: type_input ! data structure declaration
+ public :: input ! variable containing data structure
+ public :: input_read ! subroutine input_read(file)
+ public :: input_dump ! subroutine input_dump(unit)
+ !-----------------------------------------------------------------------------
+
+ private
+
+ integer, parameter :: comment_len = 80
+ integer, parameter :: keyword_len = 32
+ integer, parameter :: para_len = 32
+ integer, parameter :: word_len = 8
+
+ type type_input
+
+#define INPUT_INPUT(var, type, default) type :: var
+#include "module_input.h"
+
+ end type type_input
+
+ type(type_input), save :: input
+
+contains
+
+ !=============================================================================
+ subroutine input_read(file)
+
+ character(*), intent(in) :: file
+ integer :: iostat
+ character(keyword_len) :: keyword
+
+ call input_allocate(INPUT_DEFAULT_ENSEMBLES)
+ call input_defaults()
+
+ open(UINPUT, file = file, action = "read", status = "old")
+
+ do
+ read(UINPUT, *, iostat = iostat) keyword
+ if (iostat /= 0) exit
+ if (keyword(1:1) == '#') cycle
+ backspace(UINPUT)
+ select case (keyword)
+
+
+#undef INPUT_INPUT
+#define INPUT_INPUT(var, type, default) \
+ case (#var); read(UINPUT, *) keyword, input% ## var
+#include "module_input.h"
+
+ case default
+ call die("input_read(): " // trim(keyword) &
+ // ": unknown keyword")
+
+ end select
+
+ if (keyword == "ensembles") then
+ call input_reallocate(input%ensembles)
+ endif
+
+ enddo
+ close(UINPUT)
+
+ end subroutine input_read
+
+ !-----------------------------------------------------------------------------
+ subroutine input_dump(unit)
+ integer :: unit
+
+ if (.not. associated(input%beta)) then
+ call input_allocate(INPUT_DEFAULT_ENSEMBLES)
+ call input_defaults()
+ endif
+
+#undef INPUT_INPUT
+#define INPUT_INPUT(var, type, default) write(unit,*) #var, " ", input% ## var
+#include "module_input.h"
+
+ end subroutine input_dump
+
+ !-----------------------------------------------------------------------------
+ subroutine input_defaults()
+
+#undef INPUT_INPUT
+#define INPUT_INPUT(var, type, default) input% ## var = default
+#include "module_input.h"
+
+ end subroutine input_defaults
+
+ !-----------------------------------------------------------------------------
+ subroutine input_allocate(size)
+
+ integer :: size
+
+ allocate(input%beta(size))
+ allocate(input%kappa(size))
+ allocate(input%csw(size))
+ allocate(input%h(size))
+ allocate(input%hmc_trajectory_length(size))
+ allocate(input%hmc_steps(size))
+ allocate(input%hmc_rho(size))
+ allocate(input%hmc_m_scale(size))
+ allocate(input%start_info_file(size))
+
+ input%beta = "0.0"
+ input%kappa = "0.0"
+ input%csw = "0.0"
+ input%h = "0.0"
+ input%hmc_trajectory_length = "1"
+ input%hmc_steps = "0"
+ input%hmc_rho = "0.0"
+ input%hmc_m_scale = "1"
+ input%start_info_file = ""
+
+ end subroutine input_allocate
+
+
+ !-----------------------------------------------------------------------------
+ subroutine input_reallocate(size)
+
+ integer :: size
+
+ if (size > INPUT_DEFAULT_ENSEMBLES) then
+ deallocate(input%beta)
+ deallocate(input%kappa)
+ deallocate(input%csw)
+ deallocate(input%h)
+ deallocate(input%hmc_trajectory_length)
+ deallocate(input%hmc_steps)
+ deallocate(input%hmc_rho)
+ deallocate(input%hmc_m_scale)
+ deallocate(input%start_info_file)
+
+ call input_allocate(size)
+ endif
+ end subroutine input_reallocate
+
+end module module_input
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.h b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0aa2e325556644801432584b2d10bafc0a804d6
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.h
@@ -0,0 +1,42 @@
+INPUT_INPUT(run, integer, 0)
+INPUT_INPUT(comment, character(comment_len), "")
+
+INPUT_INPUT(lattice, INPUT_ARRAY_DIM, (/4, 4, 4, 4/))
+INPUT_INPUT(processes, INPUT_ARRAY_DIM, (/1, 1, 1, 1/))
+INPUT_INPUT(boundary_conditions_fermions, INPUT_ARRAY_DIM, (/1, 1, 1, -1/))
+INPUT_INPUT(gamma_index, INPUT_ARRAY_DIM, (/1, 2, 3, 4/))
+
+INPUT_INPUT(ensembles, integer, INPUT_DEFAULT_ENSEMBLES)
+
+INPUT_INPUT(beta, INPUT_ARRAY_ENSEMBLES, "0.0")
+INPUT_INPUT(kappa, INPUT_ARRAY_ENSEMBLES, "0.0")
+INPUT_INPUT(csw, INPUT_ARRAY_ENSEMBLES, "0.0")
+INPUT_INPUT(h, INPUT_ARRAY_ENSEMBLES, "0.0")
+
+INPUT_INPUT(tempering_swap_sequence, character(word_len), "random")
+INPUT_INPUT(tempering_steps_without, integer, 0)
+
+INPUT_INPUT(hmc_model, character, "A")
+INPUT_INPUT(hmc_trajectory_length, INPUT_ARRAY_ENSEMBLES, "1")
+INPUT_INPUT(hmc_steps, INPUT_ARRAY_ENSEMBLES, "0")
+INPUT_INPUT(hmc_rho, INPUT_ARRAY_ENSEMBLES, "0.0")
+INPUT_INPUT(hmc_m_scale, INPUT_ARRAY_ENSEMBLES, "1")
+INPUT_INPUT(hmc_accept_first, integer, 0)
+INPUT_INPUT(hmc_test, integer, 0)
+
+INPUT_INPUT(start_configuration, character(word_len), "cold")
+INPUT_INPUT(start_info_file, INPUT_ARRAY_ENSEMBLES, "")
+INPUT_INPUT(start_random, character(para_len), "default")
+
+INPUT_INPUT(mc_total_steps, integer, 1)
+INPUT_INPUT(mc_steps, integer, 1)
+INPUT_INPUT(mc_save_frequency, integer, 0)
+
+INPUT_INPUT(solver_rest, character(para_len), "1e-8")
+INPUT_INPUT(solver_maxiter, integer, 100)
+INPUT_INPUT(solver_ignore_no_convergence, integer, 0)
+INPUT_INPUT(solver_mre_vectors, integer, 0)
+
+INPUT_INPUT(measure_cooling_list, FILENAME, "")
+INPUT_INPUT(measure_polyakov_loop, integer, 0)
+INPUT_INPUT(measure_traces, integer, 0)
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice.F90
new file mode 100644
index 0000000000000000000000000000000000000000..2cf8dcdc283a5f2e9fda4ca98e9f3ef3cbb48276
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice.F90
@@ -0,0 +1,48 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_lattice.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_lattice
+
+ !>> See also "module_lattice_io" !!!
+
+ ! use a common block, because without, equivalence leads to errors
+ ! with Intel Fortran90 compiler
+
+ integer, dimension(DIM) :: L, N, NH, NPE
+
+ common /common_lattice/ L, N, NH, NPE
+
+ integer :: LX, LY, LZ, LT
+ integer :: NX, NY, NZ, NT, NXH
+
+ equivalence (L(1), LX)
+ equivalence (L(2), LY)
+ equivalence (L(3), LZ)
+ equivalence (L(4), LT)
+
+ equivalence (N(1), NX)
+ equivalence (N(2), NY)
+ equivalence (N(3), NZ)
+ equivalence (N(4), NT)
+
+ equivalence (NH(1), NXH)
+
+ integer, dimension(DIM), save :: bc_fermions
+ integer, dimension(DIM), save :: gamma_index
+ integer, dimension(DIM), save :: decomp_direction
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice_io.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice_io.F90
new file mode 100644
index 0000000000000000000000000000000000000000..492243725ea4bc60c781687d73d1afb3d6cd9765
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice_io.F90
@@ -0,0 +1,47 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_lattice_io.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_lattice_io
+
+ !>> The common block is syntactically identical to
+ !>> module_lattice/common_lattice. But it contains permutated
+ !>> values according to "gamma_index".
+ !>> This can be confusing when reading source code !!!!
+
+ ! use a common block, because without, equivalence leads to errors
+ ! with Intel Fortran90 compiler
+
+ integer, dimension(DIM) :: L, N, NH, NPE
+
+ common /common_lattice_io/ L, N, NH, NPE
+
+ integer :: LX, LY, LZ, LT
+ integer :: NX, NY, NZ, NT, NXH
+
+ equivalence (L(1), LX)
+ equivalence (L(2), LY)
+ equivalence (L(3), LZ)
+ equivalence (L(4), LT)
+
+ equivalence (N(1), NX)
+ equivalence (N(2), NY)
+ equivalence (N(3), NZ)
+ equivalence (N(4), NT)
+
+ equivalence (NH(1), NXH)
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_mre.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_mre.F90
new file mode 100644
index 0000000000000000000000000000000000000000..21d58a534d941c6a6dffc5fcd636ba91d06c4cda
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_mre.F90
@@ -0,0 +1,34 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_mre.F90
+!
+! Important: type(type_mre) must be defined with the "save" attribute!
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_mre
+
+ integer, save :: mre_n_vec = 0
+
+ type mre_pointer_to_sc_field
+ P_SPINCOL_FIELD :: sc
+ end type mre_pointer_to_sc_field
+
+ type type_mre
+ integer :: rank
+ type(mre_pointer_to_sc_field), dimension(:), pointer :: vec
+ end type type_mre
+
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nn.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nn.F90
new file mode 100644
index 0000000000000000000000000000000000000000..20bcb9536e195c925e2ac1991c6ade199466cecc
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nn.F90
@@ -0,0 +1,23 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_nn.F90 - pointer to nearest neighbour list
+! nn(volh_tot, EVEN:ODD, DIM, FWD:BWD)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_nn
+
+ INTEGER, dimension(:, :, :, :), pointer, save :: nn
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nnpe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nnpe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..4969e97237ab53e6d174ad888db59f265cc9489d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nnpe.F90
@@ -0,0 +1,22 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_nnpe.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_nnpe
+
+ integer, dimension (-1:1, -1:1, -1:1, -1:1), save :: nnpe
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_offset.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_offset.F90
new file mode 100644
index 0000000000000000000000000000000000000000..167f47a0b3fd95c7da599bc578ffb26f81b294a7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_offset.F90
@@ -0,0 +1,22 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_offset.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_offset
+
+ integer, dimension (-1:1, -1:1, -1:1, -1:1), save :: offset
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_p_interface.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_p_interface.F90
new file mode 100644
index 0000000000000000000000000000000000000000..9163c7aa4113b65a4c87f3b9703a2596fbe708a3
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_p_interface.F90
@@ -0,0 +1,80 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_p_interface.F90 ! interfaces of pointer manipulating routines
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_p_interface
+
+ interface
+
+ subroutine allocate_g_field(u)
+ P_GAUGE_FIELD :: u
+ end subroutine allocate_g_field
+
+ subroutine allocate_g_field_io(u)
+ P_GAUGE_FIELD_IO :: u
+ end subroutine allocate_g_field_io
+
+ subroutine allocate_gen_field(p)
+ P_GENERATOR_FIELD :: p
+ end subroutine allocate_gen_field
+
+ subroutine allocate_sc_field(a)
+ P_SPINCOL_FIELD :: a
+ end subroutine allocate_sc_field
+
+ subroutine allocate_sc_field_io(a)
+ P_SPINCOL_FIELD_IO :: a
+ end subroutine allocate_sc_field_io
+
+ subroutine allocate_sc_overindexed(a)
+ P_SPINCOL_OVERINDEXED :: a
+ end subroutine allocate_sc_overindexed
+
+ subroutine allocate_sc2_field(a)
+ P_SC2_FIELD :: a
+ end subroutine allocate_sc2_field
+
+ subroutine allocate_clover_field_a(a)
+ use typedef_clover
+ P_CLOVER_FIELD_A :: a
+ end subroutine allocate_clover_field_a
+
+ subroutine allocate_clover_field_b(b)
+ use typedef_clover
+ P_CLOVER_FIELD_B :: b
+ end subroutine allocate_clover_field_b
+
+ subroutine swap_p_g_field(u, v)
+ P_GAUGE_FIELD :: u, v
+ end subroutine swap_p_g_field
+
+ subroutine swap_p_sc_field(a, b)
+ P_SPINCOL_FIELD :: a, b
+ end subroutine swap_p_sc_field
+
+ subroutine swap_p_clover_field_a(x, y)
+ use typedef_clover
+ P_CLOVER_FIELD_A :: x, y
+ end subroutine swap_p_clover_field_a
+
+ subroutine swap_p_clover_field_b(x, y)
+ use typedef_clover
+ P_CLOVER_FIELD_B :: x, y
+ end subroutine swap_p_clover_field_b
+
+ end interface
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_switches.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_switches.F90
new file mode 100644
index 0000000000000000000000000000000000000000..28543df8749de32397d153292b9a236cabeb03ee
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_switches.F90
@@ -0,0 +1,31 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_switches.F90
+!
+!-------------------------------------------------------------------------------
+
+module module_switches
+
+ type type_switches
+ logical :: quenched
+ logical :: dynamical
+ logical :: tempering
+ logical :: clover
+ logical :: h_ext
+ logical :: hasenbusch
+ logical :: hmc_test
+ logical :: measure_polyakov_loop
+ logical :: measure_traces
+ end type type_switches
+
+ type (type_switches), save :: switches
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_thread.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_thread.F90
new file mode 100644
index 0000000000000000000000000000000000000000..bd51c2cfdea6bf1908a8533149139d42e0e62624
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_thread.F90
@@ -0,0 +1,26 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_thread.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_thread
+
+ ! {xyz|t}_{start|end} (0:n_thread-1)
+
+ integer, save :: n_thread
+ integer, dimension(:), pointer, save :: xyz_start, xyz_end
+ integer, dimension(:), pointer, save :: t_start, t_end
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_vol.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_vol.F90
new file mode 100644
index 0000000000000000000000000000000000000000..388675e2d141943ca2847e0886c36777d5a65f25
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_vol.F90
@@ -0,0 +1,22 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! module_vol.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_vol
+
+ integer, save :: volume, vol, volh, volh_tot, size_sc_field
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_cksum.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_cksum.F90
new file mode 100644
index 0000000000000000000000000000000000000000..8338b75f7f099899f0737ee891e3d3466ad80e3f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_cksum.F90
@@ -0,0 +1,26 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! typedef_cksum.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module typedef_cksum
+
+ type type_cksum
+ CHECK_SUM :: sum
+ CHECK_SUM :: bytes
+ FILENAME :: file
+ end type type_cksum
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_clover.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_clover.F90
new file mode 100644
index 0000000000000000000000000000000000000000..e580e642e82fd6c766eac46696784822c65f058d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_clover.F90
@@ -0,0 +1,85 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! typedef_clover.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module typedef_clover
+
+ type type_clover_a
+ sequence
+
+ REAL i11
+ REAL i22
+
+ COMPLEX i12
+ COMPLEX i13
+ COMPLEX i14
+ COMPLEX i15
+ COMPLEX i16
+
+ COMPLEX i23
+ COMPLEX i24
+ COMPLEX i25
+ COMPLEX i26
+
+ REAL i33
+ REAL i44
+
+ COMPLEX i34
+ COMPLEX i35
+ COMPLEX i36
+
+ COMPLEX i45
+ COMPLEX i46
+
+ REAL i55
+ REAL i66
+
+ COMPLEX i56
+ end type type_clover_a
+
+
+ type type_clover_b
+ sequence
+
+ COMPLEX i21
+
+ COMPLEX i31
+ COMPLEX i32
+
+ COMPLEX i41
+ COMPLEX i42
+ COMPLEX i43
+
+ COMPLEX i51
+ COMPLEX i52
+ COMPLEX i53
+ COMPLEX i54
+
+ COMPLEX i61
+ COMPLEX i62
+ COMPLEX i63
+ COMPLEX i64
+ COMPLEX i65
+
+ REAL i11
+ REAL i22
+ REAL i33
+ REAL i44
+ REAL i55
+ REAL i66
+ end type type_clover_b
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_flags.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_flags.F90
new file mode 100644
index 0000000000000000000000000000000000000000..ca191bd165ace9d669cd785527c2802615243527
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_flags.F90
@@ -0,0 +1,26 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! typedef_flags.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module typedef_flags
+
+ type type_flags
+ logical :: show_version
+ logical :: continuation_job
+ FILENAME :: input
+ end type type_flags
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_hmc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_hmc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3e1a1d1662053d88c4fe08eb40ea6901490cdc62
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_hmc.F90
@@ -0,0 +1,69 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! typedef_hmc.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module typedef_hmc
+
+ use typedef_clover
+
+ type hmc_para
+ REAL :: beta
+ REAL :: kappa
+ REAL :: csw
+ REAL :: csw_kappa
+ REAL :: h
+ REAL :: traj_length
+ REAL :: tau
+ REAL :: rho
+ integer :: ntau
+ integer :: m_scale
+ character :: model
+ end type hmc_para
+
+ type hmc_para_char
+ character(len = 20) :: beta
+ character(len = 20) :: kappa
+ character(len = 20) :: csw
+ character(len = 20) :: csw_kappa
+ character(len = 20) :: h
+ character(len = 20) :: traj_length
+ character(len = 20) :: tau
+ character(len = 20) :: ntau
+ character(len = 20) :: rho
+ character(len = 20) :: m_scale
+ end type hmc_para_char
+
+ type hmc_out
+ REAL :: exp_dh ! exp(-Delta H)
+ REAL :: sg ! without factor beta
+ REAL :: sf
+ integer :: accepted
+ integer :: cg_ncall
+ integer :: cg_niter_max
+ integer :: cg_niter_tot
+ end type hmc_out
+
+ type hmc_conf
+ P_GAUGE_FIELD :: u
+ P_SPINCOL_FIELD :: phi
+ P_SPINCOL_FIELD :: phi2
+ P_CLOVER_FIELD_A :: a ! A := 1 - kappa c_sw sigma F (in 6x6 blocks)
+ P_CLOVER_FIELD_A :: i ! inverse of A
+ P_CLOVER_FIELD_B :: b ! inverse of A in (L D L+) decomposed form
+ integer :: former ! ensemble index before tempering
+ end type hmc_conf
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_para.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_para.F90
new file mode 100644
index 0000000000000000000000000000000000000000..70fee2ca94708b611e562a166648339961cf0d79
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_para.F90
@@ -0,0 +1,57 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! typedef_para.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module typedef_para
+
+ use typedef_hmc
+
+ type type_para
+
+ integer :: run
+
+ integer, dimension(DIM) :: L
+ integer, dimension(DIM) :: NPE
+ integer, dimension(DIM) :: bc_fermions
+ integer, dimension(DIM) :: gamma_index
+
+ integer :: n_temper
+
+ type(hmc_para), dimension(MAX_TEMPER) :: hmc
+ type(hmc_para_char), dimension(MAX_TEMPER) :: c_hmc
+
+ integer :: start
+ SEED :: seed
+ integer :: swap_seq
+
+ integer :: nforce
+ integer :: ntraj
+ integer :: nstd
+ integer :: maxtraj
+
+ integer :: nsave
+
+ real :: cg_rest
+ integer :: cg_maxiter
+ integer :: cg_log
+
+ character(len = 20) c_cg_rest
+
+ FILENAME, dimension(MAX_TEMPER) :: info_file
+
+ end type type_para
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/mre.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/mre.F90
new file mode 100644
index 0000000000000000000000000000000000000000..4e9078d1d1bc97321380ae2abc4930816aa4b077
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/mre.F90
@@ -0,0 +1,270 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! mre.F90 - Chronological Inverter by Minimal Residual Extrapolation
+! hep-lat/9509012
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine mre_put(basis, sc_field, reset) ! add a solution
+
+ use module_mre
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ type(type_mre), intent(inout) :: basis
+ SPINCOL_FIELD, intent(in) :: sc_field
+ integer, intent(in) :: reset
+
+ P_SPINCOL_FIELD :: tmp
+ integer :: i
+
+ if (mre_n_vec == 0) then
+ return
+ endif
+
+ call mre_allocate(basis)
+
+ if (reset /= 0) then
+ basis%rank = 0
+ return
+ endif
+
+ tmp => basis%vec(mre_n_vec)%sc
+
+ do i = mre_n_vec, 2, -1
+ basis%vec(i)%sc => basis%vec(i - 1)%sc
+ enddo
+
+ basis%vec(1)%sc => tmp
+ call sc_copy(basis%vec(1)%sc, sc_field)
+
+ if (basis%rank < mre_n_vec) basis%rank = basis%rank + 1
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine mre_get(basis, matrix_mult, trial, phi, para, conf)
+
+ ! get trial solution
+
+ use typedef_hmc
+ use module_function_decl
+ use module_mre
+ use module_vol
+ implicit none
+
+ type(type_mre), intent(inout) :: basis
+ external :: matrix_mult
+ SPINCOL_FIELD, intent(out) :: trial
+ SPINCOL_FIELD, intent(in) :: phi
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+
+ type(type_mre), save :: mv ! "Matrix * v"
+ COMPLEX, target :: g(mre_n_vec, mre_n_vec + 1)
+ COMPLEX, pointer :: b(:)
+ integer :: size_g
+ integer :: i
+ integer :: j
+ integer :: s
+ integer :: c
+ integer :: rest
+
+ if (mre_n_vec == 0 .or. .not. associated(basis%vec) .or. basis%rank == 0) then
+ call sc_copy(trial, phi)
+ return
+ endif
+
+ if (basis%rank == 1) then
+ call sc_copy(trial, basis%vec(1)%sc)
+ return
+ endif
+
+ size_g = mre_n_vec * (mre_n_vec + 1) * SIZE_COMPLEX
+
+ b => g(:, mre_n_vec + 1) ! storage arrangement for global sum
+ ! => one global sum for everything
+
+ call mre_allocate(mv)
+ call mre_gram_schmidt(basis)
+
+ do i = 1, basis%rank
+ b(i) = sc_cdotc(basis%vec(i)%sc, phi)
+ call matrix_mult(mv%vec(i)%sc, basis%vec(i)%sc, para, conf)
+ enddo
+
+ do i = 1, basis%rank
+ g(i, i) = sc_norm2(mv%vec(i)%sc)
+ do j = i + 1, basis%rank
+ g(i, j) = sc_cdotc(mv%vec(i)%sc, mv%vec(j)%sc)
+ g(j, i) = conjg(g(i, j))
+ enddo
+ enddo
+
+ call global_sum_vec(size_g, g)
+
+ call mre_gauss_jordan(g, b, basis%rank, mre_n_vec)
+
+ ! calculation of "trial" with doubled data re-use:
+
+ call sc_cax2(trial, basis%vec(1)%sc, b(1), basis%vec(2)%sc, b(2))
+
+ rest = mod(basis%rank, 2)
+
+ do j = 3, basis%rank - rest, 2
+ call sc_caxpy2(trial, basis%vec(j)%sc, b(j), &
+ basis%vec(j+1)%sc, b(j+1))
+ enddo
+
+ if (rest == 1) then
+ j = basis%rank
+ call sc_caxpy(trial, basis%vec(j)%sc, b(j))
+ endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine mre_allocate(basis)
+
+ use module_mre
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ type(type_mre), intent(inout) :: basis
+ integer :: i
+
+ if (.not. associated(basis%vec)) then
+ allocate(basis%vec(mre_n_vec))
+ do i = 1, mre_n_vec
+ nullify(basis%vec(i)%sc)
+ call allocate_sc_field(basis%vec(i)%sc)
+ enddo
+ basis%rank = 0
+ endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine mre_gram_schmidt(basis)
+
+ ! Golub and van Loon, Matrix Computations (3rd ed.), p. 232
+
+ use module_function_decl
+ use module_mre
+ use module_vol
+ implicit none
+
+ type(type_mre), intent(inout) :: basis
+
+ integer :: k, j
+ REAL :: r_kk, r_kj
+
+ do k = 1, basis%rank
+ r_kk = sc_norm2(basis%vec(k)%sc)
+ r_kk = global_sum(r_kk)
+ r_kk = ONE / sqrt(r_kk)
+ call sc_scale(basis%vec(k)%sc, r_kk)
+ do j = k + 1, basis%rank
+ r_kj = sc_dot(basis%vec(k)%sc, basis%vec(j)%sc)
+ r_kj = global_sum(r_kj)
+ call sc_axpy(basis%vec(j)%sc, basis%vec(k)%sc, -r_kj)
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine mre_gauss_jordan(a, b, n, np)
+
+ ! Numerical Recipes in Fortran (2nd ed.), p. 30
+ ! inv(a) i.e. a is not unscrambled
+
+ implicit none
+
+ integer, intent(in) :: n, np
+ COMPLEX, intent(inout) :: a(np,np), b(np)
+
+ integer :: ipiv(n)
+ integer :: i, j, k, l, ll
+ integer :: irow, icol
+
+ REAL :: big, tmp
+ COMPLEX :: dum
+ COMPLEX :: pivinv
+
+ ipiv = 0
+
+ do i = 1, n
+
+ big = ZERO
+ do j = 1, n
+ if (ipiv(j) /= 1) then
+ do k = 1, n
+ if (ipiv(k) == 0) then
+ tmp = abs(a(j, k))
+ if (tmp >= big) then
+ big = tmp
+ irow = j
+ icol = k
+ endif
+ else if (ipiv(k) > 1) then
+ call die("mre_gauss_jordan(): singular matrix 1")
+ endif
+ enddo
+ endif
+ enddo
+
+ ipiv(icol) = ipiv(icol) + 1
+
+ if (irow /= icol) then
+ do l = 1, n
+ dum = a(irow, l)
+ a(irow, l) = a(icol, l)
+ a(icol, l) = dum
+ enddo
+ dum = b(irow)
+ b(irow) = b(icol)
+ b(icol) = dum
+ endif
+
+ if (a(icol, icol) == ZERO ) then
+ call die("mre_gauss_jordan(): singular matrix 2")
+ endif
+
+ pivinv = ONE / a(icol, icol)
+ !!a(icol, icol) = ONE !! only needed for inv(a)
+
+ do l = 1, n
+ a(icol, l) = a(icol, l) * pivinv
+ enddo
+
+ b(icol) = b(icol) * pivinv
+
+ do ll = 1, n
+ if (ll /= icol) then
+ dum = a(ll, icol)
+ a(ll, icol) = ZERO
+ do l = 1, n
+ a(ll, l) = a(ll, l) - a(icol, l) * dum
+ enddo
+ b(ll) = b(ll) - b(icol) * dum
+ endif
+ enddo
+
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/mtdagmt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/mtdagmt.F90
new file mode 100644
index 0000000000000000000000000000000000000000..44e1323a953c59ed3eeebdb6865c41f1e64b4c67
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/mtdagmt.F90
@@ -0,0 +1,40 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! mtdagmt.F90 - -> see m_tilde.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine mtdagmt(out, in, para, conf)
+
+ use typedef_hmc
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+
+ SPINCOL_FIELD :: out, in
+ P_SPINCOL_FIELD, save :: tmp
+
+ TIMING_START(timing_bin_mtdagmt)
+
+ ALLOCATE_SC_FIELD(tmp)
+
+ call mtil(tmp, in, para, conf)
+ call mtil_dag(out, tmp, para, conf)
+
+ TIMING_STOP(timing_bin_mtdagmt)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-EXPLAINED.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-EXPLAINED.var
new file mode 100644
index 0000000000000000000000000000000000000000..cec45a8248829ba1fee2fbf339b0144c80da4fdd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-EXPLAINED.var
@@ -0,0 +1,61 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-EXPLAINED.var
+#
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+MODULES_FLAG = -I$(MODULES_DIR) # how to find "modules"
+ # MODULES_DIR is set in every Makefile
+
+FPP = cpp -C -P # Fortran preprocessor / ANSI C preprocessor
+F90 = f90 # Fortran90 compiler
+CC = cc # ANSI C compiler
+AR = ar # ar command
+RANLIB = echo # "ranlib" if necessary
+
+MYFLAGS = -DTIMING # or to switch time measurement off
+ # must always be set for benchmarking
+ -DD3_BUFFER_VOL=24*24*12*12
+ # -sample for lattice='24 24 24 48',processes='1 1 2 4'
+ # - largest possible size of a local lattice boundary
+ # - determines the size of *static* arrays
+ # - only needed in libd3.a
+ # - should be set to 1 when using a different LIBD.
+ -D_OPENMP # has to be explicitly defined for OpenMP
+
+FFLAGS = -O3 # Fortran90 compiler flags
+ $(MODULES_FLAG)
+CFLAGS = -O3 # C compiles flags
+ARFLAGS = rv # ar flags
+
+LDFLAGS = # loader flags (the loader is: ${F90})
+SYSLIBS = # system libraries (e.g. for BLAS)
+
+FAST_MAKE = gmake -j 8 # parallel make
+
+CKSUM_O = cksum.o # do not change
+RANDOM_O = ran.o ranf.o # or "ran.o" on Crays
+UUU_O = uuu_f90.o # or "uuu_fwd.o uuu_bwd.o uuu_bwd_m.o" if C is
+ # faster than Fortran90 (was a small effect on T3E)
+
+LIBD = libd.a # Multiplication with Wilson hopping term:
+ # libd.a: Cray T3E version (MPI or shmem)
+ # libd2.a: Hitachi SR8000 version (MPI)
+ # libd3.a: Hitachi SR8000 version (MPI+OpenMP)
+ # libd21.a: version for high scalability
+
+LIBCOMM = lib_mpi.a # or "lib_single_pe.a" or "lib_shmempi.a"
+
+LIBCLOVER = libclover.a # do not change
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-altix.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-altix.var
new file mode 100644
index 0000000000000000000000000000000000000000..baf01efce305d994685f8ed67ce8a0909ed49181
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-altix.var
@@ -0,0 +1,108 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-altix.var - settings on SGI-Altix
+#
+#-------------------------------------------------------------------------------
+
+timing = 1
+mpi =
+omp = 1
+shmem = 1
+shmempi = 1
+debug =
+libd = 2
+d3_buffer_vol = 32*32*16*16
+
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+FPP = mpif90 -g -E
+FPP2 = icc -E -C -P
+F90 = mpif90
+CC = mpicc
+AR = ar
+RANLIB = echo
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+MYFLAGS = -DINTEL -DALTIX
+FFLAGS_STD= $(MODULES_FLAG)
+CFLAGS_STD= -DNamesToLower_
+ARFLAGS = rv
+
+LDFLAGS = -Vaxlib
+SYSLIBS =
+
+FAST_MAKE = gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD =
+LIBCOMM = lib_single_pe.a
+LIBCLOVER = libclover.a
+
+#-------------------------------------------------------------------------------
+
+ifdef timing
+ MYFLAGS += -DTIMING
+endif
+
+ifdef mpi
+ LIBCOMM = lib_mpi.a
+endif
+
+ifdef omp
+ F90 += -openmp
+ MYFLAGS += -D_OPENMP
+endif
+
+ifdef shmem
+ LDFLAGS += -lsma
+ LIBCOMM = lib_shmem.a
+endif
+
+ifdef shmempi
+ LDFLAGS += -lsma
+ LIBCOMM = lib_shmempi.a
+endif
+
+ifdef debug
+ FFLAGS = -g -O0 $(FFLAGS_STD)
+ CFLAGS = -g -O0 $(CFLAGS_STD)
+else
+ FFLAGS = -O2 $(FFLAGS_STD)
+ CFLAGS = -O2 $(CFLAGS_STD)
+endif
+
+ifeq ($(libd),1)
+ LIBD = libd.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),2)
+ LIBD = libd2.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),21)
+ LIBD = libd21.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),3)
+ LIBD = libd3.a
+ MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)'
+endif
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-bgl.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-bgl.var
new file mode 100644
index 0000000000000000000000000000000000000000..5841de7a5d1c09511eba640cae719b0e93016b2d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-bgl.var
@@ -0,0 +1,105 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-bgl.var - settings on BlueGene/L
+#
+#-------------------------------------------------------------------------------
+
+timing = 1
+debug =
+bits64 =
+libd = 2
+d3_buffer_vol = 32*32*16*16
+
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+FPP = /opt/ibmcmp/vac/7.0/bin/blrts_xlc -E -C -P
+F90 = /opt/ibmcmp/xlf/9.1/bin/blrts_xlf90 -qsuffix=f=f90
+CC = /opt/ibmcmp/vac/7.0/bin/blrts_xlc
+AR = ar
+RANLIB = echo
+
+BGLSYS = /bgl/BlueLight/ppcfloor/bglsys
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+MYFLAGS = -DIBM
+FFLAGS_STD= $(MODULES_FLAG) -I$(BGLSYS)/include
+CFLAGS_STD= -DLongLong -DNamesToLower
+ARFLAGS = rv
+
+LDFLAGS = -L$(BGLSYS)/lib
+SYSLIBS = -lmpich.rts -lfmpich.rts -lmsglayer.rts -lrts.rts -ldevices.rts
+
+FAST_MAKE = gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD =
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#-------------------------------------------------------------------------------
+
+ifdef timing
+ MYFLAGS += -DTIMING
+endif
+
+ifdef mpi
+ LIBCOMM = lib_mpi.a
+endif
+
+ifdef omp
+ F90 += -qsmp=omp
+ MYFLAGS += -D_OPENMP
+endif
+
+ifdef debug
+ FFLAGS = -g -qfullpath $(FFLAGS_STD)
+ CFLAGS = -g -qfullpath $(CFLAGS_STD)
+else
+ FFLAGS = -O3 -qhot $(FFLAGS_STD)
+ CFLAGS = -O2 $(CFLAGS_STD)
+endif
+
+ifdef bits64
+ F90 += -q64
+ CFLAGS += -q64
+ ARFLAGS += -X64
+else
+ LDFLAGS +=
+endif
+
+
+ifeq ($(libd),1)
+ LIBD = libd1.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),2)
+ LIBD = libd2.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),21)
+ LIBD = libd21.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),3)
+ LIBD = libd3.a
+ MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)'
+endif
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-cray.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-cray.var
new file mode 100644
index 0000000000000000000000000000000000000000..22aacaf584928dcf5fc43178b4da26caee0a25fe
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-cray.var
@@ -0,0 +1,45 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-cray.var - settings on Cray T3E
+#
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+MODULES_FLAG = -p $(MODULES_DIR)
+
+FPP = cpp -C -P
+F90 = f90
+CC = cc
+AR = ar
+RANLIB = echo
+
+# "-M 801": suppress "unknown directive" messages (from OpenMP)
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1
+FFLAGS = $(MODULES_FLAG) -g -M 801
+CFLAGS = -O3
+ARFLAGS = rv
+
+LDFLAGS =
+SYSLIBS =
+
+FAST_MAKE = NPROC=4 make
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o
+UUU_O = uuu_fwd.o uuu_bwd.o uuu_bwd_m.o
+
+LIBD = libd.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi-omp.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi-omp.var
new file mode 100644
index 0000000000000000000000000000000000000000..975c7e235ef3a96047908424f22d4b3f91b8af06
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi-omp.var
@@ -0,0 +1,44 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-hitachi-omp.var - settings on Hitachi SR8000
+# (fastest version: MPI +OpenMP)
+#
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+FPP = cpp -C -P -D_OPENMP
+F90 = mpif90
+CC = cc
+AR = ar
+RANLIB = echo
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=24*24*12*12
+FFLAGS = $(MODULES_FLAG) -Oss -pvdiag -par=2 -pardiag=1 -omp -procnum=8 -nosave -contarea=2
+CFLAGS = -DLongLong
+ARFLAGS = rv
+
+LDFLAGS = +BTLB -omp -rdma
+SYSLIBS = /usr/local/lib/liblrz.a -lf90c -lpl
+
+FAST_MAKE = JOBTYPE=SS prun -p IAPAR gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd3.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi.var
new file mode 100644
index 0000000000000000000000000000000000000000..24a06ad305c32af72d9827b01d70d41356ac1dfa
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi.var
@@ -0,0 +1,43 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-hitachi.var - settings on Hitachi SR8000 (pure MPI version)
+#
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+FPP = cpp -C -P
+F90 = mpif90
+CC = cc
+AR = ar
+RANLIB = echo
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1
+FFLAGS = $(MODULES_FLAG) -opt=ss -par=0 -contarea=2
+CFLAGS = -DLongLong
+ARFLAGS = rv
+
+LDFLAGS = +BTLB
+SYSLIBS = /usr/local/lib/liblrz.a -lf90c -lpl
+
+FAST_MAKE = JOBTYPE=SS prun -p IAPAR gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd2.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hp.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hp.var
new file mode 100644
index 0000000000000000000000000000000000000000..a2050cfe4fdd0559ec513f6f1de864b193fbf9ca
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hp.var
@@ -0,0 +1,45 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2002-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-hp.var - settings for HP-UX Fortran Compiler
+#
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+FPP = /opt/langtools/lbin/cpp.ansi -P
+F90 = mpif90 ####+Oopenmp
+CC = cc
+AR = ar
+RANLIB = ranlib
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1 #####-D_OPENMP
+FFLAGS = $(MODULES_FLAG) \
+ +r8 +DD64 +DSnative +O3 +Ocache_pad_common \
+ +Olibcalls +Onolimit +Ofltacc=relaxed +FPD
+CFLAGS = -DLongLong -DNamesToLower_ +DD64 +DSnative
+ARFLAGS = rv
+
+LDFLAGS = +O3 +DD64 +DSnative +U77
+SYSLIBS = -L/opt/mlib/lib/hpux64 -lveclib
+
+FAST_MAKE = gmake -j 1
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-ibm.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-ibm.var
new file mode 100644
index 0000000000000000000000000000000000000000..8710c65d4cb392d964a86ebe422571a7c0d4adde
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-ibm.var
@@ -0,0 +1,116 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2002-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-ibm.var - settings on IBM
+#
+#-------------------------------------------------------------------------------
+
+timing = 1
+mpi = 1
+omp =
+debug =
+bits64 = 1
+libd = 2
+d3_buffer_vol = 24*24*12*12
+
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+FPP = /usr/ccs/lib/cpp -C -P
+F90 = xlf90_r -qsuffix=f=f90
+CC = cc_r
+AR = ar
+RANLIB = echo
+
+# suppressed messages:
+#
+# 1500-036 (I) The NOSTRICT option (default at OPT(3)) has the potential
+# to alter the semantics of a program
+#
+# 1516-092 (E) If a statement function expression contains a reference
+# to a function or a function dummy procedure, the
+# reference must not require an explicit interface or be
+# a transformational intrinsic
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+MYFLAGS = -DIBM
+FFLAGS_STD= $(MODULES_FLAG) -qsuppress=1500-036:1516-092 #:1501-510:1516-092:1514-008
+CFLAGS_STD= -DLongLong -DNamesToLower
+ARFLAGS = -r -v
+
+LDFLAGS =
+SYSLIBS =
+
+FAST_MAKE = gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD =
+LIBCOMM = lib_single_pe.a
+LIBCLOVER = libclover.a
+
+#-------------------------------------------------------------------------------
+
+ifdef timing
+ MYFLAGS += -DTIMING
+endif
+
+ifdef mpi
+ F90 = mpxlf90_r -qsuffix=f=f90
+ LIBCOMM = lib_mpi.a
+endif
+
+ifdef omp
+ F90 += -qsmp=omp
+ MYFLAGS += -D_OPENMP
+endif
+
+ifdef debug
+ FFLAGS = -g -qfullpath $(FFLAGS_STD)
+ CFLAGS = -g -qfullpath $(CFLAGS_STD)
+else
+ FFLAGS = -O3 $(FFLAGS_STD)
+ CFLAGS = -O2 $(CFLAGS_STD)
+endif
+
+ifdef bits64
+ F90 += -q64
+ CFLAGS += -q64
+ ARFLAGS += -X64
+else
+ LDFLAGS += -bmaxdata:2000000000 -bmaxstack:250000000
+endif
+
+
+ifeq ($(libd),1)
+ LIBD = libd1.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),2)
+ LIBD = libd2.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),21)
+ LIBD = libd21.a
+ MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),3)
+ LIBD = libd3.a
+ MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)'
+endif
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-intel.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-intel.var
new file mode 100644
index 0000000000000000000000000000000000000000..e1d0443529a8d277e2acb4b5bade295e4e6c73cb
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-intel.var
@@ -0,0 +1,43 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2002-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-intel.var - settings for Intel Fortran Compiler
+#
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+MODULES_FLAG = -cl,bqcd.pcl
+
+FPP = cpp -C -P
+F90 = ifc
+CC = gcc
+AR = ar
+RANLIB = echo
+
+MYFLAGS = -DINTEL -DTIMING -DD3_BUFFER_VOL=1
+FFLAGS = $(MODULES_FLAG)
+CFLAGS = -DLongLong -DNamesToLower_
+ARFLAGS = rv
+
+LDFLAGS = -Vaxlib
+SYSLIBS =
+
+FAST_MAKE = gmake
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd.a
+LIBCOMM = lib_single_pe.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-nec.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-nec.var
new file mode 100644
index 0000000000000000000000000000000000000000..cfba0e88b8f2a10987e3f8e1ee7ac447d35fe20f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-nec.var
@@ -0,0 +1,44 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-nec.var - settings on NEC SX-8
+#
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+MODULES_FLAG = I$(MODULES_DIR)
+
+FPP = cpp -C -P
+F90 = sxmpif90
+CC = sxmpic++
+AR = sxar
+RANLIB = echo
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=24*24*12*12
+
+FFLAGS = -$(MODULES_FLAG)
+CFLAGS = -DNamesToLower_ -DLongLong
+ARFLAGS = rv
+
+LDFLAGS =
+SYSLIBS =
+
+FAST_MAKE = make
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd2.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-sun.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-sun.var
new file mode 100644
index 0000000000000000000000000000000000000000..82f8a6f9029f08895c6f89a5efbd031f63d3eb4b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-sun.var
@@ -0,0 +1,44 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2001, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-sun.var - settings on Sun
+#
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+MODULES_FLAG = -M$(MODULES_DIR)
+
+# use GNU C-preprocessor (well hidden on our Sun):
+
+FPP = /sw/sun4_56/egcs-1.1.2-2/lib/gcc-lib/sparc-sun-solaris2.6/egcs-2.91.66/cpp -C -P
+F90 = /opt/SUNWspro/bin/f90
+CC = gcc
+AR = ar
+RANLIB = echo
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1
+FFLAGS = -O3 $(MODULES_FLAG)
+CFLAGS = -O3 -DNamesToLower_ -DLongLong
+ARFLAGS = rv
+
+LDFLAGS =
+SYSLIBS =
+
+FAST_MAKE = make
+
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd.a
+LIBCOMM = lib_single_pe.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-EXPLAINED.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-EXPLAINED.F90
new file mode 100644
index 0000000000000000000000000000000000000000..5b2ef70328aceb5b96ad1bd051601aa94c361981
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-EXPLAINED.F90
@@ -0,0 +1,47 @@
+!===============================================================================
+!
+! service.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch() ! exit with error status and shutdown parallel application
+ ! can be mpi_abort(1) when using MPI
+ call errexit()
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden() ! returns CPU-seconds
+ ! SECONDS has to be defined in "defs.h"
+ sekunden = tsecnd()
+end
+
+!-------------------------------------------------------------------------------
+! Arguments from the command line (the following works on many machines):
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status) ! iarg = 0 ==> command name
+ ! iarg > 0 ==> argument
+ implicit none
+ integer :: iarg, larg, status
+ character(len = *) :: arg
+ character(len(arg) + 1) :: a
+
+ call getarg(iarg, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc() ! ipxfargc = 0 ==> no arguments
+ ipxfargc = iargc() ! ipxfargc > 0 ==> number of arguments
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian() ! .false. on e.g. Intel
+ is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-altix.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-altix.F90
new file mode 100644
index 0000000000000000000000000000000000000000..cd3e165e433b54bbc27bec6d2e2862eb7f1a73d9
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-altix.F90
@@ -0,0 +1,71 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-altix.F90 - calls to service routines on SGI-Altix
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ integer(4) status
+ status = 1
+ call exit(status)
+
+ !!include 'mpif.h'
+ !!integer errorcode, ierror
+ !!call mpi_abort(MPI_COMM_WORLD, errorcode, ierror)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+ character(len = 32) r
+
+ call hostnm(r)
+ rechner = r
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ include 'mpif.h'
+
+ sekunden = mpi_wtime()
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ character(len = *) :: arg
+ character(100) :: a
+
+ call getarg(iarg, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .false.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-bgl.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-bgl.F90
new file mode 100644
index 0000000000000000000000000000000000000000..38e13dbe956dd405104094bee16885626577ca99
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-bgl.F90
@@ -0,0 +1,72 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-bgl.F90 - calls to service routines on BlueGene/L
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ implicit none
+ include 'mpif.h'
+
+ integer ierror
+
+ call mpi_abort(MPI_COMM_WORLD, 1, ierror)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+
+ rechner = "jubl"
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ integer(8), external :: rts_get_timebase
+ real(8), parameter :: speed = ONE / 700000000.0_8
+
+ sekunden = rts_get_timebase() * speed
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ integer(4) :: i
+ character(len = *) :: arg
+ character(len(arg) + 1) :: a
+
+ i = iarg
+ call getarg(i, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ integer(4) :: iargc
+ ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-cray.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-cray.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f7981db72b83e9a6be52318313ac9aca84379ac9
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-cray.F90
@@ -0,0 +1,70 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-ibm.F90 - calls to service routines on IBM
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ integer(4) status
+ status = 1
+ call exit(status)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+ character(len = 32) r
+
+ call hostnm(r)
+ rechner = r
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ real(8) rtc
+
+ sekunden = rtc() !!!mclock() * 0.01
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ integer(4) :: i
+ character(len = *) :: arg
+ character(len(arg) + 1) :: a
+
+ i = iarg
+ call getarg(i, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ integer(4) :: iargc
+ ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi-omp.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi-omp.F90
new file mode 100644
index 0000000000000000000000000000000000000000..55704528a43a2ce05db1ba16ea2c2f0f43da0520
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi-omp.F90
@@ -0,0 +1,68 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-hitachi.F90 - calls to service routines on HITACHI
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ call fexit(1)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+
+ call hostnm(rechner)
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ implicit none
+
+!!real(8) d
+!!call xclock(d, 5)
+!!sekunden = d
+
+ real(8) dwalltime
+ sekunden = dwalltime()
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ character(len = *) :: arg
+ character(len(arg) + 1) :: a
+
+ call getarg(iarg + 1, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ ipxfargc = iargc() - 1
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..55704528a43a2ce05db1ba16ea2c2f0f43da0520
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi.F90
@@ -0,0 +1,68 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-hitachi.F90 - calls to service routines on HITACHI
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ call fexit(1)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+
+ call hostnm(rechner)
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ implicit none
+
+!!real(8) d
+!!call xclock(d, 5)
+!!sekunden = d
+
+ real(8) dwalltime
+ sekunden = dwalltime()
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ character(len = *) :: arg
+ character(len(arg) + 1) :: a
+
+ call getarg(iarg + 1, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ ipxfargc = iargc() - 1
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hp.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hp.F90
new file mode 100644
index 0000000000000000000000000000000000000000..af019b707dab0d8d3ee8ac1c6ee8a2d6482b007c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hp.F90
@@ -0,0 +1,68 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-hp.F90 - calls to service routine with HP-UX Fortran Compiler
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ integer(4) status
+ status = 1
+ call exit(status)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+ character(len = 32) r
+
+ call hostnm(r)
+ rechner = r
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ real, external :: walltime
+
+ sekunden = walltime(0.0)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ character(len = *) :: arg
+ character(100) :: a
+
+ call getarg(iarg, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .false.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-ibm.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-ibm.F90
new file mode 100644
index 0000000000000000000000000000000000000000..288e22355b448f65111abaada245ca0aadd7fc10
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-ibm.F90
@@ -0,0 +1,70 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-ibm.F90 - calls to service routines on IBM
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ integer(4) status
+ status = 1
+ call exit_(status)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+ character(len = 32) r
+
+ call hostnm_(r)
+ rechner = r
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ real(8) rtc
+
+ sekunden = rtc() !!!mclock() * 0.01
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ integer(4) :: i
+ character(len = *) :: arg
+ character(len(arg) + 1) :: a
+
+ i = iarg
+ call getarg(i, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ integer(4) :: iargc
+ ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-intel.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-intel.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f60c35f249022f6d631797114418584d0247066a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-intel.F90
@@ -0,0 +1,65 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-intel.F90 - calls to service routine with Intel Fortran Compiler
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ integer(4) status
+ status = 1
+ call exit(status)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+ character(len = 32) r
+
+ call hostnm(r)
+ rechner = r
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ call cpu_time(sekunden)
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ character(len = *) :: arg
+ character(100) :: a
+
+ call getarg(iarg, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .false.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-nec.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-nec.F90
new file mode 100644
index 0000000000000000000000000000000000000000..db903a6691780fd249e828b6a12e7b7e6b176e68
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-nec.F90
@@ -0,0 +1,65 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-nec.F90 - calls service routines on NEC SX-8
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ integer(4) status
+ status = 1
+ call exit(status)
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ call cpu_time(sekunden)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+ character(len = 32) r
+
+ call hostnm(r)
+ rechner = r
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ character(len = *) :: arg
+ character(len(arg) + 1) :: a
+
+ call getarg(iarg, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-sun.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-sun.F90
new file mode 100644
index 0000000000000000000000000000000000000000..017d07bdd74355e3d0d1e386342c92a7f936c44c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-sun.F90
@@ -0,0 +1,64 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-sun.F90 - calls to service routines on SUN
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ call exit(1)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+
+ i = hostnm(rechner)
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ real time(2)
+
+ call etime(time)
+ sekunden = time(1)
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ character(len = *) :: arg
+ character(len(arg) + 1) :: a
+
+ call getarg(iarg, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .false.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/polyakov_loop.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/polyakov_loop.F90
new file mode 100644
index 0000000000000000000000000000000000000000..015be3504883e1cfd546382078bed8385143badf
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/polyakov_loop.F90
@@ -0,0 +1,98 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2000-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! polyakov_loop.F90 - in gamma_4-direction, requires (NPE(gamma_4) == 1)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine polyakov_loop(conf, traj, i_ensemble1, i_ensemble2)
+
+ use typedef_hmc
+ use module_decomp
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ integer, intent(in) :: traj, i_ensemble1, i_ensemble2
+ type(hmc_conf), intent(in) :: conf
+
+ COMPLEX :: pl
+ REAL :: re_pl, im_pl
+ integer :: x, y, z, t, i, eo, j(DIM)
+ integer :: nx, ny, nz, nt, dir4, npe4
+ integer, external :: ieo, e_o, std_xyzt2i
+ SU3 :: u
+
+ SU3, parameter :: su3_one = reshape( &
+ (/ ONE,ZERO,ZERO, &
+ ZERO,ONE,ZERO, &
+ ZERO,ZERO,ONE /), &
+ (/ NCOL, NCOL /))
+
+ character(len=*), parameter :: key_pl = "%pl"
+ integer, save :: count = 0
+
+
+ count = count + 1
+
+ dir4 = decomp%direction(4)
+ npe4 = decomp%act%npe(dir4)
+
+ if (npe4 /= 1) then
+ call die("polyakov_loop(): gamma_4 direction must not be decomposed")
+ endif
+
+ nx = decomp%std%N(1)
+ ny = decomp%std%N(2)
+ nz = decomp%std%N(3)
+ nt = decomp%std%N(4)
+
+ pl = 0
+ !$omp parallel do reduction(+: pl) private(x, y, z, t, i, j, eo, u)
+ do x = 0, nx - 1
+ do y = 0, ny - 1
+ do z = 0, nz - 1
+
+ u = su3_one
+ do t = 0, nt - 1
+ j = (/x, y, z, t/)
+
+ i = std_xyzt2i(j)
+ eo = e_o(j)
+
+ call u_update2(u, conf%u(1, 1, i, eo, 4))
+ enddo
+
+ pl = pl + u(1,1) + u(2,2) + u(3,3)
+ enddo
+ enddo
+ enddo
+
+ call global_sum_vec(SIZE_COMPLEX, pl)
+
+ pl = pl / (THREE * decomp%std%L(1) * decomp%std%L(2) * decomp%std%L(3))
+
+ re_pl = Re(pl)
+ im_pl = Im(pl)
+
+ if (my_pe() == 0) then
+ if (count == 1) write(UREC, 400) &
+ "T", key_pl, "traj", "e", "f", "Re(Polyakov_Loop)", "Im(Polyakov_Loop)"
+
+ write(UREC, 410) key_pl, traj, i_ensemble1, i_ensemble2, re_pl, im_pl
+ endif
+
+
+400 format (1x, 2a, a6, 2a3, 2a20)
+410 format (1x, a4, i6, 2i3, 2g20.10)
+
+end
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ran.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/ran.F90
new file mode 100644
index 0000000000000000000000000000000000000000..85b241f3f38cfde13470a63c84a7d5ad4d365683
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ran.F90
@@ -0,0 +1,207 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! ran.F90 - random number related routines
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine ran_gauss_volh(m, ran, var, eo)
+
+! intitializes x with Gaussian random numbers in a manner that makes
+! results independent of the lattice decomposition
+
+ use module_decomp
+ use module_function_decl
+ use module_vol
+ implicit none
+
+ integer, intent(in) :: m
+ COMPLEX, intent(out) :: ran(m, volh)
+ REAL, intent(in) :: var
+ integer, intent(in) :: eo
+
+ integer, dimension (DIM) :: i_pe, block
+ integer :: i, ii, x, y, z, t, j
+ integer :: nxh, nx, ny, nz, nt, lx, ly, lz, npe(DIM)
+ REAL :: twovar, phi, r
+
+ twovar = TWO * var
+
+ i_pe = decomp%std%i_pe
+ npe = decomp%std%NPE
+
+ nxh = decomp%std%NH(1)
+
+ nx = decomp%std%N(1)
+ ny = decomp%std%N(2)
+ nz = decomp%std%N(3)
+ nt = decomp%std%N(4)
+
+ lx = decomp%std%L(1)
+ ly = decomp%std%L(2)
+ lz = decomp%std%L(3)
+
+ ! block() contains the number of random numbers to be skipped.
+ ! In the calculation of block() a factor 1/2 from "even/odd volume"
+ ! cancels with 2 from "two random numbers per site".
+
+ block(1) = m * nx
+ block(2) = m * lx * ny
+ block(3) = m * lx * ly * nz
+ block(4) = m * lx * ly * lz * nt
+
+ i = 0
+ call ranskip(block(4) * i_pe(4))
+ do t = 0, nt - 1
+ call ranskip(block(3) * i_pe(3))
+ do z = 0, nz - 1
+ call ranskip(block(2) * i_pe(2))
+ do y = 0, ny - 1
+ call ranskip(block(1) * i_pe(1))
+ do x = 0, nxh - 1
+ i = i + 1
+ ii = decomp%act%i(i, eo)
+ do j = 1, m
+ r = sqrt(-twovar * log(ranf()))
+ phi = TWOPI * ranf()
+ ran(j, ii) = cmplx(r * cos(phi), r * sin(phi))
+ enddo
+ enddo
+ call ranskip(block(1) * (npe(1) - 1 - i_pe(1)))
+ enddo
+ call ranskip(block(2) * (npe(2) - 1 - i_pe(2)))
+ enddo
+ call ranskip(block(3) * (npe(3) - 1 - i_pe(3)))
+ enddo
+ call ranskip(block(4) * (npe(4) - 1 - i_pe(4)))
+
+
+CONTAINS
+
+ subroutine ranskip(n)
+ integer :: n
+ SEED :: seed, n_skip
+
+ n_skip = n
+ call ranget(seed)
+ call ranset(seed, n_skip)
+ end subroutine ranskip
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine rancheck() ! checks if seed is equal on all PEs
+
+ use module_function_decl
+ implicit none
+ SEED seed
+
+ call ranget(seed)
+ call seed_compare(seed)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine write_ran() ! save state of random number generator
+
+ use module_function_decl
+ implicit none
+ SEED :: seed
+ FILENAME, external :: ran_file
+
+ call rancheck()
+ call ranget(seed)
+ if (my_pe() == 0) then
+ open(URAN, file = ran_file(), action = "write")
+ write(URAN, *) seed
+ close(URAN)
+ endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine random_sequence(r, n) ! random permutation of [1..n]
+
+ use module_function_decl
+ implicit none
+ integer, intent(in) :: n
+ integer, intent(out) :: r(n)
+ integer :: ran, seq(n), i, j, len_seq
+ integer :: ceiling
+
+ len_seq = n
+
+ do i = 1, len_seq
+ seq(i) = i
+ enddo
+
+ do i = 1, n - 1
+ ran = ceiling(len_seq * ranf())
+ if (ran <= 0 .or. ran > len_seq) stop "random_sequence(): ran out of range"
+ r(i) = seq(ran)
+ do j = ran, len_seq - 1
+ seq(j) = seq(j + 1)
+ enddo
+ len_seq = len_seq - 1
+ enddo
+ r(n) = seq(1)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine get_a_random_seed(seed) ! (try to) generate a random seed
+
+ use module_function_decl
+ implicit none
+ SEED :: seed
+ integer :: count, rate, rate_10sec
+ integer :: pe, ierror
+
+ if (my_pe() == 0) then
+ call system_clock(count = count, count_rate = rate)
+
+ if (rate <= 0) call die("get_a_random_seed(): failed")
+
+ rate_10sec = rate * 10
+ seed = mod(count, rate_10sec)
+ endif
+
+ call seed_broadcast(seed)
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_ran(para, flags) ! initilize random number generator
+
+ use typedef_para
+ use typedef_flags
+ implicit none
+
+ type(type_para) :: para
+ type(type_flags) :: flags
+ FILENAME, external :: ran_file
+ SEED :: seed, null
+
+ if (flags%continuation_job) then
+ open(URAN, file = ran_file(), action = "read", status = "old")
+ read(URAN, *) para%seed
+ close(URAN)
+ else
+ seed = para%seed
+ if (seed < 0) call get_a_random_seed(para%seed)
+ endif
+
+ null = 0
+ call ranset(para%seed, null)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ranf.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3b3a8216d3642bd9c558a910298b587d15f377bd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf.F90
@@ -0,0 +1,112 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! ranf.F90 - Fortran90 implementation of the Cray random number generator ranf()
+!
+!-------------------------------------------------------------------------------
+module module_ranf
+
+ integer, parameter :: ikind = 8
+ integer, parameter :: jkind = 8
+ integer, parameter :: rkind = 8
+
+ integer(ikind), parameter :: mH = 2651554_ikind
+ integer(ikind), parameter :: mL = 15184245_ikind
+ integer(ikind), parameter :: mask24 = 16777215_ikind
+ integer(ikind), parameter :: mask48 = 281474976710655_ikind
+ integer(ikind), parameter :: default_seed = 48131768981101_ikind
+ integer(ikind), save :: seed = default_seed
+
+! integer(ikind), save :: mH
+! integer(ikind), save :: mL
+! integer(ikind), save :: mask24
+! integer(ikind), save :: mask48
+! integer(ikind), save :: default_seed
+! integer(ikind), save :: seed
+!
+! data mH /o"12072642"/
+! data mL /o"71730565"/
+! data mask24 /o"77777777"/
+! data mask48 /o"7777777777777777"/
+! data default_seed /o"1274321477413155"/
+! data seed /o"1274321477413155"/
+
+end
+
+!-------------------------------------------------------------------------------
+function ranf()
+
+ use module_ranf
+ implicit none
+ real(rkind) :: ranf
+ integer(ikind) :: seedH, seedL
+
+ ! seed = mod(m * seed, 48):
+
+ seedH = iand(mask24, ishft(seed, -24_jkind))
+ seedL = iand(mask24, seed)
+ seed = iand(mask48, seedL * mL + ishft(seedL * mH + seedH * mL, 24_jkind))
+
+ ! normalize result:
+
+ ranf = real(seed, kind = rkind)
+ ranf = set_exponent(fraction(ranf), exponent(ranf) - 48_jkind)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine ranget(seed_out)
+
+ use module_ranf
+ implicit none
+ integer(ikind), intent(out) :: seed_out
+
+ seed_out = seed
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine ranset(seed_in, n_skip)
+
+ use module_ranf
+ implicit none
+ integer(ikind), intent(in) :: seed_in, n_skip
+ integer(ikind) :: n, mm, mmH, mmL, seedH, seedL
+
+ if (seed_in == 0_ikind) then
+ seed = default_seed
+ else
+ seed = iand(mask48, ibset(seed_in, 0_jkind))
+ endif
+
+ ! skip "n_skip" seeds [i.e. calculate seed = mod(m**n_skip * seed, 48)]:
+
+ n = iand(mask48, n_skip)
+ mmH = mH
+ mmL = mL
+
+ do while (n > 0_ikind)
+
+ if (btest(n, 0_ikind)) then ! seed = seed * mm
+ seedH = iand(mask24, ishft(seed, -24_jkind))
+ seedL = iand(mask24, seed)
+ seed = iand(mask48, seedL*mmL + ishft(seedL*mmH + seedH*mmL, 24_jkind))
+ endif
+
+ mm = iand(mask48, mmL * mmL + ishft(mmH * mmL, 25_jkind)) ! mm = mm * mm
+ mmH = iand(mask24, ishft(mm, -24_jkind))
+ mmL = iand(mask24, mm)
+
+ n = ishft(n, -1_jkind)
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.F90
new file mode 100644
index 0000000000000000000000000000000000000000..e87ba91aa5faa7fd9580607010cb9f0574de3b11
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.F90
@@ -0,0 +1,82 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! ranf_test - test of "ranf.F90"
+!
+!-------------------------------------------------------------------------------
+
+program ranf_test
+
+ implicit none
+ integer(8), parameter :: null = 0
+ integer(8) :: i, seed
+ real(8) :: x, ranf
+
+ write(6,400) "default seed:"
+
+ do i = 1, 10
+ call ranget(seed)
+ x = ranf()
+ write(6,410) i, seed, x
+ enddo
+
+ write(6,400) "seed = 4711:"
+
+ call ranset(4711_8, null)
+
+ do i = 1, 10
+ call ranget(seed)
+ x = ranf()
+ write(6,410) i, seed, x
+ enddo
+
+ write(6,400) "seed varies, no skip:"
+
+ do i = -10, 20
+ call ranset(i, null)
+ call ranget(seed)
+ x = ranf()
+ write(6,410) i, seed, x
+ enddo
+
+ write(6,400) "default seed, skip varies:"
+
+ do i = -10, 20
+ call ranset(null, i)
+ call ranget(seed)
+ x = ranf()
+ write(6,410) i, seed, x
+ enddo
+
+ write(6,400) "large seeds, no skip:"
+
+ do i = 0, 47
+ call ranset(2_8**i, null)
+ call ranget(seed)
+ x = ranf()
+ write(6,410) i, seed, x
+ enddo
+
+ write(6,400) "default seeds, big skips:"
+
+ do i = 0, 47
+ call ranset(null, 2_8**i)
+ call ranget(seed)
+ x = ranf()
+ write(6,410) i, seed, x
+ enddo
+
+
+400 format (//1x,a//)
+410 format (i6,i24,f24.16)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.reference b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.reference
new file mode 100644
index 0000000000000000000000000000000000000000..cd9731dca6ac5f2b10293d5a2749ab1d7b2b12ab
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.reference
@@ -0,0 +1,208 @@
+
+
+ default seed:
+
+
+ 1 48131768981101 0.5801136485795872
+ 2 163287475723473 0.9505127349807658
+ 3 267545549941893 0.7863714253306036
+ 4 221343878630857 0.2976202640037293
+ 5 83772656879069 0.4536999002984921
+ 6 127705168870145 0.0062619416061871
+ 7 1762579867765 0.2757364263838760
+ 8 77612904194681 0.3056509438704786
+ 9 86033092307533 0.6891007107498730
+ 10 193964606509617 0.3826622386562981
+
+
+ seed = 4711:
+
+
+ 1 4711 0.5499394951912784
+ 2 154794206601235 0.1941472565046602
+ 3 54647594503087 0.6685086396955562
+ 4 188168453789179 0.7925054533613682
+ 5 223070454027959 0.8427178850184980
+ 6 237203997059235 0.0340008174584874
+ 7 9570379302271 0.6038999714237825
+ 8 169982730392075 0.9377837430456815
+ 9 263962657233415 0.8757572329452152
+ 10 246503746747443 0.5980755473195210
+
+
+ seed varies, no skip:
+
+
+ -10 281474976710647 0.5775951060376308
+ -9 281474976710647 0.5775951060376308
+ -8 281474976710649 0.8936850824737128
+ -7 281474976710649 0.8936850824737128
+ -6 281474976710651 0.2097750589097949
+ -5 281474976710651 0.2097750589097949
+ -4 281474976710653 0.5258650353458769
+ -3 281474976710653 0.5258650353458769
+ -2 281474976710655 0.8419550117819590
+ -1 281474976710655 0.8419550117819590
+ 0 48131768981101 0.5801136485795872
+ 1 1 0.1580449882180410
+ 2 3 0.4741349646541231
+ 3 3 0.4741349646541231
+ 4 5 0.7902249410902051
+ 5 5 0.7902249410902051
+ 6 7 0.1063149175262872
+ 7 7 0.1063149175262872
+ 8 9 0.4224048939623692
+ 9 9 0.4224048939623692
+ 10 11 0.7384948703984513
+ 11 11 0.7384948703984513
+ 12 13 0.0545848468345334
+ 13 13 0.0545848468345334
+ 14 15 0.3706748232706154
+ 15 15 0.3706748232706154
+ 16 17 0.6867647997066975
+ 17 17 0.6867647997066975
+ 18 19 0.0028547761427795
+ 19 19 0.0028547761427795
+ 20 21 0.3189447525788616
+
+
+ default seed, skip varies:
+
+
+ -10 65394301920949 0.4024069778585933
+ -9 113267494720953 0.0411126961321777
+ -8 11572195186317 0.3413527205224831
+ -7 96082249059185 0.7768798964931598
+ -6 218672250772389 0.7323797526691216
+ -5 206146573825897 0.4161318802907665
+ -4 117130711313405 0.8580567658715417
+ -3 241521508190113 0.9085015119759028
+ -2 255720441925013 0.9242771334193982
+ -1 260160884603417 0.1709983940440232
+ 0 48131768981101 0.5801136485795872
+ 1 163287475723473 0.9505127349807658
+ 2 267545549941893 0.7863714253306036
+ 3 221343878630857 0.2976202640037293
+ 4 83772656879069 0.4536999002984921
+ 5 127705168870145 0.0062619416061871
+ 6 1762579867765 0.2757364263838760
+ 7 77612904194681 0.3056509438704786
+ 8 86033092307533 0.6891007107498730
+ 9 193964606509617 0.3826622386562981
+ 10 107709844713829 0.1329027054963809
+ 11 37408785934377 0.8318579032090732
+ 12 234147183932349 0.5829797958307417
+ 13 164094224454241 0.0986253383374169
+ 14 27760564811605 0.2765484551335682
+ 15 77841469968089 0.6204460277969481
+ 16 174640031224365 0.0835029668338088
+ 17 23503995644817 0.9903771205956851
+ 18 278766376954437 0.9793469434430655
+ 19 275661658097289 0.6938844384181841
+ 20 195311106143645 0.9344770142467986
+
+
+ large seeds, no skip:
+
+
+ 0 1 0.1580449882180410
+ 1 3 0.4741349646541231
+ 2 5 0.7902249410902051
+ 3 9 0.4224048939623692
+ 4 17 0.6867647997066975
+ 5 33 0.2154846111953539
+ 6 65 0.2729242341726668
+ 7 129 0.3878034801272925
+ 8 257 0.6175619720365439
+ 9 513 0.0770789558550469
+ 10 1025 0.9961129234920527
+ 11 2049 0.8341808587660644
+ 12 4097 0.5103167293140878
+ 13 8193 0.8625884704101345
+ 14 16385 0.5671319526022280
+ 15 32769 0.9762189169864151
+ 16 65537 0.7943928457547891
+ 17 131073 0.4307407032915371
+ 18 262145 0.7034364183650332
+ 19 524289 0.2488278485120254
+ 20 1048577 0.3396107088060099
+ 21 2097153 0.5211764293939787
+ 22 4194305 0.8843078705699163
+ 23 8388609 0.6105707529217916
+ 24 16777217 0.0630965176255422
+ 25 33554433 0.9681480470330435
+ 26 67108865 0.7782511058480459
+ 27 134217729 0.3984572234780508
+ 28 268435457 0.6388694587380606
+ 29 536870913 0.1196939292580801
+ 30 1073741825 0.0813428702981192
+ 31 2147483649 0.0046407523781973
+ 32 4294967297 0.8512365165383535
+ 33 8589934593 0.5444280448586660
+ 34 17179869185 0.9308111014992910
+ 35 34359738369 0.7035772147805410
+ 36 68719476737 0.2491094413430410
+ 37 137438953473 0.3401738944680410
+ 38 274877906945 0.5223028007180410
+ 39 549755813889 0.8865606132180410
+ 40 1099511627777 0.6150762382180410
+ 41 2199023255553 0.0721074882180410
+ 42 4398046511105 0.9861699882180410
+ 43 8796093022209 0.8142949882180410
+ 44 17592186044417 0.4705449882180410
+ 45 35184372088833 0.7830449882180410
+ 46 70368744177665 0.4080449882180410
+ 47 140737488355329 0.6580449882180410
+
+
+ default seeds, big skips:
+
+
+ 0 163287475723473 0.9505127349807658
+ 1 267545549941893 0.7863714253306036
+ 2 83772656879069 0.4536999002984921
+ 3 86033092307533 0.6891007107498730
+ 4 174640031224365 0.0835029668338088
+ 5 164662163201517 0.9030257778452473
+ 6 187456359032173 0.6504473584280426
+ 7 247956895216749 0.1291985634769084
+ 8 256822357742189 0.7782475093925463
+ 9 15047453609581 0.7480693881996707
+ 10 59815078594157 0.7411042261636034
+ 11 36250749236845 0.3277774080605163
+ 12 65623225914989 0.2811012722930322
+ 13 17231226705517 0.8281668150128247
+ 14 127417138878061 0.6480316574714529
+ 15 67360884790893 0.5031963704648810
+ 16 92173458085485 0.5752659087564247
+ 17 158548977129069 0.1663654345582621
+ 18 76826528323181 0.1364062830369370
+ 19 181387589981805 0.2278551674942868
+ 20 55158666827373 0.0162216864089864
+ 21 150146494895725 0.0148297242383855
+ 22 41054988277357 0.6995457998971837
+ 23 33978207573613 0.8189779512147801
+ 24 19824646166125 0.0578422538499730
+ 25 272992500061805 0.5355708591203587
+ 26 216378254431853 0.4910280696611302
+ 27 103149763171949 0.4019424907426732
+ 28 158167757362797 0.2237713329057591
+ 29 268203745744493 0.8674290172319310
+ 30 206800745797229 0.1547443858842747
+ 31 83994745902701 0.7293751231889622
+ 32 119857722824301 0.8786365977983372
+ 33 191583676667501 0.1771595470170872
+ 34 53560607643245 0.7742054454545872
+ 35 58989446305389 0.9682972423295872
+ 36 69847123629677 0.3564808360795872
+ 37 91562478278253 0.1328480235795872
+ 38 134993187575405 0.6855823985795872
+ 39 221854606169709 0.7910511485795872
+ 40 114102466647661 0.0019886485795872
+ 41 180073164314221 0.4238636485795872
+ 42 30539582936685 0.2676136485795872
+ 43 12947396892269 0.9551136485795872
+ 44 259238001514093 0.3301136485795872
+ 45 188869257336429 0.0801136485795872
+ 46 48131768981101 0.5801136485795872
+ 47 48131768981101 0.5801136485795872
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/sc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/sc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..47f85b02af90ccee3a894027c591284f6499918e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/sc.F90
@@ -0,0 +1,278 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! sc.F90 - routines for the Spin-Colour field
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_sc_size
+
+ integer :: sc_n_real ! number of real numbers of an sc-field
+ integer :: sc_n_complex ! number of complex numbers of an sc-field
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_module_sc_size()
+
+ use module_sc_size
+ use module_vol
+ implicit none
+
+ sc_n_complex = NDIRAC * NCOL * volh
+ sc_n_real = NDIRAC * NCOL * volh * SIZE_COMPLEX
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine sc_zero(out)
+
+ use module_sc_size
+ implicit none
+ REAL, intent(out) :: out(*)
+ integer :: i
+
+ TIMING_START(timing_bin_sc_zero)
+
+ !$omp parallel do
+ do i = 1, sc_n_real
+ out(i) = ZERO
+ enddo
+
+ TIMING_STOP(timing_bin_sc_zero)
+end
+
+!-------------------------------------------------------------------------------
+subroutine sc_copy(out, in)
+
+ use module_sc_size
+ implicit none
+ REAL, intent(out) :: out(*)
+ REAL, intent(in) :: in(*)
+ integer :: i
+
+ TIMING_START(timing_bin_sc_copy)
+
+ !$omp parallel do
+ do i = 1, sc_n_real
+ out(i) = in(i)
+ enddo
+
+ TIMING_STOP(timing_bin_sc_copy)
+end
+
+!-------------------------------------------------------------------------------
+subroutine sc_scale(inout, factor)
+
+ use module_sc_size
+ implicit none
+ REAL, intent(inout) :: inout(*)
+ REAL, intent(in) :: factor
+ integer :: i
+
+ TIMING_START(timing_bin_sc_scale)
+
+ !$omp parallel do
+ do i = 1, sc_n_real
+ inout(i) = inout(i) * factor
+ enddo
+
+ TIMING_STOP(timing_bin_sc_scale)
+end
+
+!-------------------------------------------------------------------------------
+subroutine sc_cax2(out, in1, a1, in2, a2) ! out = a1 * in1 + a2 * in2
+
+ use module_sc_size
+ implicit none
+ COMPLEX, intent(out) :: out(*)
+ COMPLEX, intent(in) :: in1(*), in2(*)
+ COMPLEX, intent(in) :: a1, a2
+ integer :: i
+
+ TIMING_START(timing_bin_sc_cax2)
+
+ !$omp parallel do
+ do i = 1, sc_n_complex
+ out(i) = a1 * in1(i) + a2 * in2(i)
+ enddo
+
+ TIMING_STOP(timing_bin_sc_cax2)
+end
+
+!-------------------------------------------------------------------------------
+subroutine sc_axpy(inout, in, a) ! inout = inout + a * in
+
+ use module_sc_size
+ implicit none
+ REAL, intent(inout) :: inout(*)
+ REAL, intent(in) :: in(*)
+ REAL, intent(in) :: a
+ integer :: i
+
+ TIMING_START(timing_bin_sc_axpy)
+
+ !$omp parallel do
+ do i = 1, sc_n_real
+ inout(i) = inout(i) + a * in(i)
+ enddo
+
+ TIMING_STOP(timing_bin_sc_axpy)
+end
+
+!-------------------------------------------------------------------------------
+subroutine sc_caxpy(inout, in, a) ! inout = inout + a * in
+
+ use module_sc_size
+ implicit none
+ COMPLEX, intent(inout) :: inout(*)
+ COMPLEX, intent(in) :: in(*)
+ COMPLEX, intent(in) :: a
+ integer :: i
+
+ TIMING_START(timing_bin_sc_caxpy)
+
+ !$omp parallel do
+ do i = 1, sc_n_complex
+ inout(i) = inout(i) + a * in(i)
+ enddo
+
+ TIMING_STOP(timing_bin_sc_caxpy)
+end
+
+!-------------------------------------------------------------------------------
+subroutine sc_caxpy2(inout, in1, a1, in2, a2) ! inout = inout + a1*in1 + a2*in2
+
+ use module_sc_size
+ implicit none
+ COMPLEX, intent(inout) :: inout(*)
+ COMPLEX, intent(in) :: in1(*), in2(*)
+ COMPLEX, intent(in) :: a1, a2
+ integer :: i
+
+ TIMING_START(timing_bin_sc_caxpy2)
+
+ !$omp parallel do
+ do i = 1, sc_n_complex
+ inout(i) = inout(i) + a1 * in1(i) + a2 * in2(i)
+ enddo
+
+ TIMING_STOP(timing_bin_sc_caxpy2)
+end
+
+!-------------------------------------------------------------------------------
+subroutine sc_xpby(inout, in, b) ! inout = b * inout + in
+
+ use module_sc_size
+ implicit none
+ REAL, intent(inout) :: inout(*)
+ REAL, intent(in) :: in(*)
+ REAL, intent(in) :: b
+ integer :: i
+
+ TIMING_START(timing_bin_sc_xpby)
+
+ !$omp parallel do
+ do i = 1, sc_n_real
+ inout(i) = b * inout(i) + in(i)
+ enddo
+
+ TIMING_STOP(timing_bin_sc_xpby)
+end
+
+!-------------------------------------------------------------------------------
+subroutine sc_axpby(inout, in, b, a) ! inout = b * inout + a * in
+
+ use module_sc_size
+ implicit none
+ REAL, intent(inout) :: inout(*)
+ REAL, intent(in) :: in(*)
+ REAL, intent(in) :: b, a
+ integer :: i
+
+ TIMING_START(timing_bin_sc_axpby)
+
+ !$omp parallel do
+ do i = 1, sc_n_real
+ inout(i) = b * inout(i) + a * in(i)
+ enddo
+
+ TIMING_STOP(timing_bin_sc_axpby)
+end
+
+!-------------------------------------------------------------------------------
+REAL function sc_norm2(in) ! Sum_i abs(in_i)**2
+
+ use module_sc_size
+ implicit none
+ REAL, intent(in) :: in(*)
+ REAL :: tmp
+ integer :: i
+
+ TIMING_START(timing_bin_sc_norm2)
+
+ tmp = ZERO
+ !$omp parallel do reduction(+: tmp)
+ do i = 1, sc_n_real
+ tmp = tmp + in(i)**2
+ enddo
+
+ sc_norm2 = tmp
+
+ TIMING_STOP(timing_bin_sc_norm2)
+end
+
+!-------------------------------------------------------------------------------
+REAL function sc_dot(x, y) ! Sum_i [Re(x_i) * Re(y_i) + Im(x_i) * Im(y_i)]
+
+ use module_sc_size
+ implicit none
+ REAL, intent(in) :: x(*), y(*)
+ REAL :: tmp
+ integer :: i
+
+ TIMING_START(timing_bin_sc_dot)
+
+ tmp = ZERO
+ !$omp parallel do reduction(+: tmp)
+ do i = 1, sc_n_real
+ tmp = tmp + x(i) * y(i)
+ enddo
+
+ sc_dot = tmp
+
+ TIMING_STOP(timing_bin_sc_dot)
+end
+
+!-------------------------------------------------------------------------------
+COMPLEX function sc_cdotc(x, y) ! Sum_i conjg(x_i) * y_i
+
+ use module_sc_size
+ implicit none
+ COMPLEX, intent(in) :: x(*), y(*)
+ COMPLEX :: tmp
+ integer :: i
+
+ TIMING_START(timing_bin_sc_cdotc)
+
+ tmp = ZERO
+ !$omp parallel do reduction(+: tmp)
+ do i = 1, sc_n_complex
+ tmp = tmp + conjg(x(i)) * y(i)
+ enddo
+
+ sc_cdotc = tmp
+
+ TIMING_STOP(timing_bin_sc_cdotc)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/service.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/service.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f7981db72b83e9a6be52318313ac9aca84379ac9
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/service.F90
@@ -0,0 +1,70 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-ibm.F90 - calls to service routines on IBM
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()
+
+ integer(4) status
+ status = 1
+ call exit(status)
+end
+
+!-------------------------------------------------------------------------------
+function rechner() ! returns hostname
+
+ character(len = 20) rechner
+ character(len = 32) r
+
+ call hostnm(r)
+ rechner = r
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()
+
+ real(8) rtc
+
+ sekunden = rtc() !!!mclock() * 0.01
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)
+
+ implicit none
+ integer :: iarg, larg, status
+ integer(4) :: i
+ character(len = *) :: arg
+ character(len(arg) + 1) :: a
+
+ i = iarg
+ call getarg(i, a)
+
+ larg = len_trim(a)
+ arg = a
+ status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()
+ integer(4) :: iargc
+ ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()
+ is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/staple.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/staple.F90
new file mode 100644
index 0000000000000000000000000000000000000000..a06be5d95f528d6deda28562173fa1aa6f902bcc
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/staple.F90
@@ -0,0 +1,67 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! staple.F90 - calculates sum of staples for one link
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine staple(uuu, u, i, e, mu)
+
+ use module_nn
+ use module_vol
+ implicit none
+
+ SU3, intent(out) :: uuu
+ GAUGE_FIELD, intent(in) :: u
+ integer, intent(in) :: i, e, mu
+ integer :: o, nu, j1, j2, j3, j4
+
+ o = EVEN + ODD - e
+ uuu = 0
+
+ do nu = 1, DIM
+ if (nu /= mu) then
+
+ ! (j2,o) --<-- x nu
+ ! | |
+ ! v ^ ^
+ ! | | |
+ ! (i,e) -->-- (j1,o) x--> mu
+ ! | |
+ ! ^ v
+ ! | |
+ ! (j3,o) --<-- (j4,e)
+
+
+ j1 = nn(i, e, mu, FWD)
+ j2 = nn(i, e, nu, FWD)
+ j3 = nn(i, e, nu, BWD)
+ j4 = nn(j3,o, mu, FWD)
+
+ if (j4 /= nn(j1, o, nu, BWD)) call die('staple(): j4 inconsistent')
+ if (nn(j1, o, nu, FWD) /= nn(j2, o, mu, FWD)) &
+ call die('staple(): j12 inconsistent')
+
+ call uuu_fwd(uuu, u(1, 1, j1, o, nu), &
+ u(1, 1, j2, o, mu), &
+ u(1, 1, i, e, nu))
+
+ call uuu_bwd(uuu, u(1, 1, j4, e, nu), &
+ u(1, 1, j3, o, mu), &
+ u(1, 1, j3, o, nu))
+
+ endif
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/su3.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/su3.F90
new file mode 100644
index 0000000000000000000000000000000000000000..ad5cc768abdbd2f60aecfa362596150e5a9df702
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/su3.F90
@@ -0,0 +1,477 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! su3.F90 - SU(3) routines
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine gen2u(u, h) ! u := exp(i lambda_j h_j)
+
+! adapted from:
+
+! Program qcdf90, module generator_algebra, version 4.0.0
+
+! Copyright by Indranil Dasgupta, Andrea R. Levi, Vittorio Lubicz
+! and Claudio Rebbi - Boston University - January 1996
+! This program may be freely copied and used as long as this notice
+! is retained.
+
+ implicit none
+
+ GENERATOR, intent(in) :: h
+ SU3, intent(out) :: u
+
+ REAL :: p, q, a, alpha, l1, l2, l3, l12, s, aux, c, d, cs1, cs2
+ REAL :: a8, a12, a45, a67
+ GENERATOR :: h2, hs, hk, hs2
+ COMPLEX :: ck1, ck2, ck3, ck4
+ SU3 :: ms, mk
+ integer :: i, j, k
+
+ integer, parameter :: rkind = RKIND
+ COMPLEX, parameter :: iu = (ZERO, ONE)
+ REAL, parameter :: eps = 0.00000001_rkind
+
+ REAL, parameter :: sqrt33 = SQRT3 / THREE
+ REAL, parameter :: twosqrt33 = TWO * sqrt33
+
+! h2=.Sq.h, inlined:
+
+ a8 = h(8)*SQRT33
+ a12 = h(1)**2+h(2)**2
+ a45 = h(4)**2+h(5)**2
+ a67 = h(6)**2+h(7)**2
+ h2(1) = 2*h(1)*a8+h(4)*h(6)+h(5)*h(7)
+ h2(2) = 2*h(2)*a8+h(5)*h(6)-h(4)*h(7)
+ h2(3) = 2*h(3)*a8+HALF*(a45-a67)
+ h2(4) = h(4)*(h(3)-a8)+h(1)*h(6)-h(2)*h(7)
+ h2(5) = h(5)*(h(3)-a8)+h(1)*h(7)+h(2)*h(6)
+ h2(6) = h(6)*(-h(3)-a8)+h(1)*h(4)+h(2)*h(5)
+ h2(7) = h(7)*(-h(3)-a8)+h(1)*h(5)-h(2)*h(4)
+ h2(8) = (h(3)**2-h(8)**2+a12-HALF*(a45+a67))*SQRT33
+
+! q = .Tr.h, inlined:
+
+ q = h(1)**2
+ DO i = 2,8
+ q = q+h(i)**2
+ END DO
+ q = TWO*q
+
+! p = (h*h2)/THREE, inlined:
+
+ p = h(1)*h2(1)
+ DO i = 2,8
+ p = p+h(i)*h2(i)
+ END DO
+ p = TWO*p/THREE
+
+ a = SQRT(TWO*q/THREE)
+ alpha = ACOS(FOUR*p/a**3)/THREE
+ IF(alpha <= PI/6) THEN
+ l1 = a*COS(alpha)
+ l2 = a*COS(alpha+2*PI/3)
+ ELSE
+ l2 = a*COS(alpha)
+ l1 = a*COS(alpha+2*PI/3)
+ ENDIF
+ l3 = -l1-l2
+
+ l12 = l1*l2
+ s = -l1-2*l2
+
+ aux = (TWO*l3*l3+l12)*(l1-l2)
+ c = s*(l3*l3+TWO*l12)/aux
+ d = -THREE*s*l3/aux
+
+! hs = c*h+d*h2, and
+! hk = h-hs, inlined:
+
+ DO i = 1,8
+ hs(i) = c*h(i)+d*h2(i)
+ hk(i) = h(i)-hs(i)
+ END DO
+
+! hs2 = .Sq.hs, inlined:
+
+ a8 = hs(8)*SQRT33
+ a12 = hs(1)**2+hs(2)**2
+ a45 = hs(4)**2+hs(5)**2
+ a67 = hs(6)**2+hs(7)**2
+ hs2(1) = 2*hs(1)*a8+hs(4)*hs(6)+hs(5)*hs(7)
+ hs2(2) = 2*hs(2)*a8+hs(5)*hs(6)-hs(4)*hs(7)
+ hs2(3) = 2*hs(3)*a8+HALF*(a45-a67)
+ hs2(4) = hs(4)*(hs(3)-a8)+hs(1)*hs(6)-hs(2)*hs(7)
+ hs2(5) = hs(5)*(hs(3)-a8)+hs(1)*hs(7)+hs(2)*hs(6)
+ hs2(6) = hs(6)*(-hs(3)-a8)+hs(1)*hs(4)+hs(2)*hs(5)
+ hs2(7) = hs(7)*(-hs(3)-a8)+hs(1)*hs(5)-hs(2)*hs(4)
+ hs2(8) = (hs(3)**2-hs(8)**2+a12-HALF*(a45+a67))*SQRT33
+
+ IF(ABS(s) > eps) THEN
+ cs1 = SIN(s)/s
+ cs2 = (COS(s)-1)/s**2
+ ELSE
+ cs1 = 1
+ cs2 = -HALF
+ ENDIF
+
+ ck1 = EXP(IU*l3)
+ ck2 = 1/ck1**2
+ ck3 = (ck2+2*ck1)/3
+ IF(ABS(l3) > eps) THEN
+ ck4 = (ck1-ck2)/(3*l3)
+ ELSE
+ ck4 = 3*IU
+ ENDIF
+
+! aux = .Tr.hs, inlined:
+
+ aux = hs(1)**2
+ DO i = 2,8
+ aux = aux+hs(i)**2
+ END DO
+ aux = TWO*aux
+
+! ms = UNIT+IU*cs1*(.Matrix.hs.)+cs2*(.Matrix.hs)*(.Matrix.hs), inlined:
+
+ ms(1,1) = ONE+cs2*aux/THREE &
+ +CMPLX(cs2*(hs2(3)+SQRT33*hs2(8)),cs1*(hs(3)+SQRT33*hs(8)),RKIND)
+ ms(2,2) = ONE+cs2*aux/THREE &
+ +CMPLX(cs2*(-hs2(3)+SQRT33*hs2(8)),cs1*(-hs(3)+SQRT33*hs(8)),RKIND)
+ ms(3,3) = ONE+cs2*aux/THREE &
+ +CMPLX(-cs2*TWOSQRT33*hs2(8),-cs1*TWOSQRT33*hs(8),RKIND)
+ ms(1,2) = CMPLX(cs2*hs2(1)+cs1*hs(2),cs1*hs(1)-cs2*hs2(2),RKIND)
+ ms(2,1) = CMPLX(cs2*hs2(1)-cs1*hs(2),cs1*hs(1)+cs2*hs2(2),RKIND)
+ ms(1,3) = CMPLX(cs2*hs2(4)+cs1*hs(5),cs1*hs(4)-cs2*hs2(5),RKIND)
+ ms(3,1) = CMPLX(cs2*hs2(4)-cs1*hs(5),cs1*hs(4)+cs2*hs2(5),RKIND)
+ ms(2,3) = CMPLX(cs2*hs2(6)+cs1*hs(7),cs1*hs(6)-cs2*hs2(7),RKIND)
+ ms(3,2) = CMPLX(cs2*hs2(6)-cs1*hs(7),cs1*hs(6)+cs2*hs2(7),RKIND)
+
+! mk = ck3*UNIT+ck4*(.Matrix.hk), inlined:
+
+ mk(1,1) = ck3+ck4*(hk(3)+SQRT33*hk(8))
+ mk(2,2) = ck3+ck4*(-hk(3)+SQRT33*hk(8))
+ mk(3,3) = ck3-ck4*TWOSQRT33*hk(8)
+ mk(1,2) = ck4*CMPLX(hk(1),-hk(2),RKIND)
+ mk(2,1) = ck4*CMPLX(hk(1),hk(2),RKIND)
+ mk(1,3) = ck4*CMPLX(hk(4),-hk(5),RKIND)
+ mk(3,1) = ck4*CMPLX(hk(4),hk(5),RKIND)
+ mk(2,3) = ck4*CMPLX(hk(6),-hk(7),RKIND)
+ mk(3,2) = ck4*CMPLX(hk(6),hk(7),RKIND)
+
+! u = ms*mk, inlined:
+
+ DO i = 1,3
+ DO j = 1,3
+ u(i,j) = ms(i,1)*mk(1,j)
+ DO k = 2,3
+ u(i,j) = u(i,j)+ms(i,k)*mk(k,j)
+ END DO
+ END DO
+ END DO
+
+END
+
+!-------------------------------------------------------------------------------
+subroutine im_tr_j(p, u, s) ! p(j) := p(j) + s * Im Tr(lambda_j U)
+
+ implicit none
+
+ GENERATOR :: p
+ SU3 :: u
+ REAL :: s
+
+ p(1) = p(1) + s * (Im(u(1, 2)) + Im(u(2, 1)))
+ p(2) = p(2) + s * (Re(u(1, 2)) - Re(u(2, 1)))
+ p(3) = p(3) + s * (Im(u(1, 1)) - Im(u(2, 2)))
+ p(4) = p(4) + s * (Im(u(1, 3)) + Im(u(3, 1)))
+ p(5) = p(5) + s * (Re(u(1, 3)) - Re(u(3, 1)))
+ p(6) = p(6) + s * (Im(u(2, 3)) + Im(u(3, 2)))
+ p(7) = p(7) + s * (Re(u(2, 3)) - Re(u(3, 2)))
+ p(8) = p(8) + s * (Im(u(1, 1)) + Im(u(2, 2)) - TWO * Im(u(3, 3))) / SQRT3
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine re_tr_j(p, u, s) ! p(j) := p(j) + s * Re Tr(lambda_j U)
+
+ implicit none
+
+ GENERATOR :: p
+ SU3 :: u
+ REAL :: s
+
+ p(1) = p(1) + s * (Re(u(1, 2)) + Re(u(2, 1)))
+ p(2) = p(2) + s * (Im(u(2, 1)) - Im(u(1, 2)))
+ p(3) = p(3) + s * (Re(u(1, 1)) - Re(u(2, 2)))
+ p(4) = p(4) + s * (Re(u(1, 3)) + Re(u(3, 1)))
+ p(5) = p(5) + s * (Im(u(3, 1)) - Im(u(1, 3)))
+ p(6) = p(6) + s * (Re(u(2, 3)) + Re(u(3, 2)))
+ p(7) = p(7) + s * (Im(u(3, 2)) - Im(u(2, 3)))
+ p(8) = p(8) + s * (Re(u(1, 1)) + Re(u(2, 2)) - TWO * Re(u(3, 3))) / SQRT3
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine su3_check(u) ! checks if "u" is in SU(3)
+
+ implicit none
+ SU3 :: u, v
+ SU3, parameter :: su3_one = reshape( &
+ (/ ONE,ZERO,ZERO, &
+ ZERO,ONE,ZERO, &
+ ZERO,ZERO,ONE /), &
+ (/ NCOL, NCOL /))
+ REAL, parameter :: eps = 1e-13
+ REAL :: dev
+ integer :: i, j
+
+ call uud(v, u, u)
+ dev = ZERO
+ do i = 1, NCOL
+ do j = 1, NCOL
+ dev = dev + abs(Re(v(i, j)) - Re(su3_one(i, j))) &
+ + abs(Im(v(i, j)) - Im(su3_one(i, j)))
+ enddo
+ enddo
+
+ if (dev > eps) call die('su3_check(): dev > eps')
+
+ call su3_check_det(u)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine su3_check_det(u)
+
+ implicit none
+ COMPLEX :: det
+ SU3 :: u
+ REAL, parameter :: eps = 1e-13
+
+ det = u(1,1) * u(2,2) * u(3,3) &
+ + u(1,2) * u(2,3) * u(3,1) &
+ + u(1,3) * u(2,1) * u(3,2) &
+ - u(1,1) * u(2,3) * u(3,2) &
+ - u(1,2) * u(2,1) * u(3,3) &
+ - u(1,3) * u(2,2) * u(3,1)
+
+ if (abs(Re(det) - ONE) > eps) call die("check_su3_det(): Re(det) /= 1")
+ if (abs(Im(det)) > eps) call die("check_su3_det(): Im(det) /= 0")
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine u_add(u, v) ! u := u + v
+
+ implicit none
+
+ SU3 :: u, v
+ integer i, j
+
+ do j = 1, NCOL
+ do i = 1, NCOL
+ u(i, j) = u(i, j) + v(i, j)
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine u_complete(u) ! calculate 3rd column form the first two
+
+ implicit none
+ SU3 :: u
+
+ u(1,3) = conjg(u(2,1) * u(3,2) - u(3,1) * u(2,2))
+ u(2,3) = conjg(u(3,1) * u(1,2) - u(1,1) * u(3,2))
+ u(3,3) = conjg(u(1,1) * u(2,2) - u(2,1) * u(1,2))
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine u_normalize(u)
+
+! from qcdsf_t3e program:
+!
+! u_normalize() takes a complex matrix and produces a true su3 matrix from
+! the upper 6 entries (because of FORTRAN the first two rows -> DIFFERS
+! FROM APE-PROGRAM!!)
+!
+! ( * * . )
+! ( * * . )
+! ( * * . ) (right 3 completely ignored)
+!
+! Normalization done by Gramm-Schmitt
+
+ implicit none
+ SU3 :: u
+ COMPLEX :: f
+ REAL :: len
+ integer :: i
+
+ len = real(u(1,1))**2 + aimag(u(1,1))**2 + & ! length u_1
+ real(u(2,1))**2 + aimag(u(2,1))**2 + &
+ real(u(3,1))**2 + aimag(u(3,1))**2
+ len = sqrt(len)
+
+ do i = 1, NCOL
+ u(i,1) = u(i,1) / len ! normalize u_1
+ enddo
+
+ f = u(1,2) * conjg(u(1,1)) + &
+ u(2,2) * conjg(u(2,1)) + &
+ u(3,2) * conjg(u(3,1))
+
+ do i = 1, NCOL
+ u(i,2) = u(i,2) - f * u(i,1) ! orthogonalize
+ enddo
+
+ len = real(u(1,2))**2 + aimag(u(1,2))**2 + & ! length u_2
+ real(u(2,2))**2 + aimag(u(2,2))**2 + &
+ real(u(3,2))**2 + aimag(u(3,2))**2
+ len = sqrt(len)
+
+ do i = 1, NCOL
+ u(i,2) = u(i,2) / len ! normalize u_2
+ enddo
+
+ call u_complete(u)
+
+! u(1,3) = conjg(u(2,1) * u(3,2) - u(3,1) * u(2,2)) ! calculate u_3
+! u(2,3) = conjg(u(3,1) * u(1,2) - u(1,1) * u(3,2)) ! = u_1 x u_2
+! u(3,3) = conjg(u(1,1) * u(2,2) - u(2,1) * u(1,2))
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine u_trans(u) ! u := transpose(u)
+
+ implicit none
+
+ SU3, intent(inout) :: u
+ COMPLEX :: tmp
+ integer :: i, j
+
+ do j = 1, NCOL
+ do i = j + 1, NCOL
+ tmp = u(i, j)
+ u(i, j) = u(j, i)
+ u(j, i) = tmp
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine u_update(u, v) ! u = v * u
+
+ implicit none
+
+ SU3, intent(in) :: v
+ SU3, intent(inout) :: u
+ SU3 :: w
+
+ w = u
+ call uu(u, v, w)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine u_update2(u, v) ! u = u * v
+
+ implicit none
+
+ SU3, intent(in) :: v
+ SU3, intent(inout) :: u
+ SU3 :: w
+
+ w = u
+ call uu(u, w, v)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine uu(r, a, b) ! r = a * b
+
+ implicit none
+
+ SU3 :: r, a, b
+ integer :: i, j
+
+ do i = 1, NCOL
+ do j = 1, NCOL
+ r(i, j) = a(i, 1) * b(1, j) &
+ + a(i, 2) * b(2, j) &
+ + a(i, 3) * b(3, j)
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine uud(r, a, b) ! U U^dagger: r = a * b+
+
+ implicit none
+
+ SU3 :: r, a, b
+ integer :: i, j
+
+ do i = 1, NCOL
+ do j = 1, NCOL
+ r(i, j) = a(i, 1) * conjg(b(j, 1)) &
+ + a(i, 2) * conjg(b(j, 2)) &
+ + a(i, 3) * conjg(b(j, 3))
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine udu(r, a, b) ! U^dagger U: r = a+ * b
+
+ implicit none
+
+ SU3 :: r, a, b
+ integer :: i, j
+
+ do i = 1, NCOL
+ do j = 1, NCOL
+ r(i, j) = conjg(a(1, i)) * b(1, j) &
+ + conjg(a(2, i)) * b(2, j) &
+ + conjg(a(3, i)) * b(3, j)
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+REAL function Re_Tr_uu(u, v) ! returns Re(Tr(u * v))
+
+ implicit none
+ SU3, intent(in) :: u, v
+ REAL :: p
+ integer :: c1, c2
+
+ p = 0
+ do c2 = 1, NCOL
+ do c1 = 1, NCOL
+ p = p + Re(u(c2, c1)) * Re(v(c1, c2)) &
+ - Im(u(c2, c1)) * Im(v(c1, c2))
+ enddo
+ enddo
+
+ Re_Tr_uu = p
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/swap.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/swap.F90
new file mode 100644
index 0000000000000000000000000000000000000000..d4d924dc22f69551c01e5bd8c923a1d6df64e89f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/swap.F90
@@ -0,0 +1,90 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! swap.F90 - swap routines for various data types
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine swap_p_g_field(u, v)
+
+ implicit none
+ P_GAUGE_FIELD :: u, v, tmp
+
+ tmp => u
+ u => v
+ v => tmp
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine swap_p_sc_field(a, b)
+
+ implicit none
+ P_SPINCOL_FIELD :: a, b, tmp
+
+ tmp => a
+ a => b
+ b => tmp
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine swap_p_clover_field_a(x, y)
+
+ use typedef_clover
+ implicit none
+ P_CLOVER_FIELD_A :: x, y, tmp
+
+ tmp => x
+ x => y
+ y => tmp
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine swap_p_clover_field_b(x, y)
+
+ use typedef_clover
+ implicit none
+ P_CLOVER_FIELD_B :: x, y, tmp
+
+ tmp => x
+ x => y
+ y => tmp
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine swap_real(x, y)
+
+ implicit none
+ REAL :: x, y, tmp
+
+ tmp = x
+ x = y
+ y = tmp
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine swap_integer(x, y)
+
+ implicit none
+ integer :: x, y, tmp
+
+ tmp = x
+ x = y
+ y = tmp
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/test_echo.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/test_echo.F90
new file mode 100644
index 0000000000000000000000000000000000000000..4533c28887c30a2cf91018d0ae019770ae14e9ff
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/test_echo.F90
@@ -0,0 +1,9 @@
+program bqcd_echo
+
+ character(2) :: arg
+
+ do i = 1, ipxfargc()
+ call pxfgetarg(i, arg, length, istat)
+ write(6,*) i, ":", arg(1:length), ":"
+ enddo
+end
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/timing.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/timing.F90
new file mode 100644
index 0000000000000000000000000000000000000000..7393ff71738c3b81f1153846dd19bdd8afa773c3
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/timing.F90
@@ -0,0 +1,324 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! timing.F90 - measurements of execution times and performance
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_timing_meas
+
+ implicit none
+
+ type type_timing
+ SECONDS time
+ SECONDS total_time
+ integer n_call
+ integer fill_the_cache_line
+ end type type_timing
+
+ integer, parameter :: n_timing = 55
+ type(type_timing), save :: meas(n_timing) ! measurements
+!dir$ cache_align meas
+
+ data meas /n_timing * type_timing(0.0, 0.0, 0, 0)/
+
+end
+
+!-------------------------------------------------------------------------------
+module module_timing_write
+
+ use module_timing_meas
+ implicit none
+
+
+ character(len = 16), save :: text(n_timing) ! region name
+ integer, save :: n_op(n_timing) ! # operations
+
+ data text(timing_bin_d_xf) /"d_xf"/
+ data text(timing_bin_d_xb) /"d_xb"/
+ data text(timing_bin_d_yf) /"d_yf"/
+ data text(timing_bin_d_yb) /"d_yb"/
+ data text(timing_bin_d_zf) /"d_zf"/
+ data text(timing_bin_d_zb) /"d_zb"/
+ data text(timing_bin_d_t) /"d_t"/
+ data text(timing_bin_d) /"D_TOTAL"/
+
+ data text(timing_bin_global_sum) /"global_sum"/
+ data text(timing_bin_global_sum_vec) /"global_sum_vec"/
+ data text(timing_bin_sc_zero) /"sc_zero"/
+ data text(timing_bin_sc_copy) /"sc_copy"/
+ data text(timing_bin_sc_scale) /"sc_scale"/
+ data text(timing_bin_sc_axpy) /"sc_axpy"/
+ data text(timing_bin_sc_caxpy) /"sc_caxpy"/
+ data text(timing_bin_sc_caxpy2) /"sc_caxpy2"/
+ data text(timing_bin_sc_cax2) /"sc_cax2"/
+ data text(timing_bin_sc_xpby) /"sc_xpby"/
+ data text(timing_bin_sc_axpby) /"sc_axpby"/
+ data text(timing_bin_sc_norm2) /"sc_norm2"/
+ data text(timing_bin_sc_dot) /"sc_dot"/
+ data text(timing_bin_sc_cdotc) /"sc_cdotc"/
+
+ data text(timing_bin_plaq) /"plaquette"/
+ data text(timing_bin_cooling) /"cooling"/
+ data text(timing_bin_u_read) /"u_read"/
+ data text(timing_bin_u_write) /"u_write"/
+
+ data text(timing_bin_total) /"TOTAL"/
+ data text(timing_bin_hmc) /"HMC"/
+ data text(timing_bin_cg) /"CG"/
+ data text(timing_bin_mtdagmt) /"MTDAGMT"/
+
+ data text(timing_bin_dsf) /"dsf"/
+ data text(timing_bin_dsg) /"dsg"/
+ data text(timing_bin_hmc_u) /"hmc_u"/
+ data text(timing_bin_hmc_init_p) /"hmc_init_p"/
+
+ data text(timing_bin_clover_init) /"clover_init"/
+ data text(timing_bin_clover_mult_a) /"clover_mult_a"/
+ data text(timing_bin_clover_mult_ao)/"clover_mult_ao"/
+ data text(timing_bin_clover_mult_b) /"clover_mult_b"/
+ data text(timing_bin_clover_dsd) /"clover_dsd"/
+ data text(timing_bin_clover_dsf) /"clover_dsf"/
+
+ data text(timing_bin_hmc_init) /"hmc_init"/
+ data text(timing_bin_hmc_momenta) /"hmc_momenta"/
+ data text(timing_bin_hmc_init_phi) /"hmc_phi"/
+ data text(timing_bin_hmc_h_old) /"hmc_h_old"/
+ data text(timing_bin_hmc_backup) /"hmc_backup"/
+ data text(timing_bin_hmc_half_step0)/"hmc_half_step0"/
+ data text(timing_bin_hmc_half_step1)/"hmc_half_step1"/
+ data text(timing_bin_hmc_xbound_g) /"hmc_xbound_g"/
+ data text(timing_bin_hmc_steps) /"hmc_steps"/
+ data text(timing_bin_hmc_h_new) /"hmc_h_new"/
+ data text(timing_bin_hmc_rest) /"hmc_rest"/
+
+ data text(timing_bin_h_mult_a) /"h_mult_a"/
+ data text(timing_bin_h_mult_b) /"h_mult_b"/
+ data text(timing_bin_h_mult_c) /"h_mult_c"/
+
+ data text(timing_bin_sc2_projection)/"sc2_projection"/
+
+ integer, parameter :: op_add = 2 ! operations per complex add.
+ integer, parameter :: op_mult = 6 ! operations per complex mult.
+
+ integer, parameter, private :: op_d_xyz = 18 * op_mult + 30 * op_add
+ integer, parameter, private :: op_d_t = 36 * op_mult + 24 * op_add + 24
+
+ integer, parameter :: op_d = 6 * op_d_xyz + op_d_t
+
+ integer, parameter, private :: op_uuu = 162 * op_mult
+ integer, parameter, private :: op_re_tr = 36
+ integer, parameter, private :: op_plaq = 2 * 6 * (op_uuu + op_re_tr)
+
+ integer, parameter :: op_sc_r = NDIRAC * NCOL * SIZE_COMPLEX
+ integer, parameter :: op_sc_c = NDIRAC * NCOL
+
+ integer, parameter :: op_blas_r1 = op_sc_r
+ integer, parameter :: op_blas_r2 = op_sc_r * 2
+ integer, parameter :: op_blas_r3 = op_sc_r * 3
+ integer, parameter :: op_blas_c1 = op_sc_c * op_mult
+ integer, parameter :: op_blas_c2 = op_sc_c * (op_mult + op_add)
+ integer, parameter :: op_blas_c3 = 2 * op_blas_c2
+
+ integer, parameter :: op_clov = 84 * op_mult + 60 * op_add + 24
+ integer, parameter :: op_h_mult= NDIRAC * NCOL * 3
+
+ integer, parameter :: op_sc2_proj = 36 * op_add
+
+ data n_op(timing_bin_d_xf) /op_d_xyz/
+ data n_op(timing_bin_d_xb) /op_d_xyz/
+ data n_op(timing_bin_d_yf) /op_d_xyz/
+ data n_op(timing_bin_d_yb) /op_d_xyz/
+ data n_op(timing_bin_d_zf) /op_d_xyz/
+ data n_op(timing_bin_d_zb) /op_d_xyz/
+ data n_op(timing_bin_d_t) /op_d_t/
+ data n_op(timing_bin_d) /op_d/
+
+ data n_op(timing_bin_global_sum) /0/
+ data n_op(timing_bin_global_sum_vec) /0/
+ data n_op(timing_bin_sc_zero) /0/
+ data n_op(timing_bin_sc_copy) /0/
+ data n_op(timing_bin_sc_scale) /op_blas_r1/
+ data n_op(timing_bin_sc_norm2) /op_blas_r2/
+ data n_op(timing_bin_sc_dot) /op_blas_r2/
+ data n_op(timing_bin_sc_axpy) /op_blas_r2/
+ data n_op(timing_bin_sc_xpby) /op_blas_r2/
+ data n_op(timing_bin_sc_axpby) /op_blas_r3/
+ data n_op(timing_bin_sc_cdotc) /op_blas_c2/
+ data n_op(timing_bin_sc_caxpy) /op_blas_c2/
+ data n_op(timing_bin_sc_caxpy2) /op_blas_c3/
+ data n_op(timing_bin_sc_cax2) /op_blas_c3/
+
+ data n_op(timing_bin_plaq) /op_plaq/
+ data n_op(timing_bin_cooling) /0/
+ data n_op(timing_bin_u_read) /0/
+ data n_op(timing_bin_u_write) /0/
+
+ data n_op(timing_bin_total) /0/
+ data n_op(timing_bin_hmc) /0/
+ data n_op(timing_bin_cg) /0/
+ data n_op(timing_bin_mtdagmt) /0/
+
+ data n_op(timing_bin_dsf) /0/
+ data n_op(timing_bin_dsg) /0/
+ data n_op(timing_bin_hmc_u) /0/
+
+ data n_op(timing_bin_clover_init) /0/
+ data n_op(timing_bin_clover_mult_a) /op_clov/
+ data n_op(timing_bin_clover_mult_ao)/op_clov/
+ data n_op(timing_bin_clover_mult_b) /op_clov/
+ data n_op(timing_bin_clover_dsd) /0/
+ data n_op(timing_bin_clover_dsf) /0/
+
+ data n_op(timing_bin_hmc_init) /0/
+ data n_op(timing_bin_hmc_momenta) /0/
+ data n_op(timing_bin_hmc_init_phi) /0/
+ data n_op(timing_bin_hmc_h_old) /0/
+ data n_op(timing_bin_hmc_backup) /0/
+ data n_op(timing_bin_hmc_half_step0)/0/
+ data n_op(timing_bin_hmc_half_step1)/0/
+ data n_op(timing_bin_hmc_xbound_g) /0/
+ data n_op(timing_bin_hmc_steps) /0/
+ data n_op(timing_bin_hmc_h_new) /0/
+ data n_op(timing_bin_hmc_rest) /0/
+
+ data n_op(timing_bin_h_mult_a) /op_h_mult/
+ data n_op(timing_bin_h_mult_b) /op_h_mult/
+ data n_op(timing_bin_h_mult_c) /op_h_mult/
+ data n_op(timing_bin_sc2_projection)/op_sc2_proj/
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine timing_start(bin)
+
+ use module_timing_meas
+ implicit none
+ integer bin
+ SECONDS sekunden
+
+ meas(bin)%time = sekunden()
+end
+
+!-------------------------------------------------------------------------------
+subroutine timing_stop(bin)
+
+ use module_timing_meas
+ implicit none
+ integer bin
+ SECONDS sekunden
+
+ meas(bin)%total_time = meas(bin)%total_time + sekunden() - meas(bin)%time
+ meas(bin)%n_call = meas(bin)%n_call + 1
+end
+
+!-------------------------------------------------------------------------------
+subroutine timing_write(unit)
+
+ use module_timing_write
+ use module_cg
+ use module_function_decl
+ use module_switches
+ use module_thread
+ use module_vol
+ implicit none
+
+ integer unit, i
+ integer ierror
+ integer op_mtdagmt
+ integer cg_calls, cg_iter
+ real mflops, mflops_mean, mflops_min, mflops_max
+ real total_gflops, time_mean
+
+
+ character(len = 8) :: a_mflops_mean, a_mflops_min, a_mflops_max, &
+ a_total_gflops, a_time_mean, a_n_call
+
+ character(*), parameter :: ifmt = "(i8)", ffmt = "(f8.2)", &
+ tab_fmt ="(2(3(1x,a),2x),1x,a)"
+
+ if (version_of_d() >= 2) then
+ n_op(timing_bin_d_xf) = n_op(timing_bin_d_xf) * 2
+ n_op(timing_bin_d_yf) = n_op(timing_bin_d_yf) * 2
+ n_op(timing_bin_d_zf) = n_op(timing_bin_d_zf) * 2
+ endif
+
+ op_mtdagmt = 2 * (2 * op_d + op_blas_r2)
+ if (switches%clover) op_mtdagmt = op_mtdagmt + 4 * op_clov
+ if (switches%h_ext) op_mtdagmt = op_mtdagmt + 4 * op_h_mult
+
+ if (version_of_d() == 21 .or. version_of_d() == 22) then
+ n_op(timing_bin_d_xf) = n_op(timing_bin_d_xf) - 12 * op_add
+ n_op(timing_bin_d_yf) = n_op(timing_bin_d_yf) - 12 * op_add
+ n_op(timing_bin_d_zf) = n_op(timing_bin_d_zf) - 12 * op_add
+ endif
+
+ cg_calls = meas(timing_bin_cg)%n_call
+ cg_iter = cg_iterations_total
+
+ n_op(timing_bin_mtdagmt) = op_mtdagmt
+ n_op(timing_bin_cg) = cg_iter * (op_mtdagmt + 5 * op_blas_r2) &
+ + cg_calls * (op_mtdagmt + op_blas_r1)
+ n_op(timing_bin_cg) = nint(real(n_op(timing_bin_cg)) / real(cg_calls))
+
+
+ call begin(unit, "Timing")
+ if (my_pe() == 0) then
+ write(unit,"(48x,a)") "Performance"
+ write(unit, tab_fmt) "region ", " #calls", " time", &
+ " mean", " min", " max", " Total"
+ write(unit, tab_fmt) " ", " ", " s", &
+ " Mflop/s", " Mflop/s", " Mflop/s", " Gflop/s"
+ write(unit, *)
+ endif
+
+ do i = 1, n_timing
+
+ write(a_n_call, ifmt) meas(i)%n_call
+ a_time_mean = " "
+ a_mflops_mean = " "
+ a_mflops_min = " "
+ a_mflops_max = " "
+ a_total_gflops = " "
+
+ if (meas(i)%n_call /= 0) then ! must be true on all PEs !!
+
+ time_mean = global_sum(real(meas(i)%total_time, kind=RKIND)) / num_pes()
+ write(a_time_mean, ffmt) time_mean
+
+ if (n_op(i) /= 0) then
+ mflops = 1e-6 * n_op(i) * volh * meas(i)%n_call / meas(i)%total_time
+ mflops_mean = global_sum(real(mflops, kind=RKIND)) / num_pes()
+
+ mflops_min = global_min(mflops)
+ mflops_max = global_max(mflops)
+
+ total_gflops = 1e-3 * mflops_mean * num_pes()
+
+ write(a_mflops_mean, ffmt) mflops_mean / n_thread
+ write(a_mflops_min, ffmt) mflops_min / n_thread
+ write(a_mflops_max, ffmt) mflops_max / n_thread
+ write(a_total_gflops, ffmt) total_gflops
+ endif
+ endif
+
+ if (my_pe() == 0) then
+ write(unit, tab_fmt) text(i), a_n_call, a_time_mean, &
+ a_mflops_mean, a_mflops_min, a_mflops_max, &
+ a_total_gflops
+ endif
+ enddo
+
+ call end_A(unit, "Timing")
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/traces.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/traces.F90
new file mode 100644
index 0000000000000000000000000000000000000000..7e05fcea473452153b5df5680c455eb8debf3dd5
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/traces.F90
@@ -0,0 +1,166 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2000-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! traces.F90
+!
+! calculates: Tr(inv(M)) psibar psi (pbp)
+! Tr(gamma5 inv(M)) psibar gamma5 psi (p5p)
+! Tr(inv(M+ M)) pion norm (pinorm)
+!
+! traces of a matrix A are calculated with a stochastic estimator:
+!
+! Tr(A) = eta+ A eta (eta: Gaussian noise)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine traces(para, conf, traj, i_ensemble1, i_ensemble2)
+
+ use typedef_hmc
+ use module_function_decl
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ integer, intent(in) :: traj, i_ensemble1, i_ensemble2
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+
+ P_SPINCOL_FIELD, save :: eta_e, eta_o, zeta_e, zeta_o
+
+ character(len=*), parameter :: key_tr = "%tr"
+ integer, save :: count = 0
+ integer :: n_sc_field, size_of_trace
+ integer :: cg_ncall, cg_niter_max, cg_niter_tot
+ REAL :: pinorm
+ REAL :: re_pbp, im_pbp, re_p5p, im_p5p
+ COMPLEX :: pbp, p5p
+ REAL :: res(5)
+
+
+ ALLOCATE_SC_FIELD(eta_e)
+ ALLOCATE_SC_FIELD(eta_o)
+ ALLOCATE_SC_FIELD(zeta_e)
+ ALLOCATE_SC_FIELD(zeta_o)
+
+ count = count + 1
+ n_sc_field = NDIRAC * NCOL * volh
+ size_of_trace = NDIRAC * NCOL * volume
+
+ call ran_gauss_volh(NDIRAC * NCOL, eta_e, HALF, EVEN)
+ call ran_gauss_volh(NDIRAC * NCOL, eta_o, HALF, ODD)
+
+ call init_cg_stat()
+ call solve(para, conf, zeta_e, zeta_o, eta_e, eta_o) ! zeta = inv(M) eta
+
+ pbp = sc_cdotc(eta_e, zeta_e) + sc_cdotc(eta_o, zeta_o)
+
+ pinorm = sc_dot(zeta_e, zeta_e) + sc_dot(zeta_o, zeta_o)
+
+ call gamma5(zeta_e, volh) ! zeta = gamma5 inv(M) eta
+ call gamma5(zeta_o, volh)
+
+ p5p = sc_cdotc(eta_e, zeta_e) + sc_cdotc(eta_o, zeta_o)
+
+ res(1) = Re(pbp)
+ res(2) = Im(pbp)
+ res(3) = Re(p5p)
+ res(4) = Im(p5p)
+ res(5) = pinorm
+
+ call global_sum_vec(5, res)
+
+ re_pbp = res(1) / size_of_trace
+ im_pbp = res(2) / size_of_trace
+ re_p5p = res(3) / size_of_trace
+ im_p5p = res(4) / size_of_trace
+ pinorm = res(5) / size_of_trace
+
+ call get_cg_stat(cg_ncall, cg_niter_max, cg_niter_tot)
+
+ if (my_pe() == 0) then
+ if (count == 1) write(UREC, 400) &
+ "T", key_tr, "traj", "e", "f", &
+ "Re(pbp)", "Im(pbp)", "Re(p5p)", "-Im(p5p)", "PionNorm", "CGiter"
+
+ write(UREC, 410) key_tr, traj, i_ensemble1, i_ensemble2, &
+ re_pbp, im_pbp, re_p5p, -im_p5p, pinorm, cg_niter_max
+ endif
+
+
+400 format (1x, 2a, a6, 2a3, 5a20, a10)
+410 format (1x, a4, i6, 2i3, 5g20.10, i10)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine solve(para, conf, out_e, out_o, in_e, in_o) ! solves: M out = in
+
+ use typedef_hmc
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+ SPINCOL_FIELD, intent(out) :: out_e, out_o
+ SPINCOL_FIELD, intent(in) :: in_e, in_o
+
+ REAL :: a, b
+ integer :: iterations
+ external :: mtdagmt
+
+ b = para%kappa / (ONE + para%h**2)
+
+ call h_mult_c(out_o, -para%h, in_o, volh)
+
+ call d(EVEN, ODD, out_e, out_o, conf%u)
+
+ call sc_xpby(out_e, in_e, b)
+
+ call mtil_dag(out_o, out_e, para, conf)
+
+ call cg(mtdagmt, out_e, out_o, para, conf, iterations)
+
+ call d(ODD, EVEN, out_o, out_e, conf%u)
+
+ a = ONE / (ONE + para%h**2)
+
+ call sc_axpby(out_o, in_o, b, a)
+
+ call h_mult_b(-para%h, out_o, volh)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine gamma5(x, volh)
+
+ implicit none
+ COMPLEX, dimension (NDIRAC, *) :: x
+ integer :: volh
+
+ integer :: i
+ COMPLEX :: x1, x2, x3, x4
+
+ !$omp parallel do private(x1, x2, x3, x4)
+ do i = 1, NCOL * volh
+ x1 = x(1, i)
+ x2 = x(2, i)
+ x3 = x(3, i)
+ x4 = x(4, i)
+
+ x(1, i) = x3
+ x(2, i) = x4
+ x(3, i) = x1
+ x(4, i) = x2
+ enddo
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/types.h b/qcd/part_cpu/applications/QCD/src/kernel_A/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..269565a5c9a46e4a9816db69b467a1b10a2c9d88
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/types.h
@@ -0,0 +1,10 @@
+typedef int INTSTD;
+typedef short INT4;
+typedef long INT8;
+
+typedef double REALSTD;
+typedef float REAL4;
+typedef double REAL8;
+
+typedef struct { REAL4 r, i; } COMPLEX4;
+typedef struct { REAL8 r, i; } COMPLEX8;
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd.c b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd.c
new file mode 100644
index 0000000000000000000000000000000000000000..cdb2bd9ea4020e67a42a03b6aa677fd062ffdb4a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd.c
@@ -0,0 +1,353 @@
+/*
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! uuu_bwd.c - adds backward staple: r = r + a^\dagger b^\dagger c
+!
+!-------------------------------------------------------------------------------
+*/
+
+#include "types.h"
+
+#ifdef NamesToLower_
+# define UUU_BWD uuu_bwd_
+#endif
+
+#ifdef NamesToLower
+# define UUU_BWD uuu_bwd
+#endif
+
+void UUU_BWD(r, a, b, c)
+COMPLEX8 *r, *a, *b, *c;
+{
+ register COMPLEX8 q__1, q__2;
+ register COMPLEX8 t1, t2, t3, x1, x2, x3;
+
+ /* Parameter adjustments */
+ c -= 4;
+ b -= 4;
+ a -= 4;
+ r -= 4;
+
+ /* Function Body */
+ q__1.r = a[4].r * b[4].r - a[4].i * b[4].i,
+ q__1.i = a[4].r * b[4].i + a[4].i * b[4].r;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__1.r = a[4].r * b[5].r - a[4].i * b[5].i,
+ q__1.i = a[4].r * b[5].i + a[4].i * b[5].r;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__1.r = a[4].r * b[6].r - a[4].i * b[6].i,
+ q__1.i = a[4].r * b[6].i + a[4].i * b[6].r;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[5].r * b[7].r - a[5].i * b[7].i,
+ q__2.i = a[5].r * b[7].i + a[5].i * b[7].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = a[5].r * b[8].r - a[5].i * b[8].i,
+ q__2.i = a[5].r * b[8].i + a[5].i * b[8].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = a[5].r * b[9].r - a[5].i * b[9].i,
+ q__2.i = a[5].r * b[9].i + a[5].i * b[9].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[6].r * b[10].r - a[6].i * b[10].i,
+ q__2.i = a[6].r * b[10].i + a[6].i * b[10].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = -q__1.i;
+ q__2.r = a[6].r * b[11].r - a[6].i * b[11].i,
+ q__2.i = a[6].r * b[11].i + a[6].i * b[11].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = -q__1.i;
+ q__2.r = a[6].r * b[12].r - a[6].i * b[12].i,
+ q__2.i = a[6].r * b[12].i + a[6].i * b[12].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = -q__1.i;
+
+ q__2.r = t1.r * c[4].r - t1.i * c[4].i,
+ q__2.i = t1.r * c[4].i + t1.i * c[4].r;
+ q__1.r = r[4].r + q__2.r,
+ q__1.i = r[4].i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t1.r * c[7].r - t1.i * c[7].i,
+ q__2.i = t1.r * c[7].i + t1.i * c[7].r;
+ q__1.r = r[7].r + q__2.r,
+ q__1.i = r[7].i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t1.r * c[10].r - t1.i * c[10].i,
+ q__2.i = t1.r * c[10].i + t1.i * c[10].r;
+ q__1.r = r[10].r + q__2.r,
+ q__1.i = r[10].i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t2.r * c[5].r - t2.i * c[5].i,
+ q__2.i = t2.r * c[5].i + t2.i * c[5].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t2.r * c[8].r - t2.i * c[8].i,
+ q__2.i = t2.r * c[8].i + t2.i * c[8].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t2.r * c[11].r - t2.i * c[11].i,
+ q__2.i = t2.r * c[11].i + t2.i * c[11].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t3.r * c[6].r - t3.i * c[6].i,
+ q__2.i = t3.r * c[6].i + t3.i * c[6].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ r[4].r = q__1.r,
+ r[4].i = q__1.i;
+ q__2.r = t3.r * c[9].r - t3.i * c[9].i,
+ q__2.i = t3.r * c[9].i + t3.i * c[9].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ r[7].r = q__1.r,
+ r[7].i = q__1.i;
+ q__2.r = t3.r * c[12].r - t3.i * c[12].i,
+ q__2.i = t3.r * c[12].i + t3.i * c[12].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ r[10].r = q__1.r,
+ r[10].i = q__1.i;
+
+ q__1.r = a[7].r * b[4].r - a[7].i * b[4].i,
+ q__1.i = a[7].r * b[4].i + a[7].i * b[4].r;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__1.r = a[7].r * b[5].r - a[7].i * b[5].i,
+ q__1.i = a[7].r * b[5].i + a[7].i * b[5].r;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__1.r = a[7].r * b[6].r - a[7].i * b[6].i,
+ q__1.i = a[7].r * b[6].i + a[7].i * b[6].r;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[8].r * b[7].r - a[8].i * b[7].i,
+ q__2.i = a[8].r * b[7].i + a[8].i * b[7].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = a[8].r * b[8].r - a[8].i * b[8].i,
+ q__2.i = a[8].r * b[8].i + a[8].i * b[8].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = a[8].r * b[9].r - a[8].i * b[9].i,
+ q__2.i = a[8].r * b[9].i + a[8].i * b[9].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[9].r * b[10].r - a[9].i * b[10].i,
+ q__2.i = a[9].r * b[10].i + a[9].i * b[10].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = -q__1.i;
+ q__2.r = a[9].r * b[11].r - a[9].i * b[11].i,
+ q__2.i = a[9].r * b[11].i + a[9].i * b[11].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = -q__1.i;
+ q__2.r = a[9].r * b[12].r - a[9].i * b[12].i,
+ q__2.i = a[9].r * b[12].i + a[9].i * b[12].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = -q__1.i;
+
+ q__2.r = t1.r * c[4].r - t1.i * c[4].i,
+ q__2.i = t1.r * c[4].i + t1.i * c[4].r;
+ q__1.r = r[5].r + q__2.r,
+ q__1.i = r[5].i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t1.r * c[7].r - t1.i * c[7].i,
+ q__2.i = t1.r * c[7].i + t1.i * c[7].r;
+ q__1.r = r[8].r + q__2.r,
+ q__1.i = r[8].i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t1.r * c[10].r - t1.i * c[10].i,
+ q__2.i = t1.r * c[10].i + t1.i * c[10].r;
+ q__1.r = r[11].r + q__2.r,
+ q__1.i = r[11].i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t2.r * c[5].r - t2.i * c[5].i,
+ q__2.i = t2.r * c[5].i + t2.i * c[5].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t2.r * c[8].r - t2.i * c[8].i,
+ q__2.i = t2.r * c[8].i + t2.i * c[8].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t2.r * c[11].r - t2.i * c[11].i,
+ q__2.i = t2.r * c[11].i + t2.i * c[11].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t3.r * c[6].r - t3.i * c[6].i,
+ q__2.i = t3.r * c[6].i + t3.i * c[6].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ r[5].r = q__1.r,
+ r[5].i = q__1.i;
+ q__2.r = t3.r * c[9].r - t3.i * c[9].i,
+ q__2.i = t3.r * c[9].i + t3.i * c[9].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ r[8].r = q__1.r,
+ r[8].i = q__1.i;
+ q__2.r = t3.r * c[12].r - t3.i * c[12].i,
+ q__2.i = t3.r * c[12].i + t3.i * c[12].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ r[11].r = q__1.r,
+ r[11].i = q__1.i;
+
+ q__1.r = a[10].r * b[4].r - a[10].i * b[4].i,
+ q__1.i = a[10].r * b[4].i + a[10].i * b[4].r;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__1.r = a[10].r * b[5].r - a[10].i * b[5].i,
+ q__1.i = a[10].r * b[5].i + a[10].i * b[5].r;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__1.r = a[10].r * b[6].r - a[10].i * b[6].i,
+ q__1.i = a[10].r * b[6].i + a[10].i * b[6].r;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[11].r * b[7].r - a[11].i * b[7].i,
+ q__2.i = a[11].r * b[7].i + a[11].i * b[7].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = a[11].r * b[8].r - a[11].i * b[8].i,
+ q__2.i = a[11].r * b[8].i + a[11].i * b[8].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = a[11].r * b[9].r - a[11].i * b[9].i,
+ q__2.i = a[11].r * b[9].i + a[11].i * b[9].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[12].r * b[10].r - a[12].i * b[10].i,
+ q__2.i = a[12].r * b[10].i + a[12].i * b[10].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = -q__1.i;
+ q__2.r = a[12].r * b[11].r - a[12].i * b[11].i,
+ q__2.i = a[12].r * b[11].i + a[12].i * b[11].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = -q__1.i;
+ q__2.r = a[12].r * b[12].r - a[12].i * b[12].i,
+ q__2.i = a[12].r * b[12].i + a[12].i * b[12].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = -q__1.i;
+
+ q__2.r = t1.r * c[4].r - t1.i * c[4].i,
+ q__2.i = t1.r * c[4].i + t1.i * c[4].r;
+ q__1.r = r[6].r + q__2.r,
+ q__1.i = r[6].i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t1.r * c[7].r - t1.i * c[7].i,
+ q__2.i = t1.r * c[7].i + t1.i * c[7].r;
+ q__1.r = r[9].r + q__2.r,
+ q__1.i = r[9].i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t1.r * c[10].r - t1.i * c[10].i,
+ q__2.i = t1.r * c[10].i + t1.i * c[10].r;
+ q__1.r = r[12].r + q__2.r,
+ q__1.i = r[12].i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t2.r * c[5].r - t2.i * c[5].i,
+ q__2.i = t2.r * c[5].i + t2.i * c[5].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t2.r * c[8].r - t2.i * c[8].i,
+ q__2.i = t2.r * c[8].i + t2.i * c[8].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t2.r * c[11].r - t2.i * c[11].i,
+ q__2.i = t2.r * c[11].i + t2.i * c[11].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t3.r * c[6].r - t3.i * c[6].i,
+ q__2.i = t3.r * c[6].i + t3.i * c[6].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ r[6].r = q__1.r,
+ r[6].i = q__1.i;
+ q__2.r = t3.r * c[9].r - t3.i * c[9].i,
+ q__2.i = t3.r * c[9].i + t3.i * c[9].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ r[9].r = q__1.r,
+ r[9].i = q__1.i;
+ q__2.r = t3.r * c[12].r - t3.i * c[12].i,
+ q__2.i = t3.r * c[12].i + t3.i * c[12].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ r[12].r = q__1.r,
+ r[12].i = q__1.i;
+
+ return;
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd_m.c b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd_m.c
new file mode 100644
index 0000000000000000000000000000000000000000..6d50ebce74b3f7d6bc889aad0010d79266909f02
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd_m.c
@@ -0,0 +1,353 @@
+/*
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! uuu_bwd_m.c - subtracts backward staple: r = r - a^\dagger b^\dagger c
+!
+!-------------------------------------------------------------------------------
+*/
+
+#include "types.h"
+
+#ifdef NamesToLower_
+# define UUU_BWD_M uuu_bwd_m_
+#endif
+
+#ifdef NamesToLower
+# define UUU_BWD_M uuu_bwd_m
+#endif
+
+void UUU_BWD_M(r, a, b, c)
+COMPLEX8 *r, *a, *b, *c;
+{
+ register COMPLEX8 q__1, q__2;
+ register COMPLEX8 t1, t2, t3, x1, x2, x3;
+
+ /* Parameter adjustments */
+ c -= 4;
+ b -= 4;
+ a -= 4;
+ r -= 4;
+
+ /* Function Body */
+ q__1.r = -a[4].r * b[4].r + a[4].i * b[4].i,
+ q__1.i = -a[4].r * b[4].i - a[4].i * b[4].r;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__1.r = -a[4].r * b[5].r + a[4].i * b[5].i,
+ q__1.i = -a[4].r * b[5].i - a[4].i * b[5].r;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__1.r = -a[4].r * b[6].r + a[4].i * b[6].i,
+ q__1.i = -a[4].r * b[6].i - a[4].i * b[6].r;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = -a[5].r * b[7].r + a[5].i * b[7].i,
+ q__2.i = -a[5].r * b[7].i - a[5].i * b[7].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = -a[5].r * b[8].r + a[5].i * b[8].i,
+ q__2.i = -a[5].r * b[8].i - a[5].i * b[8].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = -a[5].r * b[9].r + a[5].i * b[9].i,
+ q__2.i = -a[5].r * b[9].i - a[5].i * b[9].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = -a[6].r * b[10].r + a[6].i * b[10].i,
+ q__2.i = -a[6].r * b[10].i - a[6].i * b[10].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = -q__1.i;
+ q__2.r = -a[6].r * b[11].r + a[6].i * b[11].i,
+ q__2.i = -a[6].r * b[11].i - a[6].i * b[11].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = -q__1.i;
+ q__2.r = -a[6].r * b[12].r + a[6].i * b[12].i,
+ q__2.i = -a[6].r * b[12].i - a[6].i * b[12].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = -q__1.i;
+
+ q__2.r = t1.r * c[4].r - t1.i * c[4].i,
+ q__2.i = t1.r * c[4].i + t1.i * c[4].r;
+ q__1.r = r[4].r + q__2.r,
+ q__1.i = r[4].i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t1.r * c[7].r - t1.i * c[7].i,
+ q__2.i = t1.r * c[7].i + t1.i * c[7].r;
+ q__1.r = r[7].r + q__2.r,
+ q__1.i = r[7].i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t1.r * c[10].r - t1.i * c[10].i,
+ q__2.i = t1.r * c[10].i + t1.i * c[10].r;
+ q__1.r = r[10].r + q__2.r,
+ q__1.i = r[10].i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t2.r * c[5].r - t2.i * c[5].i,
+ q__2.i = t2.r * c[5].i + t2.i * c[5].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t2.r * c[8].r - t2.i * c[8].i,
+ q__2.i = t2.r * c[8].i + t2.i * c[8].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t2.r * c[11].r - t2.i * c[11].i,
+ q__2.i = t2.r * c[11].i + t2.i * c[11].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t3.r * c[6].r - t3.i * c[6].i,
+ q__2.i = t3.r * c[6].i + t3.i * c[6].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ r[4].r = q__1.r,
+ r[4].i = q__1.i;
+ q__2.r = t3.r * c[9].r - t3.i * c[9].i,
+ q__2.i = t3.r * c[9].i + t3.i * c[9].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ r[7].r = q__1.r,
+ r[7].i = q__1.i;
+ q__2.r = t3.r * c[12].r - t3.i * c[12].i,
+ q__2.i = t3.r * c[12].i + t3.i * c[12].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ r[10].r = q__1.r,
+ r[10].i = q__1.i;
+
+ q__1.r = -a[7].r * b[4].r + a[7].i * b[4].i,
+ q__1.i = -a[7].r * b[4].i - a[7].i * b[4].r;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__1.r = -a[7].r * b[5].r + a[7].i * b[5].i,
+ q__1.i = -a[7].r * b[5].i - a[7].i * b[5].r;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__1.r = -a[7].r * b[6].r + a[7].i * b[6].i,
+ q__1.i = -a[7].r * b[6].i - a[7].i * b[6].r;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = -a[8].r * b[7].r + a[8].i * b[7].i,
+ q__2.i = -a[8].r * b[7].i - a[8].i * b[7].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = -a[8].r * b[8].r + a[8].i * b[8].i,
+ q__2.i = -a[8].r * b[8].i - a[8].i * b[8].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = -a[8].r * b[9].r + a[8].i * b[9].i,
+ q__2.i = -a[8].r * b[9].i - a[8].i * b[9].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = -a[9].r * b[10].r + a[9].i * b[10].i,
+ q__2.i = -a[9].r * b[10].i - a[9].i * b[10].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = -q__1.i;
+ q__2.r = -a[9].r * b[11].r + a[9].i * b[11].i,
+ q__2.i = -a[9].r * b[11].i - a[9].i * b[11].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = -q__1.i;
+ q__2.r = -a[9].r * b[12].r + a[9].i * b[12].i,
+ q__2.i = -a[9].r * b[12].i - a[9].i * b[12].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = -q__1.i;
+
+ q__2.r = t1.r * c[4].r - t1.i * c[4].i,
+ q__2.i = t1.r * c[4].i + t1.i * c[4].r;
+ q__1.r = r[5].r + q__2.r,
+ q__1.i = r[5].i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t1.r * c[7].r - t1.i * c[7].i,
+ q__2.i = t1.r * c[7].i + t1.i * c[7].r;
+ q__1.r = r[8].r + q__2.r,
+ q__1.i = r[8].i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t1.r * c[10].r - t1.i * c[10].i,
+ q__2.i = t1.r * c[10].i + t1.i * c[10].r;
+ q__1.r = r[11].r + q__2.r,
+ q__1.i = r[11].i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t2.r * c[5].r - t2.i * c[5].i,
+ q__2.i = t2.r * c[5].i + t2.i * c[5].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t2.r * c[8].r - t2.i * c[8].i,
+ q__2.i = t2.r * c[8].i + t2.i * c[8].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t2.r * c[11].r - t2.i * c[11].i,
+ q__2.i = t2.r * c[11].i + t2.i * c[11].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t3.r * c[6].r - t3.i * c[6].i,
+ q__2.i = t3.r * c[6].i + t3.i * c[6].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ r[5].r = q__1.r,
+ r[5].i = q__1.i;
+ q__2.r = t3.r * c[9].r - t3.i * c[9].i,
+ q__2.i = t3.r * c[9].i + t3.i * c[9].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ r[8].r = q__1.r,
+ r[8].i = q__1.i;
+ q__2.r = t3.r * c[12].r - t3.i * c[12].i,
+ q__2.i = t3.r * c[12].i + t3.i * c[12].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ r[11].r = q__1.r,
+ r[11].i = q__1.i;
+
+ q__1.r = -a[10].r * b[4].r + a[10].i * b[4].i,
+ q__1.i = -a[10].r * b[4].i - a[10].i * b[4].r;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__1.r = -a[10].r * b[5].r + a[10].i * b[5].i,
+ q__1.i = -a[10].r * b[5].i - a[10].i * b[5].r;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__1.r = -a[10].r * b[6].r + a[10].i * b[6].i,
+ q__1.i = -a[10].r * b[6].i - a[10].i * b[6].r;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = -a[11].r * b[7].r + a[11].i * b[7].i,
+ q__2.i = -a[11].r * b[7].i - a[11].i * b[7].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = -a[11].r * b[8].r + a[11].i * b[8].i,
+ q__2.i = -a[11].r * b[8].i - a[11].i * b[8].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = -a[11].r * b[9].r + a[11].i * b[9].i,
+ q__2.i = -a[11].r * b[9].i - a[11].i * b[9].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = -a[12].r * b[10].r + a[12].i * b[10].i,
+ q__2.i = -a[12].r * b[10].i - a[12].i * b[10].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = -q__1.i;
+ q__2.r = -a[12].r * b[11].r + a[12].i * b[11].i,
+ q__2.i = -a[12].r * b[11].i - a[12].i * b[11].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = -q__1.i;
+ q__2.r = -a[12].r * b[12].r + a[12].i * b[12].i,
+ q__2.i = -a[12].r * b[12].i - a[12].i * b[12].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = -q__1.i;
+
+ q__2.r = t1.r * c[4].r - t1.i * c[4].i,
+ q__2.i = t1.r * c[4].i + t1.i * c[4].r;
+ q__1.r = r[6].r + q__2.r,
+ q__1.i = r[6].i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t1.r * c[7].r - t1.i * c[7].i,
+ q__2.i = t1.r * c[7].i + t1.i * c[7].r;
+ q__1.r = r[9].r + q__2.r,
+ q__1.i = r[9].i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t1.r * c[10].r - t1.i * c[10].i,
+ q__2.i = t1.r * c[10].i + t1.i * c[10].r;
+ q__1.r = r[12].r + q__2.r,
+ q__1.i = r[12].i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t2.r * c[5].r - t2.i * c[5].i,
+ q__2.i = t2.r * c[5].i + t2.i * c[5].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t2.r * c[8].r - t2.i * c[8].i,
+ q__2.i = t2.r * c[8].i + t2.i * c[8].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t2.r * c[11].r - t2.i * c[11].i,
+ q__2.i = t2.r * c[11].i + t2.i * c[11].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t3.r * c[6].r - t3.i * c[6].i,
+ q__2.i = t3.r * c[6].i + t3.i * c[6].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ r[6].r = q__1.r,
+ r[6].i = q__1.i;
+ q__2.r = t3.r * c[9].r - t3.i * c[9].i,
+ q__2.i = t3.r * c[9].i + t3.i * c[9].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ r[9].r = q__1.r,
+ r[9].i = q__1.i;
+ q__2.r = t3.r * c[12].r - t3.i * c[12].i,
+ q__2.i = t3.r * c[12].i + t3.i * c[12].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ r[12].r = q__1.r,
+ r[12].i = q__1.i;
+
+ return;
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_f90.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_f90.F90
new file mode 100644
index 0000000000000000000000000000000000000000..53ca89df005fe7caa5aab26fc99f99964534a967
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_f90.F90
@@ -0,0 +1,73 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! uuu_f90.F90 - Fortran loops for (U * U * U)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine uuu_bwd(r, a, b, c) ! adds backward staple:
+ ! r = r + a^\dagger b^\dagger c
+ implicit none
+ SU3 :: r, a, b, c
+ integer :: i, j, k, m
+
+ do i = 1, NCOL
+ do j = 1, NCOL
+ do k = 1, NCOL
+ do m = 1, NCOL
+ r(i,j) = r(i,j) + conjg(a(k,i)) * conjg(b(m,k)) * c(m,j)
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine uuu_bwd_m(r, a, b, c) ! subtracts backward staple:
+ ! r = r - a^\dagger b^\dagger c
+ implicit none
+ SU3 :: r, a, b, c
+ integer :: i, j, k, m
+
+ do i = 1, NCOL
+ do j = 1, NCOL
+ do k = 1, NCOL
+ do m = 1, NCOL
+ r(i,j) = r(i,j) - conjg(a(k,i)) * conjg(b(m,k)) * c(m,j)
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine uuu_fwd(r, a, b, c) ! adds forward staple:
+ ! r = r + a b^\dagger c^\dagger
+ implicit none
+ SU3 :: r, a, b, c
+ integer :: i, j, k, m
+
+ do i = 1, NCOL
+ do j = 1, NCOL
+ do k = 1, NCOL
+ do m = 1, NCOL
+ r(i,j) = r(i,j) + a(i,k) * conjg(b(m,k)) * conjg(c(j,m))
+ enddo
+ enddo
+ enddo
+ enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_fwd.c b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_fwd.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0b544526f0e410f19639d6873d0b6ed4b9dcf08
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_fwd.c
@@ -0,0 +1,353 @@
+/*
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! uuu_fwd.c - adds forward staple: r = r + a b^\dagger c^\dagger
+!
+!-------------------------------------------------------------------------------
+*/
+
+#include "types.h"
+
+#ifdef NamesToLower_
+# define UUU_FWD uuu_fwd_
+#endif
+
+#ifdef NamesToLower
+# define UUU_FWD uuu_fwd
+#endif
+
+void UUU_FWD(r, a, b, c)
+COMPLEX8 *r, *a, *b, *c;
+{
+ register COMPLEX8 q__1, q__2;
+ register COMPLEX8 t1, t2, t3, x1, x2, x3;
+
+ /* Parameter adjustments */
+ c -= 4;
+ b -= 4;
+ a -= 4;
+ r -= 4;
+
+ /* Function Body */
+ q__1.r = a[4].r * b[4].r + a[4].i * b[4].i,
+ q__1.i = -a[4].r * b[4].i + a[4].i * b[4].r;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__1.r = a[4].r * b[5].r + a[4].i * b[5].i,
+ q__1.i = -a[4].r * b[5].i + a[4].i * b[5].r;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__1.r = a[4].r * b[6].r + a[4].i * b[6].i,
+ q__1.i = -a[4].r * b[6].i + a[4].i * b[6].r;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[7].r * b[7].r + a[7].i * b[7].i,
+ q__2.i = -a[7].r * b[7].i + a[7].i * b[7].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = a[7].r * b[8].r + a[7].i * b[8].i,
+ q__2.i = -a[7].r * b[8].i + a[7].i * b[8].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = a[7].r * b[9].r + a[7].i * b[9].i,
+ q__2.i = -a[7].r * b[9].i + a[7].i * b[9].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[10].r * b[10].r + a[10].i * b[10].i,
+ q__2.i = -a[10].r * b[10].i + a[10].i * b[10].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = a[10].r * b[11].r + a[10].i * b[11].i,
+ q__2.i = -a[10].r * b[11].i + a[10].i * b[11].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = a[10].r * b[12].r + a[10].i * b[12].i,
+ q__2.i = -a[10].r * b[12].i + a[10].i * b[12].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+
+ q__2.r = t1.r * c[4].r + t1.i * c[4].i,
+ q__2.i = -t1.r * c[4].i + t1.i * c[4].r;
+ q__1.r = r[4].r + q__2.r,
+ q__1.i = r[4].i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t1.r * c[5].r + t1.i * c[5].i,
+ q__2.i = -t1.r * c[5].i + t1.i * c[5].r;
+ q__1.r = r[7].r + q__2.r,
+ q__1.i = r[7].i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t1.r * c[6].r + t1.i * c[6].i,
+ q__2.i = -t1.r * c[6].i + t1.i * c[6].r;
+ q__1.r = r[10].r + q__2.r,
+ q__1.i = r[10].i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t2.r * c[7].r + t2.i * c[7].i,
+ q__2.i = -t2.r * c[7].i + t2.i * c[7].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t2.r * c[8].r + t2.i * c[8].i,
+ q__2.i = -t2.r * c[8].i + t2.i * c[8].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t2.r * c[9].r + t2.i * c[9].i,
+ q__2.i = -t2.r * c[9].i + t2.i * c[9].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t3.r * c[10].r + t3.i * c[10].i,
+ q__2.i = -t3.r * c[10].i + t3.i * c[10].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ r[4].r = q__1.r,
+ r[4].i = q__1.i;
+ q__2.r = t3.r * c[11].r + t3.i * c[11].i,
+ q__2.i = -t3.r * c[11].i + t3.i * c[11].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ r[7].r = q__1.r,
+ r[7].i = q__1.i;
+ q__2.r = t3.r * c[12].r + t3.i * c[12].i,
+ q__2.i = -t3.r * c[12].i + t3.i * c[12].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ r[10].r = q__1.r,
+ r[10].i = q__1.i;
+
+ q__1.r = a[5].r * b[4].r + a[5].i * b[4].i,
+ q__1.i = -a[5].r * b[4].i + a[5].i * b[4].r;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__1.r = a[5].r * b[5].r + a[5].i * b[5].i,
+ q__1.i = -a[5].r * b[5].i + a[5].i * b[5].r;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__1.r = a[5].r * b[6].r + a[5].i * b[6].i,
+ q__1.i = -a[5].r * b[6].i + a[5].i * b[6].r;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[8].r * b[7].r + a[8].i * b[7].i,
+ q__2.i = -a[8].r * b[7].i + a[8].i * b[7].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = a[8].r * b[8].r + a[8].i * b[8].i,
+ q__2.i = -a[8].r * b[8].i + a[8].i * b[8].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = a[8].r * b[9].r + a[8].i * b[9].i,
+ q__2.i = -a[8].r * b[9].i + a[8].i * b[9].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[11].r * b[10].r + a[11].i * b[10].i,
+ q__2.i = -a[11].r * b[10].i + a[11].i * b[10].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = a[11].r * b[11].r + a[11].i * b[11].i,
+ q__2.i = -a[11].r * b[11].i + a[11].i * b[11].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = a[11].r * b[12].r + a[11].i * b[12].i,
+ q__2.i = -a[11].r * b[12].i + a[11].i * b[12].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+
+ q__2.r = t1.r * c[4].r + t1.i * c[4].i,
+ q__2.i = -t1.r * c[4].i + t1.i * c[4].r;
+ q__1.r = r[5].r + q__2.r,
+ q__1.i = r[5].i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t1.r * c[5].r + t1.i * c[5].i,
+ q__2.i = -t1.r * c[5].i + t1.i * c[5].r;
+ q__1.r = r[8].r + q__2.r,
+ q__1.i = r[8].i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t1.r * c[6].r + t1.i * c[6].i,
+ q__2.i = -t1.r * c[6].i + t1.i * c[6].r;
+ q__1.r = r[11].r + q__2.r,
+ q__1.i = r[11].i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t2.r * c[7].r + t2.i * c[7].i,
+ q__2.i = -t2.r * c[7].i + t2.i * c[7].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t2.r * c[8].r + t2.i * c[8].i,
+ q__2.i = -t2.r * c[8].i + t2.i * c[8].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t2.r * c[9].r + t2.i * c[9].i,
+ q__2.i = -t2.r * c[9].i + t2.i * c[9].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t3.r * c[10].r + t3.i * c[10].i,
+ q__2.i = -t3.r * c[10].i + t3.i * c[10].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ r[5].r = q__1.r,
+ r[5].i = q__1.i;
+ q__2.r = t3.r * c[11].r + t3.i * c[11].i,
+ q__2.i = -t3.r * c[11].i + t3.i * c[11].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ r[8].r = q__1.r,
+ r[8].i = q__1.i;
+ q__2.r = t3.r * c[12].r + t3.i * c[12].i,
+ q__2.i = -t3.r * c[12].i + t3.i * c[12].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ r[11].r = q__1.r,
+ r[11].i = q__1.i;
+
+ q__1.r = a[6].r * b[4].r + a[6].i * b[4].i,
+ q__1.i = -a[6].r * b[4].i + a[6].i * b[4].r;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__1.r = a[6].r * b[5].r + a[6].i * b[5].i,
+ q__1.i = -a[6].r * b[5].i + a[6].i * b[5].r;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__1.r = a[6].r * b[6].r + a[6].i * b[6].i,
+ q__1.i = -a[6].r * b[6].i + a[6].i * b[6].r;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[9].r * b[7].r + a[9].i * b[7].i,
+ q__2.i = -a[9].r * b[7].i + a[9].i * b[7].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = a[9].r * b[8].r + a[9].i * b[8].i,
+ q__2.i = -a[9].r * b[8].i + a[9].i * b[8].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = a[9].r * b[9].r + a[9].i * b[9].i,
+ q__2.i = -a[9].r * b[9].i + a[9].i * b[9].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+ q__2.r = a[12].r * b[10].r + a[12].i * b[10].i,
+ q__2.i = -a[12].r * b[10].i + a[12].i * b[10].r;
+ q__1.r = t1.r + q__2.r,
+ q__1.i = t1.i + q__2.i;
+ t1.r = q__1.r,
+ t1.i = q__1.i;
+ q__2.r = a[12].r * b[11].r + a[12].i * b[11].i,
+ q__2.i = -a[12].r * b[11].i + a[12].i * b[11].r;
+ q__1.r = t2.r + q__2.r,
+ q__1.i = t2.i + q__2.i;
+ t2.r = q__1.r,
+ t2.i = q__1.i;
+ q__2.r = a[12].r * b[12].r + a[12].i * b[12].i,
+ q__2.i = -a[12].r * b[12].i + a[12].i * b[12].r;
+ q__1.r = t3.r + q__2.r,
+ q__1.i = t3.i + q__2.i;
+ t3.r = q__1.r,
+ t3.i = q__1.i;
+
+ q__2.r = t1.r * c[4].r + t1.i * c[4].i,
+ q__2.i = -t1.r * c[4].i + t1.i * c[4].r;
+ q__1.r = r[6].r + q__2.r,
+ q__1.i = r[6].i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t1.r * c[5].r + t1.i * c[5].i,
+ q__2.i = -t1.r * c[5].i + t1.i * c[5].r;
+ q__1.r = r[9].r + q__2.r,
+ q__1.i = r[9].i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t1.r * c[6].r + t1.i * c[6].i,
+ q__2.i = -t1.r * c[6].i + t1.i * c[6].r;
+ q__1.r = r[12].r + q__2.r,
+ q__1.i = r[12].i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t2.r * c[7].r + t2.i * c[7].i,
+ q__2.i = -t2.r * c[7].i + t2.i * c[7].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ x1.r = q__1.r,
+ x1.i = q__1.i;
+ q__2.r = t2.r * c[8].r + t2.i * c[8].i,
+ q__2.i = -t2.r * c[8].i + t2.i * c[8].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ x2.r = q__1.r,
+ x2.i = q__1.i;
+ q__2.r = t2.r * c[9].r + t2.i * c[9].i,
+ q__2.i = -t2.r * c[9].i + t2.i * c[9].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ x3.r = q__1.r,
+ x3.i = q__1.i;
+ q__2.r = t3.r * c[10].r + t3.i * c[10].i,
+ q__2.i = -t3.r * c[10].i + t3.i * c[10].r;
+ q__1.r = x1.r + q__2.r,
+ q__1.i = x1.i + q__2.i;
+ r[6].r = q__1.r,
+ r[6].i = q__1.i;
+ q__2.r = t3.r * c[11].r + t3.i * c[11].i,
+ q__2.i = -t3.r * c[11].i + t3.i * c[11].r;
+ q__1.r = x2.r + q__2.r,
+ q__1.i = x2.i + q__2.i;
+ r[9].r = q__1.r,
+ r[9].i = q__1.i;
+ q__2.r = t3.r * c[12].r + t3.i * c[12].i,
+ q__2.i = -t3.r * c[12].i + t3.i * c[12].r;
+ q__1.r = x3.r + q__2.r,
+ q__1.i = x3.i + q__2.i;
+ r[12].r = q__1.r,
+ r[12].i = q__1.i;
+
+ return;
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/w_mult.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/w_mult.F90
new file mode 100644
index 0000000000000000000000000000000000000000..1fafce298e018acbddfdb050ec791601a411a9d5
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/w_mult.F90
@@ -0,0 +1,78 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! w_mult.F90 - W := M~ + rho (Hasenbusch improvement)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine w_mult(out, in, para, conf) ! out = W in
+
+ use typedef_hmc
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+
+ SPINCOL_FIELD, intent(out) :: out
+ SPINCOL_FIELD, intent(in) :: in
+
+ call mtil(out, in, para, conf)
+
+ call sc_axpy(out, in, para%rho)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine w_mult_dag(out, in, para, conf) ! out = W+ in
+
+ use typedef_hmc
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+
+ SPINCOL_FIELD, intent(out) :: out
+ SPINCOL_FIELD, intent(in) :: in
+
+ call mtil_dag(out, in, para, conf)
+
+ call sc_axpy(out, in, para%rho)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine w_dagger_w(out, in, para, conf) ! out = (W+ W) in
+
+ use typedef_hmc
+ use module_p_interface
+ use module_vol
+ implicit none
+
+ type(hmc_para), intent(in) :: para
+ type(hmc_conf), intent(in) :: conf
+
+ SPINCOL_FIELD :: out, in
+ P_SPINCOL_FIELD, save :: tmp
+
+ TIMING_START(timing_bin_mtdagmt)
+
+ ALLOCATE_SC_FIELD(tmp)
+
+ call w_mult(tmp, in, para, conf)
+ call w_mult_dag(out, tmp, para, conf)
+
+ TIMING_STOP(timing_bin_mtdagmt)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/xyzt2i.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/xyzt2i.F90
new file mode 100644
index 0000000000000000000000000000000000000000..d87af69ff7a72b83981bed4df13362b5b8c64aa4
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/xyzt2i.F90
@@ -0,0 +1,97 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! xyzt2i.F90 - maps local coordinates (x,y,z,t) to even/odd index
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+integer function xyzt2i(x_in)
+
+ ! x_in := (x,y,z,t)
+ ! -1 <= x(mu) <= N(mu) ; mu = 1,2,3,4
+ ! xyzt2i >= 1
+
+ use module_function_decl
+ use module_lattice
+ use module_offset
+ implicit none
+
+ integer, dimension (DIM), intent(in) :: x_in
+ integer, dimension (DIM) :: dir, i, m, x
+ integer :: count, mu
+ integer, external :: ieo, n_sites, i_periodic, ilex
+
+
+ count = 0
+ do mu = 1, DIM
+
+ x(mu) = x_in(mu)
+
+ if (x(mu) < -1 .or. x(mu) > N(mu)) then
+ call die('xyzt2i(): x(mu) out of range')
+ endif
+
+ if (NPE(mu) == 1) x(mu) = i_periodic(x(mu), N(mu))
+
+ if (x(mu) == -1) then
+ dir(mu) = -1
+ elseif (x(mu) == N(mu)) then
+ dir(mu) = 1
+ else
+ dir(mu) = 0
+ count = count + 1
+ endif
+
+ if (dir(mu) /= 0) then
+ i(mu) = 0
+ m(mu) = 1
+ else
+ i(mu) = x(mu)
+ m(mu) = N(mu)
+ endif
+ enddo
+
+ if (count == DIM) then
+ xyzt2i = offset(0,0,0,0) + ieo(DIM, x, N) + 1
+ else
+ ASSERT(num_pes() /= 1)
+
+ if (dir(1) /= 0) then
+ !!ASSERT(n_sites(DIM, dir, N, NPE) == n_sites(DIM, dir, NH, NPE))
+ !!ASSERT(ilex(DIM, i, m) <= n_sites(DIM, dir, NH, NPE))
+ xyzt2i = offset(dir(1),dir(2),dir(3),dir(4)) + ilex(DIM, i, m) + 1
+ else
+ xyzt2i = offset(dir(1),dir(2),dir(3),dir(4)) + ieo(DIM, i, m) + 1
+ endif
+ endif
+
+end
+
+!-------------------------------------------------------------------------------
+integer function std_xyzt2i(x)
+
+ use module_lattice
+ implicit none
+
+ integer, dimension (DIM), intent(in) :: x
+ integer, dimension (DIM) :: x_act
+ integer, external :: xyzt2i
+
+ x_act(1) = x(gamma_index(1))
+ x_act(2) = x(gamma_index(2))
+ x_act(3) = x(gamma_index(3))
+ x_act(4) = x(gamma_index(4))
+
+ std_xyzt2i = xyzt2i(x_act)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..31b2207cc57537d5ff8f443a772d78649b7167ed
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile
@@ -0,0 +1,10 @@
+include Makefile.defs
+
+#MODULE_INIT#
+#MODULE_CMD# #MODULE_FILES#
+
+kernel:
+ cd libraries && $(MAKE) all
+ cd su3h_n && $(MAKE) kernel-objects
+ $(AR) $(ARFLAGS) ../kernel_B.a libraries/*.o su3h_n/*.o
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile.defs.in
new file mode 100644
index 0000000000000000000000000000000000000000..3a5a290170bef0e2a21cc3daa385544ec63bbdab
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile.defs.in
@@ -0,0 +1,13 @@
+MAKE = #MAKE#
+
+RM = #RM#
+
+AR = #AR#
+ARFLAGS = #ARFLAGS#
+
+CC = #CC#
+CFLAGS = #CFLAGS#
+
+MPI_CC = #MPI_CC#
+
+LDFLAGS = #LDFLAGS#
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..0b41dc11a810a6d13720e9cbef1b7e298dc700e7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/Makefile
@@ -0,0 +1,12 @@
+OBJ = calclist.o dblarr.o halt.o io_unformat.o jacobi.o
+
+.PHONY : clean
+
+aa : $(OBJ)
+ $(CC) aa.c $(OBJ) -o aa -lm
+
+%.o : %.c
+ $(CC) -c $< -o $@
+
+clean :
+ rm -f *.o
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/aa.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/aa.c
new file mode 100644
index 0000000000000000000000000000000000000000..f92c87803c6e540f8ddd1aaf7ba110d59a359a9a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/aa.c
@@ -0,0 +1,1283 @@
+/*************************************************************************
+ *
+ * This c-program analyzes autocorrelations from
+ * histogram datafiles
+ *
+ * aa [opt] 'name'
+ * input files:
+ * histogram file
+ * output to standard-output
+ *
+ * COMPILE:
+ * cc -o aa aa.c io_unformat.c calclist.c dblarr.c jacobi.c halt.c -lm
+ *
+ *************************************************************************/
+
+
+#include
+#include
+#include
+#include
+#include
+/* #include */
+#include
+
+#include "stuff.h"
+
+/* #include */
+
+/* #define max(x,y) (((x) > (y)) ? (x) : (y)) */
+
+/* prototypes...
+ */
+
+
+#define MAX_DATA 10000
+
+#define PI 3.14159265358979
+#define pi2 (PI*2.0)
+
+double * dblalloc(int num);
+void halt(char *message,char *msg);
+int ccorr_matrix(double *dat[],int n,int length,int jack,
+ double *wd,int weight,double wnorm);
+int history(int nv, double a[],int showlevel,double level_val,
+ double wd[],int weight);
+int errorcalc(int nconf,double svect[],double *avep,double *sigp,
+ double *fsp,double *fser);
+int autocorr(double *d,int nd,double *res,int nres);
+int histgr2(double *a,double *b,int ndata,int bins,double *wd,int weight,
+ double low1,double high1,double low2,double high2);
+double tunnelcalc(int nd,double *d,double tmin,double tmax);
+
+int noauto = 0;
+
+char usage[] = " usage: aa [-opt] hist-file\n\
+ q : quiet\n\
+ j nblocks : jackknife data\n\
+ J nblocks : print jackknife blocks\n\
+ A : print all measurements as jackknife blocks\n\
+ i iters : use only i iterations\n\
+ s iters : skip iteration\n\
+ S n : use only every n iteration\n\
+ d num : use only data num\n\
+ D #2+#3,#4 : collect data and do the arithmetic [+-*/]\n\
+ R r1:r2 : print data from r1 to r2\n\
+ f num=val : filter data with vector num = value\n\
+ l[L] a:b : data range from a to b [element 2]\n\
+ X : print just the data indicated\n\
+ C : cross-correlation of data\n\
+ E : eigenvalue analysis of cross-correlation matrix\n\
+ c num : correlation number num\n\
+ cl length : force vector length\n\
+ cc/cd num : print 2-dim. array number num\n\
+ cos : cosine transform the data (only with -j, -A)\n\
+ a : sum -D or -c into one data element\n\
+ x d,i,num : datavector element d[index[num]], num optional\n\
+ K n : take modulo of index\n\
+ h : print histogram plot\n\
+ v value : show value in the histogram\n\
+ V : divide value by volume\n\
+ Q : multiply value by sqrt of volume\n\
+ w : use the weight factor (elem 1)\n\
+ 0 : subtract the 0-moment (average)\n\
+ 2 : print second moment\n\
+ p power : print moment power\n\
+ F bins : print histogram in weight format\n\
+ r block : do the 'running blocking'\n\
+ b block : do the 'normal blocking'\n\
+ m/M value : analyze only up to minimum/maximum value\n\
+ O : list measurements up to minimum/maximum value\n\
+ P : lag the list by one unit\n\
+ g [bins] : print histogram to the standard output, with bins bins\n\
+ B [binw] : print histogram, with binwidth [1]\n\
+ H : print history to the standard output\n\
+ G bins : 2-component histogram\n\
+ W b_old,b_new,\'act_string\' : reweight to new beta value (up to 10 times)\n\
+ Y bg,mU_old,mU_new,T_old,T_new : reweight to new T value (susy)\n\
+ Z mH_old,mH_new,b_old,b_new : reweight to new mH, betaH -values (su2-Higgs)\n\
+ y tmin:tmax : print tunneling time\n\
+ t : no autocorrelations\n\
+ T length : print autocorrelation function for distance length\n";
+
+main(argc,argv)
+ int argc;
+ char * argv[];
+
+{
+ double *fp,*dat[MAX_DATA],*tmparr,*wd;
+ double *tmpx;
+ int block,indexed,iv,dv,index;
+ int nblocking,length,jack,sum;
+ double temps,minval,maxval,tmin,tmax;
+ char * ss,*lists[MAX_DATA];
+ int raw,eig;
+ int idata,nd;
+ int i,j,jj,k,iters,hist,printhist;
+ long lk;
+ int gram,rblock,bblock,skip,weight;
+ double hbin,error,naive,aveg,average[MAX_DATA],timerel,level_val;
+ int list,limits,lagged,showlevel,stagger,mom2,ccorr;
+ int vol,volume,wgram,irange1,irange2,icorr,printjack,index_mod;
+ int four,h2gram,reweight,susy,higgs,isarray,tunnel;
+ int quiet,sqrtV,icorrlen;
+ int filter;
+ double filtervalue;
+ double betar,wnorm,vnorm;
+ double mU,mUn,mH,mH2,T,Tn;
+ int autocorrelation = 0,sub_ave = 0;
+ double power = 0,beta1[10],beta2[10];
+ char rwstring[10][100];
+ FILE *ff;
+ e_header h;
+ double low1,high1,low2,high2;
+ double susy_mul,betag;
+
+ hbin = 0.0;
+ block = nblocking = iters = hist = printhist = gram = idata =
+ rblock = bblock = jack = list = limits = stagger = mom2 = quiet = 0;
+ lagged = showlevel = weight = volume = wgram = 0;
+ irange1 = irange2 = icorr = icorrlen = 0;
+ ccorr = sqrtV = sum = four = h2gram = 0;
+ printjack = indexed = index_mod = reweight = 0;
+ susy = higgs = raw = eig = isarray = 0;
+ filter = tunnel = 0;
+ nd = 0;
+ minval = -1e-60;
+ maxval = 1e60;
+
+ low1=low2=high1=high2=0;
+
+ skip = 0;
+ if (argc <= 1) halt(usage,NULL);
+
+ while (--argc > 0 && (*++argv)[0] == '-') {
+ ss = argv[0] + 1;
+
+ while (*ss) {
+ switch(*ss++) {
+
+ case 'q': quiet = 1; break;
+ case 'O': list = 1; break;
+ case 'P': lagged = 1; break;
+ case 't': noauto = 1; break;
+ case 'T': getnum("%d",&autocorrelation); break;
+ case 'y': tunnel = 1; get2num("%lg:%lg",&tmin,&tmax); break;
+ case 'w': weight = 1; break;
+ case '0': sub_ave = 1; break;
+ case '2': mom2 = 1; break;
+ case 'a': sum = 1; break;
+ case 'p': getnum("%lg",&power); break;
+ case 'F': wgram = weight = volume = 1; idata = 4;
+ getnum("%d",&gram); break;
+ case 'V': volume = 1; break;
+ case 'Q': sqrtV = 1; break;
+ case 'c':
+ if (strcmp(ss,"os") == 0) { four = 1; ss+=2; }
+ else if (*ss == 'l') { ss++; getnum("%d",&icorrlen); }
+ else {
+ if (*ss == 'c') { isarray=1; ss++; }
+ if (*ss == 'd') { isarray=2; ss++; }
+ getnum("%d",&icorr);
+ }
+ break;
+ case 'm': limits = 1; getnum("%lg",&minval); break;
+ case 'M': limits = 1; getnum("%lg",&maxval); break;
+ case 'i': getnum("%d",&iters); break;
+ case 's': getnum("%d",&skip); break;
+ case 'S': getnum("%d",&stagger); break;
+ case 'r': getnum("%d",&rblock); break;
+ case 'b': getnum("%d",&bblock); break;
+ case 'h': hist = 1; break;
+ case 'H': printhist = 1; break;
+ case 'v': showlevel = 1; getnum("%lg",&level_val); break;
+ case 'j': getnum("%d",&jack); break;
+ case 'J': printjack = 1; getnum("%d",&jack); break;
+ case 'A': printjack = 1; jack = -1; break;
+ case 'W':
+ /* take away white space */
+ { char *p,*q;
+ p = q = ss;
+ while (*p) {
+ if (*p != ' ' && *p != '\t') *(q++) = *p;
+ p++;
+ }
+ *q = 0;
+ }
+ get3num("%lg,%lg,%s",&beta1[reweight],&beta2[reweight],rwstring[reweight]);
+ beta2[reweight] -= beta1[reweight];
+ reweight++;
+ break;
+ case 'Y': reweight = susy = 1;
+ get5num("%lg,%lg,%lg,%lg,%lg",&betag,&mU,&mUn,&T,&Tn);
+ break;
+ case 'Z': reweight = higgs = 1;
+ get4num("%lg,%lg,%lg,%lg",&mH,&mH2,&beta1[0],&beta2[0]);
+ break;
+
+ case 'd': getnum("%d",&idata); nd = 1; break;
+ case 'D': getlist(lists,nd); break;
+
+ case 'f':
+ get2num("%d=%lg",&filter,&filtervalue);
+ break;
+
+ case 'X': raw = 1; break;
+ case 'E': eig = 1; break;
+
+ case 'R':
+ if (!(*ss) && --argc) ss = (++argv)[0];
+ if (sscanf(ss,"%d:%d",&irange1,&irange2) != 2) halt(usage,NULL);
+ ss = strchr(ss,0);
+ break;
+
+ case 'x':
+ indexed = 1;
+ if (!(*ss) && --argc) ss = (++argv)[0];
+ if ((i = sscanf(ss,"%d,%d,%d",&dv,&iv,&index)) == 3) {
+ irange1 = irange2 = index;
+ } else if (i == 2) index = 0;
+ else halt(usage,NULL);
+ ss = strchr(ss,0);
+ break;
+
+ case 'K': getnum("%d",&index_mod); break;
+
+ case 'C': ccorr = 1; break;
+
+ case 'B': getoptnum("%lg",&hbin,1.0); gram = 1; break;
+ case 'g': getoptnum("%d",&gram,100); break;
+ case 'G': getnum("%d",&h2gram); break;
+
+ case 'l': get2num("%lg:%lg",&low1,&high1); break;
+ case 'L': get2num("%lg:%lg",&low2,&high2); break;
+
+ default: halt(usage,NULL);
+
+ } /* switch */
+ }
+ } /* while */
+
+ if (argc == 0) halt(usage,NULL);
+
+ /* now find the histogram file length */
+
+ ss = argv[0];
+
+ ff = fopen(ss,"r");
+ if (ff == NULL) halt("Could not open file %s",ss);
+
+ block = readheader(ff,&h);
+ if (icorrlen) h.d2 = icorrlen;
+ tmparr = dblarr(block);
+
+ vol = h.lx*h.ly*h.lz*h.lt;
+
+ if (indexed && !irange1) icorr = 1;
+
+ if (icorr) {
+ if (h.d1 >= icorr || icorrlen) {
+ irange1 = 1 + (icorr-1)*h.d2;
+ irange2 = irange1 + h.d2 - 1;
+ } else if (h.d1 + h.d3 >= icorr) {
+ irange1 = 1 + h.d1*h.d2 + (icorr-1-h.d1)*h.d4;
+ irange2 = irange1 + h.d4 - 1;
+ } else halt("No such thing!",NULL);
+ }
+
+ if (indexed) tmpx = dblarr(irange2-irange1+1);
+
+ /* do not assume only doubles */
+ if (iters) iters += skip;
+
+ length = 0;
+ do {
+ length++;
+ lk = readdata(ff,tmparr);
+ } while (lk == length && iters != length);
+ if (lk != length) length--;
+ rewind(ff);
+ skipheader(ff);
+ if (icorrlen) h.d2 = icorrlen;
+
+ length -= (skip);
+ if (!quiet) {
+ fprintf(stderr,"* Data: %d Measurements: %d\n",block,length);
+ fprintf(stderr," double %ld, float %ld, long %ld, char %ld\n",
+ h.n_double,h.n_float,h.n_long,h.n_char);
+ }
+
+ if (length <= 0) halt("Measurements == 0",NULL);
+ if (jack < 0) jack = length;
+
+ if (volume) vnorm = 1.0/vol; else vnorm = 1;
+ if (sqrtV) vnorm *= sqrt(vol);
+
+ if (!raw) {
+ if (nd) {
+ for (i=0; i block) halt("Illegal data number",NULL);
+ if (irange1 > irange2 || irange2 > block) halt("Illegal range spec",NULL);
+
+ if (weight || reweight) wd = dblarr(length);
+
+ if (susy) susy_mul = pow(4.0/(betag * 4.0/9.0),3.0) * (h.lx*h.ly*h.lz);
+
+ j = skip;
+ while (j-- > 0) lk = readdata(ff,tmparr);
+
+ if (idata) fp = dat[0];
+ for (jj=j=0; jj < length; jj++) {
+ extern int calclist_index;
+
+ lk = readdata(ff,tmparr);
+
+ calclist_index = jj;
+
+ k=0;
+ if (!filter || tmparr[filter-1] == filtervalue) {
+
+ if (reweight || weight) {
+ double w;
+ static int first=1;
+ static double w0;
+
+ if (susy)
+ /*** THis is the `old' susy weight
+ w = 0.379*(sqr(100.0/T) - sqr(100.0/Tn)) * tmparr[8]
+ + 0.849*(sqr(mU/T) - sqr(mUn/Tn)) * tmparr[16];
+ *****/
+
+ w = -susy_mul*( (1.0/sqr(T) - 1.0/sqr(Tn)) *
+ ( 18384.1*tmparr[3] - 3984.08*tmparr[4] - 2*1191.72*tmparr[7]
+ + 2*96.6867*tmparr[8] )
+ - (sqr(mU/T) - sqr(mUn/Tn))*tmparr[14] );
+
+ else if (higgs) w = (beta2[0]/beta1[0]-1)*tmparr[2] +
+ (sqr(beta2[0]*mH2/(beta1[0]*mH))-1)*tmparr[4];
+ else if (reweight)
+ for (w=i=0; i= minval) {
+ if (lagged && !lag_on) lag_on = 1;
+ else {
+ j++;
+ if (!isaccept) {
+ isaccept = 1;
+ begmeas = i;
+ }
+ }
+ } else {
+ lag_on = 0;
+ if (isaccept) {
+ isaccept = 0;
+ printf("%d %d\n",begmeas,i-1);
+ }
+ }
+ }
+ if (isaccept) printf("%d %d\n",begmeas,length-1);
+
+ if (!quiet) fprintf(stderr," - accepted %d of %d measurements\n",j,length);
+ exit(-1);
+ }
+
+ if (limits) {
+ double *dd[MAX_DATA],mval,Mval;
+
+ for (idata=irange1; idata<=irange2; idata++) dd[idata] = dblarr(length);
+ for (i=j=0; i= minval) {
+ for (idata=irange1; idata<=irange2; idata++)
+ dd[idata][j] = dat[idata][i];
+ j++;
+ }
+ }
+ for (idata=irange1; idata<=irange2; idata++) {
+ free(dat[idata]);
+ dat[idata] = dd[idata];
+ }
+ if (!quiet) fprintf(stderr," - accepted %d of %d measurements\n",j,length);
+ length = j;
+ if (jack > length) jack = length;
+ }
+
+ if (stagger) {
+ double * dd,val;
+
+ for (idata=irange1; idata<=irange2; idata++) {
+ dd = (double *)calloc(length/stagger+1,sizeof(double));
+ for (i=0; i length) jack = length;
+ }
+
+ if (rblock) {
+ double * dd,val;
+
+ for (idata=irange1; idata<=irange2; idata++) {
+ dd = (double *)calloc(length-rblock+1,sizeof(double));
+ for (val=i=0; i length) jack = length;
+ }
+
+ idata = irange1;
+
+ if (printjack && gram) {
+ printhgram(dat[0],gram,length,wd,weight,jack,hbin);
+ return(1);
+ }
+
+ if (printjack || jack) {
+ double *jave,*jnorm;
+ int dl,id;
+
+ dl = irange2-irange1+1;
+ jave = dblarr(dl*jack);
+ jnorm = dblarr(jack);
+
+ for (j=0; j 1) ftrans(jave+j*dl,dl);
+ if (printjack) {
+ for (idata=irange1; idata<=irange2; idata++) {
+ id = idata-irange1;
+ if (!isarray) printf("%d %.16lg\n",id,jave[id+j*dl]);
+ else if (isarray == 1)
+ printf("%ld %ld %.16lg\n",id/h.d3,id%h.d3,jave[id+j*dl]);
+ else {
+ printf("%.16lg ",jave[id+j*dl]);
+ if ((id+1)%h.d3 == 0) printf("\n");
+ }
+
+ /* fprintf(stderr,"."); fflush(stderr); */
+ }
+ }
+ }
+ if (!printjack) {
+ double aveg,error;
+
+ for (idata=irange1; idata<=irange2; idata++) {
+ id = idata-irange1;
+ aveg = error = 0.0;
+ for (j=0; j p[i]) ? p[i] : minv;
+ }
+
+ if (hbin <= 0) {
+ hg = (double *)calloc(bins,sizeof(double));
+ d = (maxv-minv)/(bins-1);
+ bn = bins;
+ maxv += d/2; minv -= d/2;
+ xadd = 0.5;
+ } else {
+ maxv = hbin * ceil(maxv/hbin);
+ minv = hbin * floor(minv/hbin);
+ bins = maxv/hbin - minv/hbin + 1;
+ bn = bins-1;
+ hg = (double *)calloc(bins,sizeof(double));
+ d = hbin;
+ xadd = 0;
+ }
+
+ if (jack < 2) {
+
+ if (!weight)
+ for (i=0; i a[i]) ? a[i] : mina;
+ }
+ }
+ d = (maxa-mina)/(bins-1);
+ maxa += d/2; mina -= d/2;
+
+ if (maxb <= minb) {
+ maxb = -1e60; minb = -maxb;
+ for (i=0; i b[i]) ? b[i] : minb;
+ }
+ }
+ d = (maxb-minb)/(bins-1);
+ maxb += d/2; minb -= d/2;
+
+ for (i=0; i ac[j]) ? max : ac[j];
+ }
+
+ for (j=0; j= min && level_val <= max) {
+ for (j=0; j=0; j--) printf("%s\n",level[j]);
+
+ free(his);
+
+ for (i=0; i <= NLEV; i++) {
+ sp = level[i];
+ for (j=0; j< bins; j++) sp[j] = ' ';
+ sp[j] = 0;
+ }
+
+ if (showlevel && level_val >= min && level_val <= max) {
+ for (j=0; j ac[i]) ? dmax : ac[i];
+ k++;
+ }
+ dave /= k;
+ dsig = sqrt(dsig/k - sqr(dave));
+ level[ (int)(NLEV*(dmin-min)/(max-min)) ][j] = '.';
+ level[ (int)(NLEV*(dmax-min)/(max-min)) ][j] = '.';
+ if (dave+dsig <= max)
+ level[ (int)(NLEV*(dave+dsig-min)/(max-min)) ][j] = '-';
+ if (dave-dsig >= min)
+ level[ (int)(NLEV*(dave-dsig-min)/(max-min)) ][j] = '-';
+ level[ (int)(NLEV*(dave-min)/(max-min)) ][j] = '*';
+ }
+
+ printf("\nblocked history with %d blocks, size %d ...\n",bins,nv/bins);
+
+ for (j=NLEV; j>=0; j--) printf("%s\n",level[j]);
+
+ return(1);
+}
+
+/********************************************************************
+ * Assume here periodicity so that d[2*nd - 1 - i] = d[i]
+ */
+
+int
+ccorr_matrix(double *dat[],int n,int length,int jack,
+ double *wd,int weight,double wnorm)
+{
+ int i,j,k;
+ double *a,*w,*eval,*evec;
+
+ a = dblarr(n);
+ w = dblarr(n*n);
+ eval = dblarr(n);
+ evec = dblarr(n*n);
+
+ if (!weight) wnorm = 1.0/length; else wnorm /= length;
+
+ for (j=0; j 1 && (sig == 0.0 || sig < 1.0e-12*sqr(ave) || noauto)) {
+ /* if (!noauto) fprintf(stderr," ** small sigma/ave: %g/%g\n",sqrt(sig),ave); */
+ *avep = ave;
+ *sigp = sqrt(sig/(nconf-1));
+ *fsp = *fser = 0.5;
+ return(1);
+ }
+
+ if (nconf <= 1) {
+ *avep = ave;
+ *sigp = 0.0;
+ *fsp = *fser = 0.5;
+ return(1);
+ }
+
+ /* time correlations */
+
+ it = 0;
+ fs = 0.5;
+ do {
+ it++;
+ ax = ay = fi = 0.0;
+
+ nc=nconf-it;
+ for (j=0; j it && it < nconf/2);
+
+ if (it >= nconf/2) fprintf(stderr," ** correlation > N/2*%d\n",TINT);
+
+ sig=sqrt(sig/(nconf-1));
+
+ *sigp = sig;
+ *avep = ave;
+ *fsp = fs;
+ *fser = fs*sqrt(2.*(2.*it+1)/nc);
+
+ return(0);
+}
+
+
+/********************************************************************
+ * this routine calculates autocorrelation function
+ */
+
+int
+autocorr(d,nd,res,nres)
+ int nd,nres;
+ double d[], res[];
+{
+ double ave,fs,sig,ax,ay,fi;
+ int i,j,nc,it,k;
+
+ ave = sig = 0.0;
+
+ for(i=0; i= tmax) {
+ if (!ismax) num++;
+ ismax = 1; ismin = 0;
+ } else if (d[i] <= tmin) {
+ if (!ismin) num++;
+ ismax = 0; ismin = 1;
+ }
+ }
+
+ return(1.0*nd/num);
+}
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/calclist.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/calclist.c
new file mode 100644
index 0000000000000000000000000000000000000000..4e7a7779b1edf6ece4fb4ce3513663dcaf2b2f8c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/calclist.c
@@ -0,0 +1,237 @@
+#include
+#include
+#include
+#include
+#include
+/* #include */
+#include
+
+#include "stuff.h"
+#ifdef cray
+double asinh(double x) { return(log(sqrt(x*x+1.0) + x)); }
+double acosh(double x) { return(log(sqrt(x*x-1.0) + x)); }
+double atanh(double x) { return(0.5*log((1.0+x)/(1.0-x))); }
+#endif
+
+#define pi 3.1415926535897929
+
+char * getnumber(char *s,double *d);
+double evallist(double d[],int dn, int prec);
+void eval2(double *val1, double *val2, double d[],int dn);
+
+
+static char *cmd, *cmd0;
+/* This is modified by the calling program, to produce correct
+ * index
+ */
+int calclist_index = 0;
+
+/**********************************************************
+ * this evaluates the data arithmetic string
+ */
+
+double
+calclist(double d[],int dn, char *incmd)
+{
+ double val;
+
+ /* save current number of calclist calls - return with #i */
+
+ cmd0 = cmd = incmd;
+ val = evallist(d,dn,0);
+ if (*cmd == 0) return(val);
+ if (*cmd == ')') {
+ fprintf(stderr,"Extra \')\' : %s\n",cmd0);
+ exit(0);
+ }
+ if (*cmd == ',') {
+ fprintf(stderr,"Extra \',\' : %s\n",cmd0);
+ exit(0);
+ }
+ fprintf(stderr,"Parser error: %s\n",cmd0);
+}
+
+
+double
+evallist(double d[],int dn,int prec)
+{
+ int id;
+ char *sp;
+ double val,val2;
+
+ /* get argument first */
+ while (*cmd == ' ') cmd++;
+ if (*cmd == '(') {
+ cmd++;
+ val = evallist(d,dn,0);
+ if (*cmd != ')') {
+ fprintf(stderr,"Expecting \')\': \'%s\'\n",cmd0);
+ exit(0);
+ }
+ cmd++;
+ } else if (*cmd == '#') {
+ cmd++;
+ if (*cmd == 'i') {
+ /* print out index */
+ val = calclist_index;
+ cmd++;
+ } else {
+ if ((cmd=getnumber(cmd,&val)) == NULL) {
+ fprintf(stderr,"Expecting 'i'/number after #: \'%s\'\n",cmd0);
+ exit(0);
+ }
+ id = val;
+ if (id > dn) {
+ fprintf(stderr,"#%d too large: max %d\n",id,dn);
+ exit(0);
+ }
+ val = d[id-1];
+ }
+ }
+ else if ((sp=getnumber(cmd,&val)) != NULL) cmd = sp;
+ else {
+ /* now something else as ( or #; check for function name */
+ if (strncmp(cmd,"sqrt(",5) == 0) { cmd+=5; val = sqrt(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"abs(",4) == 0) { cmd+=4; val = fabs(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"sin(",4) == 0) { cmd+=4; val = sin(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"cos(",4) == 0) { cmd+=4; val = cos(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"tan(",4) == 0) { cmd+=4; val = tan(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"exp(",4) == 0) { cmd+=4; val = exp(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"log(",4) == 0) { cmd+=4; val = log(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"asin(",5) == 0) { cmd+=5; val = asin(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"acos(",5) == 0) { cmd+=5; val = acos(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"atan(",5) == 0) { cmd+=5; val = atan(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"sinh(",5) == 0) { cmd+=5; val = sinh(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"cosh(",5) == 0) { cmd+=5; val = cosh(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"tanh(",5) == 0) { cmd+=5; val = tanh(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"asinh(",6) == 0){ cmd+=6; val = asinh(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"acosh(",6) == 0){ cmd+=6; val = acosh(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"atanh(",6) == 0){ cmd+=6; val = atanh(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"ceil(",5) == 0) { cmd+=5; val = ceil(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"floor(",6) == 0){ cmd+=6; val = floor(evallist(d,dn,0)); }
+ else if (strncmp(cmd,"min(",4) == 0) {
+ cmd+=4; eval2(&val,&val2,d,dn); val = smaller(val,val2); }
+ else if (strncmp(cmd,"max(",4) == 0) {
+ cmd+=4; eval2(&val,&val2,d,dn); val = greater(val,val2); }
+ else if (strncmp(cmd,"pi",2) == 0) {
+ cmd+=2; val = pi; }
+ else { fprintf(stderr,"Unknown stuff: %s\n",cmd0); exit(0); }
+
+ if (*cmd != ')') {
+ fprintf(stderr,"Expecting \')\' after the function name: \'%s\'\n",cmd0);
+ exit(0);
+ }
+ cmd++;
+ }
+
+ /* now the operator */
+ while (*cmd == ' ') cmd++;
+
+ while (*cmd) {
+ switch (*cmd) {
+ case ',': return(val);
+ case ')': return(val);
+ case '^':
+ if (prec >= 5) return(val);
+ cmd++;
+ val = pow(val,evallist(d,dn,5));
+ break;
+ case '*':
+ if (prec >= 4) return(val);
+ cmd++;
+ val *= evallist(d,dn,4);
+ break;
+ case '/':
+ if (prec >= 4) return(val);
+ cmd++;
+ val /= evallist(d,dn,4);
+ break;
+ case '%':
+ if (prec >= 4) return(val);
+ cmd++;
+ val = fmod(val,evallist(d,dn,4));
+ break;
+ case '+':
+ if (prec >= 3) return(val);
+ cmd++;
+ val += evallist(d,dn,3);
+ break;
+ case '-':
+ if (prec >= 3) return(val);
+ cmd++;
+ val -= evallist(d,dn,3);
+ break;
+
+ case '=':
+ if (cmd[1] != '=') { fprintf(stderr,"Expecting '==': %s\n",cmd0); exit(0); }
+ if (prec >= 2) return(val);
+ cmd += 2;
+ val = (val == evallist(d,dn,2));
+ break;
+ case '<':
+ if (prec >= 2) return(val);
+ cmd++;
+ if (cmd[0] == '=') {
+ val = (val <= evallist(d,dn,2)); cmd++;
+ } else val = (val < evallist(d,dn,2));
+ break;
+ case '>':
+ if (prec >= 2) return(val);
+ cmd++;
+ if (cmd[0] == '=') {
+ val = (val >= evallist(d,dn,2)); cmd++;
+ } else val = (val > evallist(d,dn,2));
+ break;
+ case '!':
+ if (cmd[1] != '=') { fprintf(stderr,"Expecting '!=': %s\n",cmd0); exit(0); }
+ if (prec >= 2) return(val);
+ cmd += 2;
+ val = (val != evallist(d,dn,2));
+ break;
+
+ default:
+ fprintf(stderr,"Unknown operator: %s\n",cmd0);
+ exit(0);
+ }
+ }
+ return(val);
+}
+
+/**************************************************************
+ * eval 2 arguments -- must be comma!
+ */
+
+void
+eval2(double *val1, double *val2, double d[],int dn)
+{
+ *val1 = evallist(d,dn,0);
+ if (*cmd != ',') { fprintf(stderr,"Expecting ',' : %s\n",cmd0); exit(0);}
+ cmd++;
+ *val2 = evallist(d,dn,0);
+}
+
+
+/**************************************************************
+ * hop number
+ */
+char *
+getnumber(char *s,double *d)
+{
+ int dot = 0;
+
+ if (sscanf(s,"%lg",d) != 1) return(NULL);
+
+ while (*s == ' ' || *s == '\t') s++;
+ if (*s == '+' || *s == '-') s++;
+ if (*s == '.') { s++; dot = 1; }
+ while (*s <= '9' && *s >= '0') s++;
+ if ((!dot) && *s == '.') s++;
+ while (*s <= '9' && *s >= '0') s++;
+ if (*s == 'e' || *s == 'E') {
+ s++;
+ if (*s == '+' || *s == '-') s++;
+ while (*s <= '9' && *s >= '0') s++;
+ }
+ return(s);
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/dblarr.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/dblarr.c
new file mode 100644
index 0000000000000000000000000000000000000000..c6281ea10193fd37e64e154da0c5233dbfc830ec
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/dblarr.c
@@ -0,0 +1,48 @@
+#include
+#include
+#include
+#include
+#include
+/* #include */
+#include
+
+/* #include */
+
+#include "stuff.h"
+
+double * dblarr(int size)
+{
+ double * p;
+
+ p = (double *)calloc(size,sizeof(double));
+ if (p == NULL) {
+ fprintf(stderr," --- could not allocate double array of size %d\n",size);
+ exit(0);
+ }
+ return (p);
+}
+
+float * fltarr(int size)
+{
+ float * p;
+
+ p = (float *)calloc(size,sizeof(float));
+ if (p == NULL) {
+ fprintf(stderr," --- could not allocate float array of size %d\n",size);
+ exit(0);
+ }
+ return (p);
+}
+
+
+int * intarr(int size)
+{
+ int * ip;
+
+ ip = (int *)calloc(size,sizeof(int));
+ if (ip == NULL) {
+ fprintf(stderr," --- could not allocate int array of size %d\n",size);
+ exit(0);
+ }
+ return(ip);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/halt.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/halt.c
new file mode 100644
index 0000000000000000000000000000000000000000..02a432d38189e5c06fa5cf5ac233c36f540e8c86
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/halt.c
@@ -0,0 +1,16 @@
+#include
+#include
+#include
+#include
+#include
+/* #include */
+#include
+
+int
+halt(char *s,void *p)
+{
+ fprintf(stderr,s,p);
+ fprintf(stderr,"\n");
+ exit(0);
+ return(0); /* just to get rid of an warning.. */
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/io_unformat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/io_unformat.c
new file mode 100644
index 0000000000000000000000000000000000000000..b72c7cc17970ee3c16ded488d85648a3f0196221
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/io_unformat.c
@@ -0,0 +1,292 @@
+/*
+ * UNFORMATTED IO SYSTEM
+ * Kari Rummukainen 1990 - 1999
+ */
+
+
+#include
+#include
+#include
+#include
+#include
+/* #include */
+#include
+
+#include "stuff.h"
+
+
+static int inv_bytes,long_mode,block,dblock,lblock,fblock,cblock,iblock;
+static double *dtmparr;
+static float *ftmparr;
+static long *ltmparr;
+static int *itmparr;
+static char *ctmparr;
+static int msg = 0;
+
+#define l_h (sizeof(e_header)/sizeof(long))
+
+typedef union {
+ e_header h;
+ long l[l_h];
+} h_union;
+
+typedef union {
+ i_header h;
+ int l[l_h];
+} i_union;
+
+#define ll_h (sizeof(ll_header)/sizeof(int))
+
+typedef union {
+ ll_header h;
+ int l[ll_h];
+} ll_union;
+
+
+
+/**************************************************
+ * invert the byte ordering
+ */
+
+long
+swaplong(long a)
+{
+ union {
+ long l;
+ char c[sizeof(long)];
+ } t1,t2;
+ int i;
+
+ t1.l = a;
+ for (i=0; i= 2) { /* now long is longer than our long */
+ fread(&ll.h,sizeof(ll_header),1,ff);
+
+ for (k=0, j=( (long_mode == 2) ? 0 : 1 ); jlz < 1) h->lz = 1;
+ if (h->lt < 1) h->lt = 1;
+
+ lblock = iblock = 0;
+ block = dblock = l.h.n_double;
+ if (dblock) dtmparr = dblarr(l.h.n_double);
+ block += fblock = l.h.n_float;
+ if (fblock) ftmparr = (float *)calloc(l.h.n_float,sizeof(float));
+ block += cblock = l.h.n_char;
+ if (cblock) ctmparr = (char *)calloc(l.h.n_char,sizeof(char));
+
+ if (l.h.n_long) {
+ if (long_mode == 0) {
+ block += lblock = l.h.n_long;
+ ltmparr = (long *)calloc(l.h.n_long,sizeof(long));
+ } else if (long_mode == 1) {
+ block += iblock = l.h.n_long;
+ itmparr = (int *)calloc(l.h.n_long,sizeof(int));
+ } else if (long_mode >= 2) {
+ block += iblock = l.h.n_long;
+ itmparr = (int *)calloc(2*l.h.n_long,sizeof(int));
+ }
+ }
+
+ return (block);
+
+}
+
+int
+skipheader(FILE *ff)
+{
+ e_header e;
+ i_header i;
+
+ if (long_mode == 0) fread(&e,sizeof(e_header),1,ff);
+ else if (long_mode == 1) fread(&i,sizeof(i_header),1,ff);
+ else if (long_mode >= 2) {
+ fread(&i,sizeof(i_header),1,ff);
+ fread(&i,sizeof(i_header),1,ff);
+ }
+ return(1);
+}
+
+
+#define itmp_index(i) ((long_mode < 2) ? i : 2*i + long_mode-2)
+
+long
+readdata(FILE *ff,double *arr)
+{
+ int ik,k,i;
+ long lk;
+
+ if (dblock) fread(dtmparr,dblock,sizeof(double),ff);
+ if (fblock) fread(ftmparr,fblock,sizeof(float),ff);
+ if (lblock) fread(ltmparr,lblock,sizeof(long),ff);
+ if (iblock && long_mode < 2) fread(itmparr,iblock,sizeof(int),ff);
+ if (iblock && long_mode >= 2) fread(itmparr,2*iblock,sizeof(int),ff);
+ if (cblock) fread(ctmparr,cblock,sizeof(char),ff);
+
+ k = 0;
+ if (long_mode == 1) {
+ fread(&ik,1,sizeof(int),ff); if (inv_bytes) lk = swapint(ik); else lk = ik;
+ } else if (long_mode == 0) {
+ fread(&lk,1,sizeof(long),ff); if (inv_bytes) lk = swaplong(lk);
+ } else {
+ int a[2];
+ fread(&a,2,sizeof(int),ff);
+ if (inv_bytes) lk = swapint(a[long_mode-2]);
+ else lk = a[long_mode-2];
+ }
+
+ if (inv_bytes) {
+ for (i=0; i
+#include
+#include
+
+double fabs(double);
+
+#define verysmall(a,b) ((fabs(a)+b) - fabs(a) == 0)
+
+void eigsrt(int n,double d[],double *v);
+
+int jacobi(int n, double *ap,double d[],double *vp,int is_ordered)
+{
+ double *a,*b,*z,*v;
+ int ip,iq,i,j,nrot;
+ double sm,tresh,g,h,t,theta,tau,c,s;
+
+ a = (double *)calloc(n*n,sizeof(double));
+ v = (double *)calloc(n*n,sizeof(double));
+ b = (double *)calloc(n,sizeof(double));
+ z = (double *)calloc(n,sizeof(double));
+
+ for (i=0; i 4 && verysmall(d[ip],g) && verysmall(d[iq],g)) a[ip*n+iq] = 0;
+ else if (fabs(a[ip*n+iq]) > tresh) {
+ h = d[iq]-d[ip];
+ if (verysmall(h,g)) t = a[ip*n+iq]/h;
+ else {
+ theta = 0.5*h/a[ip*n+iq];
+ t = 1./(fabs(theta)+sqrt(1.0+theta*theta));
+ if (theta < 0) t = -t;
+ }
+ c = 1./sqrt(1+t*t);
+ s = t*c;
+ tau = s/(1.0+c);
+ h = t*a[ip*n+iq];
+ z[ip] -= h; z[iq] += h;
+ d[ip] -= h; d[iq] += h;
+ a[ip*n+iq] = 0;
+ for (j=0; j<=ip-1; j++) {
+ g = a[j*n+ip]; h = a[j*n+iq];
+ a[j*n+ip] = g - s*(h + g*tau);
+ a[j*n+iq] = h + s*(g - h*tau);
+ }
+ for (j=ip+1; j<=iq-1; j++) {
+ g = a[ip*n+j]; h = a[j*n+iq];
+ a[ip*n+j] = g - s*(h + g*tau);
+ a[j*n+iq] = h + s*(g - h*tau);
+ }
+ for (j=iq+1; j tresh) */
+ } /* iq =ip+1..n-1 */
+ } /* ip = 0..n-2 */
+
+ for (ip=0; ip= p) p = d[k = j];
+ if (k != i) {
+ d[k] = d[i];
+ d[i] = p;
+ for (j=0; j (y)) ? (x) : (y))
+#define smaller(x,y) (((x) < (y)) ? (x) : (y))
+
+#define getoptnum(par1,par2,val){ \
+ char *sp; \
+ sp = ss; \
+ if (!*ss && (argc-1)) ss = (argv+1)[0]; \
+ if (sscanf(ss,par1,par2) != 1) { \
+ *par2 = val; ss = sp; \
+ } else { \
+ if (!*sp && --argc) argv++; \
+ ss = strchr(ss,0); \
+ } \
+}
+
+#define getnum(par1,par2){ \
+ if (!*ss && --argc) ss = (++argv)[0]; \
+ if (sscanf(ss,par1,par2) != 1) { \
+ fprintf(stderr,usage); \
+ exit(-1); \
+ } \
+ ss = strchr(ss,0);}
+
+#define get2num(str,p1,p2){ \
+ if (!(*ss) && --argc) ss = (++argv)[0]; \
+ if (sscanf(ss,str,p1,p2) != 2) { \
+ fprintf(stderr,usage); \
+ exit(-1); \
+ } \
+ ss = strchr(ss,0);}
+
+#define get3num(str,p1,p2,p3){ \
+ if (!(*ss) && --argc) ss = (++argv)[0]; \
+ if (sscanf(ss,str,p1,p2,p3) != 3) { \
+ fprintf(stderr,usage); \
+ exit(-1); \
+ } \
+ ss = strchr(ss,0);}
+
+#define get4num(str,p1,p2,p3,p4){ \
+ if (!(*ss) && --argc) ss = (++argv)[0]; \
+ if (sscanf(ss,str,p1,p2,p3,p4) != 4) { \
+ fprintf(stderr,usage); \
+ exit(-1); \
+ } \
+ ss = strchr(ss,0);}
+
+#define get5num(str,p1,p2,p3,p4,p5){ \
+ if (!(*ss) && --argc) ss = (++argv)[0]; \
+ if (sscanf(ss,str,p1,p2,p3,p4,p5) != 5) { \
+ fprintf(stderr,usage); \
+ exit(-1); \
+ } \
+ ss = strchr(ss,0);}
+
+
+#define getlist(v,i){ \
+ if (!(*ss) && --argc) ss = (++argv)[0]; \
+ i = 0; \
+ while (*ss) { \
+ v[i] = ss; \
+ i++; \
+ if (strchr(ss,';') == NULL) break; \
+ ss = strchr(ss,';'); \
+ *ss = 0; \
+ ss++; \
+ } \
+ if (i <= 0) { \
+ fprintf(stderr,usage); \
+ exit(-1); \
+ } \
+ ss = strchr(ss,0);}
+
+#define getnumlist(v,i,format){ \
+ if (!(*ss) && --argc) ss = (++argv)[0]; \
+ i = 0; \
+ while (*ss) { \
+ if (sscanf(ss,format,&v[i++]) != 1) { \
+ fprintf(stderr,usage); exit(-1); \
+ } \
+ if (strchr(ss,',') == NULL) break; \
+ ss = strchr(ss,','); \
+ ss++; \
+ } \
+ if (i <= 0) { \
+ fprintf(stderr,usage); \
+ exit(-1); \
+ } \
+ ss = strchr(ss,0);}
+
+
+#define get1or3num(str1,str3,p1,p2,p3,n){ \
+ if (!(*ss) && --argc) ss = (++argv)[0]; \
+ if (sscanf(ss,str3,&p1,&p2,&p3) != 3) { \
+ if (sscanf(ss,str1,&p1) != 1) { \
+ fprintf(stderr,usage); \
+ exit(-1); \
+ } \
+ n = 1; \
+ p3 = p1; p2 = 1; \
+ } else n = 3; \
+ ss = strchr(ss,0);}
+
+
+
+/********* headers for unformatted io **********/
+
+typedef struct {
+ int headerid,f1,headersize,f2;
+ int n_double,f3,n_long,f4,n_int,f5,n_char,f6;
+ int lx,f7,ly,f8,lz,f9,lt,f10;
+ int d1,f11,d2,f12,d3,f13,d4,f14,d5,f15,d6,f16,d7,f17,d8,f18;
+} ll_header;
+
+typedef struct {
+ long headerid,headersize;
+ long n_double,n_long,n_float,n_char;
+ long lx,ly,lz,lt;
+ long d1,d2,d3,d4,d5,d6,d7,d8;
+} e_header;
+
+typedef struct {
+ int headerid,headersize;
+ int n_double,n_long,n_float,n_char;
+ int lx,ly,lz,lt;
+ int d1,d2,d3,d4,d5,d6,d7,d8;
+} i_header;
+
+#define E_HEADER_ID 91919191
+
+/* and a couple of protos */
+int readheader(FILE *ff,e_header *h);
+int skipheader(FILE *ff);
+long readdata(FILE *ff,double *tmparr);
+
+
+/* other prototypes */
+
+double calclist(double d[],int dl,char *cmd);
+double * dblarr(int size);float * fltarr(int size);
+int * intarr(int size);
+double confidence(double chisq,int dof);
+int gaussj(double* a,int n, int np, double* b,int m);
+int svdecomp(double *a, int n, double *b, int m);
+void fitfun(double *x,double *y,double *sig,int ndata,double *a,int ma,
+ double *covar,int *lista,int mfit,double *chisq,int print,
+ double funcs());
+void covarfit(double *x,double *y,double *cmat,int ndata,double *a,int ma,
+ double *covar,int *lista,int mfit,double *chisq,int print,
+ double funcs());
+void jackfit(double *x,double *y,double * sig,int ndata,int n1, int n2,
+ int jack,double *a,int ma,
+ double *covar,int *lista,int mfit,double *chisq,int print,
+ int fullcov,int simplex,double funcs());
+void jack_fit(double *x,double *y,double * sig,int ndata,int n1, int n2,
+ int jack,double *a,int ma,
+ double *covar,int *lista,int mfit,double *chisq,int print,
+ int fullcov,int simplex,double funcs(),double *av);
+double fitfun_s(double *x,double *y,double *sig,int ndata,double *a,int ma,
+ double funcs());
+double brent(double ax,double bx,double cx,double f(),double tol,double *xmin);
+double polyfit(int ndata,double x[],double y[],double sig[],
+ int deg,double par[],double ep[]);
+double nelder(int ndim,double p[],double ftol,double funk(),int *i);
+void simplexfit(double *x,double *y,double *sig,int ndata,double *a,int ma,
+ int *lista,int mfit,double *chisq,int print,double funcs());
+int jacobi(int n, double *ap,double d[],double *v,int is_ordered);
+double simpson(double x[], double y[], double res[], int r);
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/compile.sh.in b/qcd/part_cpu/applications/QCD/src/kernel_B/compile.sh.in
new file mode 100644
index 0000000000000000000000000000000000000000..89084ebbd9b55e3c405dd4a3392801b129c4c465
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/compile.sh.in
@@ -0,0 +1,34 @@
+#!/bin/bash -l
+#############################################################################
+### ###
+### Compile script for SU3 ###
+### ###
+### Last modified: 2008-09-08 ###
+### ###
+#############################################################################
+
+export MAKE="#MAKE#"
+export EXECNAME="#EXECNAME#"
+export AAPROG="#AAPROG#"
+export RM="#RM#"
+export AR="#AR#"
+export ARFLAGS="#ARFLAGS#"
+export CC="#CC#"
+export CFLAGS="#CFLAGS#"
+export MPI_CC="#MPI_CC#"
+export LDFLAGS="#LDFLAGS#"
+
+#MODULE_INIT#
+#MODULE_CMD# #MODULE_FILES#
+
+cd libraries/ && $MAKE all \
+&& cd ../su3h_n/ && $MAKE su3_ahiggs \
+&& cp su3_ahiggs $EXECNAME \
+&& cd ../
+
+#MODULE_INIT2#
+#MODULE_CMD2# #MODULE_FILES2#
+
+cd aa/ && $MAKE \
+&& cp aa $AAPROG \
+&& cd ../
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_complex.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_complex.c
new file mode 100644
index 0000000000000000000000000000000000000000..1b74f279e3f609443e2c925f2b62d75681ae4f7a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_complex.c
@@ -0,0 +1,10 @@
+/* block_field_complex -- does blocking on u1 higgs scalar field
+ * Kari Rummukainen 2002
+ */
+
+#include LATDEF
+
+#define FIELD complex
+#define block_FIELD block_field_complex
+
+#include "block_field_generic.c"
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_generic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_generic.c
new file mode 100644
index 0000000000000000000000000000000000000000..d590598d58eab86da0dec95d2aedab234d38da25
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_generic.c
@@ -0,0 +1,42 @@
+/*********************************************************
+ * Generic field blocking routine for SU(2)
+ * Kari Rummukainen 1998-2002
+ */
+
+FIELD * block_FIELD(FIELD *f, int newlev[NDIM], int free_old)
+{
+ /* Block the field(s)
+ */
+ int dir,i,j,off,x[NDIM],step[NDIM],oldlev[NDIM];
+ node_struct oldnode;
+ FIELD *th;
+
+ /* first, catch the current node */
+ oldnode = node;
+
+ /* calculate the relative change */
+ foralldir(dir) step[dir] = newlev[dir] -
+ (oldlev[dir] = current_blocking_level[dir]);
+
+ /* set new block */
+ set_blocking_level( newlev );
+
+ th = new_latfield( FIELD );
+
+ /* and loop over */
+ off = 0;
+ forallsites(i) {
+ foralldir(dir) x[dir] = (coordinate(i,dir) << step[dir]);
+
+ /* index to the corresponding site */
+ th[i] = f[ node_index( x, &oldnode ) ];
+
+ }
+
+ set_blocking_level( oldlev );
+
+ if (free_old) free_latfield( f );
+
+ return( th );
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_su3adjoint.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_su3adjoint.c
new file mode 100644
index 0000000000000000000000000000000000000000..c0069406652861e957fbdcf932dd647056018deb
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_su3adjoint.c
@@ -0,0 +1,10 @@
+/* block_su3_adjoint -- does blocking on su3 adjoint scalar field
+ * Kari Rummukainen 2002
+ */
+
+#include LATDEF
+
+#define FIELD adjoint_matrix
+#define block_FIELD block_field_su3adjoint
+
+#include "block_field_generic.c"
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_lattice.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_lattice.c
new file mode 100644
index 0000000000000000000000000000000000000000..5ff98245d3277db9eaa74eb7493ced40a0bd1793
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_lattice.c
@@ -0,0 +1,199 @@
+/*************************************************************************
+ * Block lattice operations
+ */
+
+#include "comdefs.h"
+#include "generic.h"
+
+/* set_blocking_leve( int blev[ndim] ) sets the global structures
+ * for reducing the current lattice size by a factors given in blev,
+ * so that, for example, nx -> nx/2^blev[XUP].
+ * Affects ONLY the gather, forallsites, etc. operations, does not
+ * touch individual variables.
+ *
+ * int * make_block_map( int blev[ndim] ) makes a index array from the
+ * current_blocking_level to blocking level blev. Thus, it returns
+ * an array of size current lattice size. It can go up or down,
+ * if up it fills the illegal sites with -(1<<30).
+ */
+
+/* static variables for the blocking operations */
+
+typedef struct block {
+ int level[NDIM]; /* blocking levels */
+ struct block * next; /* ptr to next block level */
+ lattice_struct lattice; /* saved lattice structure */
+ node_struct node; /* saved node structure */
+ node_struct * allnodes; /* ptr to node array for all nodes */
+ site_struct * site; /* ptr to site struct */
+ comlist_struct * comlist; /* and comlist for the gather */
+ int *neighb[NDIRS]; /* neighbour arrays */
+} block_buf;
+
+static block_buf * b_buf = NULL; /* pointer to all of the saved blocking vars */
+static block_buf * base; /* will point to the base level */
+static int n_blocks = 0; /* number of blocking levels */
+
+extern comlist_struct *comlist; /* ptr to comlist */
+extern node_struct *allnodes;
+
+void set_blocking_level( int blev[NDIM] )
+{
+ int dir,i,found,d;
+ block_buf *b,*p;
+
+ /* check first if it is the current level, nothing to do */
+ found = 1;
+ foralldir(dir)
+ if (current_blocking_level[dir] != blev[dir]) found = 0;
+ if (found) return;
+
+ /* Now something is happening */
+
+ if (b_buf == NULL) {
+ /* first time in, save the basic level here */
+ base = b_buf = (block_buf *)memalloc(1, sizeof(block_buf) );
+ foralldir(dir) base->level[dir] = 0;
+ for (dir=0; dirneighb[dir] = neighb[dir];
+
+ base->lattice = lattice; /* copy the lattice struct */
+ base->node = node; /* copy the node struct */
+ base->allnodes= allnodes; /* copy the address of the node array */
+ base->site = site; /* and the address of the site array too */
+ base->comlist = comlist; /* and comlist */
+ base->next = NULL;
+ }
+
+ /* ok, now chase the list and check if blev is defined */
+
+ found = 0;
+ for (b=b_buf; (!found) && b != NULL ; b=b->next) {
+ found = 1;
+ foralldir(dir) found = (found && (b->level[dir] == blev[dir]));
+ p = b;
+ }
+
+ if (found) {
+ /* now the gather was found, copy the arrays */
+
+ lattice = p->lattice;
+ node = p->node;
+ allnodes = p->allnodes;
+ site = p->site;
+ comlist = p->comlist;
+ foralldir(dir) current_blocking_level[dir] = p->level[dir];
+ for (dir=0; dirneighb[dir];
+
+ /* printf(" blocking level %d %d %d set up\n",
+ blev[XUP], blev[YUP], blev[ZUP]);
+ */
+ fflush(stdout);
+
+ } else {
+
+ /* now it was NOT found -- make new level */
+
+ p->next = b = (block_buf *)memalloc(1, sizeof(block_buf) );
+
+ b->lattice.volume = 1;
+ foralldir(dir) {
+ current_blocking_level[dir] = b->level[dir] = blev[dir];
+
+ /* check if this is at all legal */
+ if (base->lattice.size[dir] % (1 << blev[dir])) {
+ printf("Blocking error: cannot divide lattice by factors ");
+ foralldir(d) {
+ printf("%d",blev[d]);
+ if (d > 0) printf(" x ");
+ }
+ printf("\n");
+ halt("Blocking error");
+ }
+
+ b->lattice.size[dir] = (base->lattice.size[dir]) / (1 << blev[dir]);
+ b->lattice.volume *= b->lattice.size[dir];
+
+ }
+
+ lattice = b->lattice;
+
+ /* make the structures -- THIS INITIALIZES THE ARRAYS */
+
+ /* printf(" blocking level %d %d %d set up, making arrays\n",
+ * blev[XUP], blev[YUP], blev[ZUP]);
+ */
+
+ make_lattice_arrays( &(b->lattice) );
+
+ /* and copy now the stuff to buffer */
+ for (dir=0; dirneighb[dir] = neighb[dir];
+
+ b->node = node; /* copy the node struct */
+ b->allnodes= allnodes; /* copy the address of the node array */
+ b->site = site; /* and the address of the site array too */
+ b->comlist = comlist; /* comlist */
+ b->next = NULL;
+
+ }
+}
+
+
+/****************************************************************/
+
+void set_blocking_all( int lev )
+{
+ int b[NDIM],dir;
+
+ foralldir(dir) b[dir] = lev;
+ set_blocking_level( b );
+}
+
+
+/**************************************************************
+ * Make mapping from current to blev
+ */
+
+int *make_blocking_map( int newlev[NDIM] )
+{
+ int dir,i,is_ok,x[NDIM],step[NDIM],oldlev[NDIM];
+ node_struct newnode;
+ int *map;
+
+ /* allocate map */
+ map = (int *)memalloc( node.sites, sizeof(int) );
+
+ /* calculate the relative change */
+ foralldir(dir) step[dir] = newlev[dir] -
+ (oldlev[dir] = current_blocking_level[dir]);
+
+ /* set new blocking */
+ set_blocking_level( newlev );
+ /* catch the current node */
+ newnode = node;
+ /* and just reset the level */
+ set_blocking_level( oldlev );
+
+ /* and loop over */
+
+ forallsites(i) {
+ is_ok = 1;
+ foralldir(dir) {
+ if (step[dir] == 0) {
+ /* no change, coordinate as is */
+ x[dir] = coordinate(i,dir);
+ } else if (step[dir] < 0) {
+ /* down, new is 2^n denser/larger than the old */
+ x[dir] = (coordinate(i,dir) << (-step[dir]));
+ } else {
+ /* up, new is smaller than old */
+ /* if (coordinate(i,dir) % (1<> step[dir];
+ }
+ }
+ if (is_ok) map[i] = node_index( x, &newnode );
+ else map[i] = -(1<<30);
+
+ }
+ return( map );
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_complex.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_complex.c
new file mode 100644
index 0000000000000000000000000000000000000000..ee428e704af7b1a0157ce3d9c268ed4e0b9b2278
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_complex.c
@@ -0,0 +1,9 @@
+/****** block_link_complex.c -- compute the blocked link ******************/
+
+/* MIMD version 3 */
+
+#include LATDEF
+
+#define block_link_MATRIX block_link_complex
+
+#include "block_link_generic.c"
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_generic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_generic.c
new file mode 100644
index 0000000000000000000000000000000000000000..4f10469f8e69545c99268c2d0396fd2820f54caa
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_generic.c
@@ -0,0 +1,68 @@
+/*********************************************************
+ * Generic gauge field blocking routine
+ * Kari Rummukainen 1998 - 2002
+ */
+
+void block_link_MATRIX( MATRIX *oldl[NDIM], MATRIX *newl[NDIM],
+ int newlev[NDIM], int free_old )
+{
+ /* Just multiply the straight links U(x) U(x+1) -> U(x)
+ */
+
+ int i,j,dir;
+ int x[NDIM],step[NDIM],oldlev[NDIM];
+ int *nf[NDIM];
+ MATRIX *tmpl[NDIM];
+ msg_tag *tag[NDIM];
+ node_struct oldnode;
+
+ /* first, start XYZ-direction: move the link to ->staple */
+
+ foralldir(dir) {
+ oldlev[dir] = current_blocking_level[dir];
+ step[dir] = newlev[dir] - oldlev[dir];
+ if (step[dir] == 1) {
+ tag[dir] = start_get( oldl[dir], dir, EVENODD );
+ } else if (0 != step[dir]) halt(" Gauge blocking error" );
+ }
+
+ foralldir(dir) {
+ /* wait the gathers, this clears the buffers */
+ if (step[dir]) wait_get(tag[dir]);
+ /* grab the old neighbour arrays */
+ nf[dir] = neighb[dir];
+ }
+
+ /* copy the node, needed */
+ oldnode = node;
+
+ /* block the system */
+ set_blocking_level( newlev );
+
+ foralldir(dir) tmpl[dir] = new_latfield( MATRIX );
+
+ /* and loop over */
+ forallsites(i) {
+ foralldir(dir) x[dir] = (coordinate(i,dir)) << step[dir];
+ /* index to the corresponding site */
+ j = node_index( x, &oldnode );
+
+ /* and mult */
+ foralldir(dir) {
+ if (step[dir]) {
+ mult_MATRIX_nn( oldl[dir][j], oldl[dir][nf[dir][j]], tmpl[dir][i] );
+ } else {
+ tmpl[dir][i] = oldl[dir][j];
+ }
+ }
+ }
+
+ /* restore old level */
+ set_blocking_level( oldlev );
+
+ if (free_old) foralldir(dir) free_latfield( oldl[dir] );
+ /* set the pointer last - this makes it possible to use same
+ link in and out, if the old is freed first */
+ foralldir(dir) newl[dir] = tmpl[dir];
+
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su2.c
new file mode 100644
index 0000000000000000000000000000000000000000..d44235e56760ea8f1930cba7a62b314eba414f19
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su2.c
@@ -0,0 +1,9 @@
+/****** block_link_su2.c -- compute the blocked link ******************/
+
+/* MIMD version 3 */
+
+#include LATDEF
+#include "generic_su2.h"
+
+#define block_link_MATRIX block_link_su2
+#include "block_link_generic.c"
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su3.c
new file mode 100644
index 0000000000000000000000000000000000000000..da30c2dcd34cbd04427f4b0ae17a6b89058783ea
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su3.c
@@ -0,0 +1,8 @@
+/****** block_link_su3.c -- compute the blocked link ******************/
+
+/* MIMD version 3 */
+
+#include LATDEF
+
+#define block_link_MATRIX block_link_su3
+#include "block_link_generic.c"
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/bulk_update_mpi.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/bulk_update_mpi.c
new file mode 100644
index 0000000000000000000000000000000000000000..6dbf17272d5ec4d72b5ea599ffa21c1ccbdbd1d3
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/bulk_update_mpi.c
@@ -0,0 +1,21 @@
+/******************************************************************
+ * here some MPI typical "bulk update" subroutines
+ */
+
+#include "comdefs.h"
+#include "generic.h"
+
+/* Is the link inside node? Thus, OK if the link is not _along_ any
+ * of the bottom slabs of the node. For example, reject
+ * x-links where y,z,.. coordinate == min on the node
+ */
+
+int inside_node(int i, int dir)
+{
+ register int d,s;
+
+ s = 1;
+ foralldir( d ) if ( d != dir && coordinate(i,d) == node.xmin[d] ) s = 0;
+ return( s );
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi.c
new file mode 100644
index 0000000000000000000000000000000000000000..166748a2e2b5c6ebad48edcb49d3c2b5d7916d68
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi.c
@@ -0,0 +1,937 @@
+/****************** com_mpi.c *****************************************
+ *
+ * Communications routines, for MPI interface
+ * Modified from the MILC lattice QCD one
+ * KR 2001
+ *
+
+
+
+ g_sync() provides a synchronization point for all nodes.
+ g_floatsum() sums a floating point number over all nodes.
+ g_doublesum() sums a double over all nodes.
+ g_vecdoublesum() sums a vector of doubles over all nodes.
+ g_floatmax() finds maximum of a floating point number over all nodes.
+ g_doublemax() finds maximum of a double over all nodes.
+ broadcast_float() broadcasts a single precision number from
+ node 0 to all nodes.
+ broadcast_double() broadcasts a double precision number
+ send_integer() sends an integer to one other node
+ receive_integer() receives an integer
+ terminate() kills the job on all processors
+
+ start_gather() starts asynchronous sends and receives required
+ to gather neighbors.
+ wait_gather() waits for receives to finish, insuring that the
+ data has actually arrived.
+
+ start_scatter() invese of gather
+ wait_scatter()
+
+ send_field() sends a field to one other node.
+ receive_field() receives a field from some other node.
+*/
+
+/* load in definitions, variables etc. */
+
+#include "comdefs.h"
+#include "generic.h"
+#include "timers.h" /* includes comm timer calculators */
+
+#ifdef TIMERS
+static timer_type total_gather_timer, start_gather_timer, wait_send_timer,
+ wait_receive_timer, g_sync_timer,
+ g_sum_timer, broadcast_timer, send_timer, total_time;
+#endif
+static double total_sent_data = 0.0, total_gather_data = 0.0;
+
+
+#define MIN_GATHER_INDEX 100
+#define MAX_GATHER_INDEX 7100 /* allows 7000 concurrent gathers */
+#define FIELD_TYPE 11 /* used in send/receive field */
+#define SEND_INTEGER_TYPE 12 /* used in send/receive int */
+
+#define SEND_FLAG 1 /* flags for marking msg_tags */
+#define RECEIVE_FLAG 2
+
+extern comlist_struct *comlist; /* the comlist variables in layout.c */
+
+/************************************************************************/
+
+/* get all msg_tags in a single array, avoid allocating
+ * small bits and pieces of messages
+ */
+
+#define N_MSG_TAG 50
+
+static msg_tag msg_tag_arr[N_MSG_TAG];
+static msg_tag *msg_tag_free;
+
+void init_msg_tags()
+{
+ int i;
+
+ for (i=0; inext;
+ }
+
+ /* p points to the last in the list */
+ msg_tag_free = p->next;
+ p->next = NULL;
+ return(r);
+}
+
+
+void release_msg_tags(msg_tag *tp)
+{
+ msg_tag *p;
+
+ for (p=tp; p->next != NULL; p=p->next) ;
+ p->next = msg_tag_free;
+ msg_tag_free = tp;
+}
+
+/************************************************************************/
+
+
+/* Machine initialization */
+#include
+void initialize_machine() {
+/* MPI_Init(&argc,&argv); */
+
+#ifdef TIMERS
+ timer_reset( &total_time );
+ timer_start( &total_time );
+
+ timer_reset( &total_gather_timer );
+ timer_reset( &start_gather_timer );
+ timer_reset( &wait_send_timer );
+ timer_reset( &wait_receive_timer );
+ timer_reset( &g_sync_timer );
+ timer_reset( &g_sum_timer );
+ timer_reset( &broadcast_timer );
+ timer_reset( &send_timer );
+#endif
+
+ init_msg_tags();
+
+}
+
+
+/************************************************************************/
+
+/* this formats the wait_array, used by forallsites_waitA()
+ * should be made as fast as possible!
+ *
+ * wait_array[i] contains a bit at position 1<= node.sites ) wait_arr_[i] = wait_arr_[i] | (1<= node.sites ) wait_arr_[i] = wait_arr_[i] | (1< 4
+ halt("forallsites_waitA requires NDIM <= 4!\n");
+#endif
+
+ if (ntag > NA_MAX) halt("Error in forallsites_waitA: too many gathers");
+ mask = 0;
+ for (i=0; idir));
+ }
+ return( mask );
+}
+
+
+#ifdef OLD_WAIT_ARR
+int setup_wait_arr( unsigned char *wait_arr, msg_tag* tag_out[],
+ msg_tag* tag_in[], int ntag )
+{
+ register int i,j;
+ int nt,dir[NA_MAX];
+
+ if (ntag > NA_MAX) halt("Error in forallsites_waitA: too many gathers");
+ for (nt=i=0; idir;
+ nt++;
+ }
+ tag_out[nt] = (msg_tag *)NULL; /* needed for the last loop */
+
+ forallsites(i) {
+ wait_arr[i] = 0; /* basic, no wait */
+ for (j=0; j= node.sites ) wait_arr[i] = j+1;
+ }
+ return( nt );
+}
+#endif
+
+
+
+
+/************************************************************************/
+/* GATHER ROUTINES */
+/* start_gather() returns a pointer to a list of msg_tag's, which
+ be used as input to subsequent wait_gather().
+
+ This list contains msg_tags for all receive buffers, followed by
+ end flag.
+
+ If no messages at all are required, the routine will return NULL.
+ msg_buf==NULL should be a reliable indicator of no message.
+
+ usage: tag = start_gather( source, size, direction, parity )
+*/
+
+
+
+msg_tag* start_gather( field, size, dir, parity )
+ /* arguments */
+ char * field; /* pointer to some latfield */
+ int size; /* size in bytes of the field (eg sizeof(su3_vector))*/
+ int dir; /* direction to gather from. eg XUP - index into
+ neighbor tables */
+ int parity; /* parity of sites whose neighbors we gather.
+ one of EVEN, ODD or EVENODD (EVEN+ODD). */
+{
+ /* local variables */
+ int i,j,k; /* scratch */
+ int offset; /* number of sites in this receive or send */
+ int *idx; /* index array pointer */
+ int nsites;
+ char *tpt; /* temp ptr to buffer */
+ msg_tag *mbuf, *mp; /* list of message tags, to be returned */
+ comlist_struct *cp;
+ send_struct *sp;
+ receive_struct *rp;
+ static int index = MIN_GATHER_INDEX; /* index to identify the operation */
+
+
+ if (dir < 0 || dir >= NDIRS) {
+ printf("No such gather %d, node %d\n",dir,mynode());
+ terminate(1212);
+ }
+
+ /* First, get the rolling index for the operation
+ * This MUST BE HERE even if there's nothing to do, because
+ * somebody else might be doing this!
+ */
+ ++index ; if (index > MAX_GATHER_INDEX) index = MIN_GATHER_INDEX;
+
+ cp = &comlist[dir];
+
+ /* Now if there's nothing to do, return - CHECK IF GATHERED */
+ if( ( cp->n_send == 0 && cp->n_receive == 0 )
+ || is_already_gathered( field, size, dir, parity) )
+ return( (msg_tag *) NULL );
+
+ /* mark gathered, if needed */
+ gather_mark_gathered( field, size, dir, parity );
+
+ /* allocate a buffer for the msg_tags. This is dynamically allocated
+ because there may be an arbitrary number of gathers in progress
+ in any direction. SIZE = n_msgs sent+received, for end flag
+ */
+
+ mbuf = get_msg_tags(cp->n_send + cp->n_receive);
+
+#ifdef TIMERS
+ /* mark start time for this gather */
+ mbuf->start_time = timer_start( &start_gather_timer );
+#endif
+
+ mp=mbuf;
+ /* HANDLE RECEIVES: loop over nodes which will send here */
+ for (i=0, rp=cp->from_node; in_receive;i++, rp=rp->next, mp=mp->next) {
+ /* note--neighbors of EVEN sites are always first in the list!
+ * Thus, for ODD sites we must change the offset
+ */
+ switch (parity) {
+ case EVEN: nsites = rp->n_even; offset = rp->offset; break;
+ case ODD: nsites = rp->n_odd; offset = rp->offset + rp->n_even; break;
+ case EVENODD: nsites = rp->n; offset = rp->offset; break;
+ }
+
+ mp->flag = RECEIVE_FLAG; /* flag as normal receive */
+ mp->dir = dir;
+ /* and post receive -- comes right on spot */
+ MPI_Irecv( ((char *)field) + offset*size, nsites*size, MPI_BYTE,
+ rp->node, index, MPI_COMM_WORLD, &(mp->mpi) );
+
+ total_gather_data += nsites*size;
+
+ }
+
+ /* HANDLE SENDS - note: mp automatically correct */
+ for(k=0,sp=cp->to_node; k < cp->n_send; k++,sp = sp->next, mp = mp->next) {
+ switch (parity) {
+ case EVEN: nsites = sp->n_even; offset = 0; break;
+ case ODD: nsites = sp->n_odd; offset = sp->n_even; break;
+ case EVENODD: nsites = sp->n; offset = 0; break;
+ }
+
+ /* allocate buffer */
+ tpt = (char *)malloc( nsites*size );
+ if(tpt==NULL){printf("NO ROOM for tpt, node %d\n",mynode());exit(1);}
+ mp->flag = SEND_FLAG; /* flag as send */
+ mp->dir = dir;
+ mp->buf = tpt;
+ /* gather data into the buffer */
+
+ idx = sp->sitelist + offset; /* initial offset */
+ for (j=0; jbuf, nsites*size, MPI_BYTE,
+ sp->node, index, MPI_COMM_WORLD, &(mp->mpi) );
+
+ total_gather_data += nsites*size;
+
+ }
+
+ timer_end( &start_gather_timer );
+
+ /* return */
+ return(mbuf);
+}
+
+
+msg_tag * wait_gather( msg_tag *mbuf )
+{
+ MPI_Status status;
+ msg_tag *mp;
+
+ if (mbuf == NULL) return((msg_tag *)NULL);
+ timer_start( &wait_receive_timer );
+ /* wait for all receive messages */
+ for(mp=mbuf; mp != NULL && mp->flag == RECEIVE_FLAG; mp=mp->next) {
+ MPI_Wait( &(mp->mpi), &status );
+ }
+ timer_end( &wait_receive_timer );
+ /* wait for all send messages */
+ timer_start( &wait_send_timer );
+ for( ; mp != NULL && mp->flag == SEND_FLAG; mp=mp->next) {
+ MPI_Wait( &(mp->mpi), &status );
+ /* release the buffer */
+ free( mp->buf );
+ }
+#ifdef TIMERS
+ total_gather_timer.total +=
+ timer_end( &wait_send_timer ) - mbuf->start_time;
+ total_gather_timer.count ++;
+#endif
+
+ /* and free the mbuf */
+ release_msg_tags( mbuf );
+ return((msg_tag *)NULL);
+}
+
+
+void wait_gather_arr( msg_tag* tag[], int ntag )
+{
+ int i;
+ for (i=0; i field[i_otherparity], on
+ * neighb. nodes.
+ * THIS MODIFIES THE LATTICE FIELD field ON OTHERPARITY.
+ * Thus, there is little sense using this on EVENODD (but it is possible)
+ */
+
+msg_tag * start_scatter( field, size, dir, parity )
+ char * field; /* pointer to some latfield */
+ int size; /* size in bytes of the field (eg sizeof(su3_vector))*/
+ int dir; /* direction to push the data, eg XUP - index into
+ neighbor tables */
+ int parity; /* parity of sites from where we push
+ one of EVEN, ODD or EVENODD (EVEN+ODD). */
+{
+ /* local variables */
+ int i,k; /* scratch */
+ int offset; /* number of sites in this receive or send */
+ int nsites;
+ char *tpt; /* temp ptr to buffer */
+ msg_tag *mbuf, *mp; /* list of message tags, to be returned */
+ comlist_struct *cp;
+ send_struct *sp;
+ receive_struct *rp;
+ static int index = MIN_GATHER_INDEX; /* index to identify the operation */
+
+
+ if (dir < 0 || dir >= NDIRS) {
+ printf("No such gather %d, node %d\n",dir,mynode());
+ terminate(1212);
+ }
+
+ /* First, get the rolling index for the operation
+ * This MUST BE HERE even if there's nothing to do, because
+ * somebody else might be doing this!
+ */
+ ++index ; if (index > MAX_GATHER_INDEX) index = MIN_GATHER_INDEX;
+
+ cp = &comlist[dir];
+
+ /* Now if there's nothing to do, return */
+ if( cp->n_send == 0 && cp->n_receive == 0 )
+ return( (msg_tag *) NULL );
+
+ /* allocate a buffer for the msg_tags */
+
+ mbuf = get_msg_tags(cp->n_send + cp->n_receive);
+
+#ifdef TIMERS
+ /* mark start time for this gather */
+ mbuf->start_time = timer_start( &start_gather_timer );
+#endif
+
+ mp=mbuf;
+ /* HANDLE RECEIVES: loop over nodes which will send here
+ * note difference to start_gather; now using to_node, n_send!
+ */
+ for(k=0,sp=cp->to_node; k < cp->n_send; k++,sp = sp->next, mp=mp->next) {
+ switch (parity) {
+ case EVEN: nsites = sp->n_even; offset = 0; break;
+ case ODD: nsites = sp->n_odd; offset = sp->n_even; break;
+ case EVENODD: nsites = sp->n; offset = 0; break;
+ }
+
+ /* allocate buffer */
+ tpt = (char *)malloc( nsites*size );
+ if(tpt==NULL){printf("NO ROOM for tpt, node %d\n",mynode());exit(1);}
+ mp->flag = RECEIVE_FLAG; /* flag as receive */
+ mp->dir = opp_dir(dir); /* using opp_dir here, works with _wait etc. */
+ mp->buf = tpt;
+
+ mp->nsites = nsites;
+ mp->size = size;
+ mp->field = field;
+ mp->sitelist = sp->sitelist + offset; /* list of sites to scatter */
+
+ /* post receive */
+ MPI_Irecv( mp->buf, nsites*size, MPI_BYTE,
+ sp->node, index, MPI_COMM_WORLD, &(mp->mpi) );
+
+ total_gather_data += nsites*size;
+ }
+
+ /* and HANDLE SENDS - note: mp automatically correct */
+ for (i=0, rp=cp->from_node; in_receive;i++, rp=rp->next, mp=mp->next) {
+ /* note--EVEN sites are always first in the list!
+ * Thus, for ODD sites we must change the offset
+ */
+ switch (parity) {
+ case EVEN: nsites = rp->n_even; offset = rp->offset; break;
+ case ODD: nsites = rp->n_odd; offset = rp->offset + rp->n_even; break;
+ case EVENODD: nsites = rp->n; offset = rp->offset; break;
+ }
+
+ mp->flag = SEND_FLAG; /* flag as normal receive */
+ mp->dir = opp_dir(dir);
+ /* and post send -- comes right from spot */
+ MPI_Issend( ((char *)field) + offset*size, nsites*size, MPI_BYTE,
+ rp->node, index, MPI_COMM_WORLD, &(mp->mpi) );
+
+ total_gather_data += nsites*size;
+ }
+ timer_end( &start_gather_timer );
+
+ /* return */
+ return(mbuf);
+}
+
+
+msg_tag * wait_scatter( msg_tag *mbuf )
+{
+ MPI_Status status;
+ msg_tag *mp;
+ int *idx,j;
+ char *tpt;
+
+ if (mbuf == NULL) return((msg_tag *)NULL);
+ timer_start( &wait_receive_timer );
+ /* wait for all receive messages */
+ for(mp=mbuf; mp != NULL && mp->flag == RECEIVE_FLAG; mp=mp->next) {
+ MPI_Wait( &(mp->mpi), &status );
+
+ /* now copy data to right spot */
+ idx = mp->sitelist; /* index list */
+ tpt = mp->buf;
+ for (j=0; jnsites; j++, tpt += mp->size) {
+ memcpy( mp->field + idx[j]*mp->size, tpt, mp->size );
+ }
+ /* and free the field */
+ free( mp->buf );
+ }
+ timer_end( &wait_receive_timer );
+ /* wait for all send messages */
+ timer_start( &wait_send_timer );
+ for( ; mp != NULL && mp->flag == SEND_FLAG; mp=mp->next) {
+ MPI_Wait( &(mp->mpi), &status );
+ }
+#ifdef TIMERS
+ total_gather_timer.total +=
+ timer_end( &wait_send_timer ) - mbuf->start_time;
+ total_gather_timer.count ++;
+#endif
+ /* and free the mbuf */
+ release_msg_tags( mbuf );
+ return((msg_tag *)NULL);
+}
+
+
+/****************************************************************
+ */
+
+
+/* SEND AND RECEIVE FIELD */
+/* send_field is to be called only by the node doing the sending */
+/* get_field is to be called only by the node to which the field was sent */
+void send_field(buf,size,tonode)
+ void *buf; int size,tonode;
+{
+ timer_start( &send_timer );
+ MPI_Send(buf,size,MPI_BYTE,tonode,FIELD_TYPE,MPI_COMM_WORLD);
+ timer_end( &send_timer );
+ total_sent_data += size;
+}
+void receive_field(buf,size)
+ void *buf; int size;
+{
+ MPI_Status status;
+
+ timer_start( &send_timer );
+ MPI_Recv(buf,size,MPI_BYTE,MPI_ANY_SOURCE,FIELD_TYPE,
+ MPI_COMM_WORLD,&status);
+ timer_end( &send_timer );
+ total_sent_data += size;
+}
+
+/* BASIC COMMUNICATIONS FUNCTIONS */
+
+/* Tell what kind of machine we are on */
+static char name[]="MPI (portable)";
+char * machine_type(){
+ return(name);
+}
+
+/* Return my node number */
+int mynode()
+{
+ int node;
+ MPI_Comm_rank( MPI_COMM_WORLD, &node );
+ return(node);
+}
+
+/* Return number of nodes */
+int numnodes()
+{
+ int nodes;
+ MPI_Comm_size( MPI_COMM_WORLD, &nodes );
+ return(nodes);
+}
+
+/* Synchronize all nodes */
+void g_sync()
+{
+ timer_start( &g_sync_timer );
+ MPI_Barrier( MPI_COMM_WORLD );
+ timer_end( &g_sync_timer );
+}
+
+/* Sum float over all nodes. dist=1: distribute to all nodes */
+void g_floatsum( float * fpt, int dist )
+{
+ float work;
+
+ timer_start( &g_sum_timer );
+ if (dist) {
+ MPI_Allreduce( fpt, &work, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD );
+ *fpt = work;
+ } else {
+ MPI_Reduce ( fpt, &work, 1, MPI_FLOAT, MPI_SUM, 0 , MPI_COMM_WORLD );
+ if (this_node == 0) *fpt = work;
+ }
+ timer_end( &g_sum_timer );
+}
+
+/* Sum double over all nodes, and scatter the result */
+void g_doublesum( double * dpt, int dist )
+{
+ double work;
+
+ timer_start( &g_sum_timer );
+ if (dist) {
+ MPI_Allreduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+ *dpt = work;
+ } else {
+ MPI_Reduce ( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, 0 , MPI_COMM_WORLD );
+ if (this_node == 0) *dpt = work;
+ }
+ timer_end( &g_sum_timer );
+}
+
+#define N_ELEM 100
+
+/* Sum a vector of ints over all nodes */
+void g_vecintsum( int *dpt, int n, int dist)
+{
+ int *work, arr[N_ELEM];
+ register int i;
+
+ timer_start( &g_sum_timer );
+ if (n <= N_ELEM) work = arr; else work = (int *)malloc(n*sizeof(int));
+ if (dist) {
+ MPI_Allreduce( dpt, work, n, MPI_INT, MPI_SUM, MPI_COMM_WORLD );
+ for (i=0; i N_ELEM) free(work);
+ timer_end( &g_sum_timer );
+}
+
+
+/* Sum a vector of floats over all nodes */
+void g_vecfloatsum( float *dpt, int nfloats, int dist )
+{
+ float *work, arr[N_ELEM];
+ register int i;
+
+ timer_start( &g_sum_timer );
+
+ if (nfloats <= N_ELEM) work = arr;
+ else work = (float *)malloc(nfloats*sizeof(float));
+
+ if (dist) {
+ MPI_Allreduce( dpt, work, nfloats, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD );
+ for (i=0; i N_ELEM) free(work);
+ timer_end( &g_sum_timer );
+
+ /**
+ work = (float *)malloc(nfloats*sizeof(float));
+ MPI_Allreduce( dpt, work, nfloats, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD );
+ for(i=0;i N_ELEM) free(work);
+ timer_end( &g_sum_timer );
+
+ /**
+ work = (double *)malloc(ndoubles*sizeof(double));
+ MPI_Allreduce( dpt, work, ndoubles, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+ for(i=0;istatus == GATHER_FOLLOW && (a->gathered[dir] ^ parity) == 0) {
+#ifdef CHECK_GATHER_FIELDS
+ /* now looks like is gathered, check if the checkup field has changed */
+ if (((parity & EVEN) && (*((unsigned int *)field) != a->check_even[dir] )) ||
+ ((parity & ODD ) && (*((unsigned int *)(field+size*(node.sites-1)))
+ != a->check_odd[dir] ))) {
+ printf(" #### GATHER CHECK: Forgotten mark_changed() somewhere!\n");
+ if (size < sizeof(int))
+ printf(" Because field size %d < sizeof(int), can be spurious\n",size);
+ else {
+ printf(" Field size %d chars, dir %d parity %d\n",size, dir, parity);
+ }
+ halt(" ###### ");
+ }
+#endif
+ n_gather_avoided++;
+ return(1);
+ }
+ n_gather_done++;
+ return(0);
+}
+
+void gather_status_reset( char *field, int size )
+{
+ struct gather_status_arr *a;
+ /* cast the latfield array */
+ a = (struct gather_status_arr *) (field + size*node.latfield_size);
+ a->status = GATHER_NOT_FOLLOW;
+}
+
+void gather_mark_dirty( char *field, int size, int parity )
+{
+ int dir,p;
+ struct gather_status_arr *a;
+
+ /* cast the latfield array */
+ a = (struct gather_status_arr *) (field + size*node.latfield_size);
+ a->status = GATHER_FOLLOW;
+ p = opp_parity(parity);
+ /* mark opposite parity, because will fetch from there! */
+ /* Remember that need to mark opposite directions too! */
+ for(dir=0; dirgathered[dir] &= (!p);
+}
+
+void gather_mark_gathered( char *field, int size, int dir, int parity )
+{
+ struct gather_status_arr *a;
+
+ /* cast the latfield array */
+ a = (struct gather_status_arr *) (field + size*node.latfield_size);
+ if (a->status == GATHER_FOLLOW) a->gathered[dir] |= parity;
+#ifdef CHECK_GATHER_FIELDS
+ if (parity & EVEN) a->check_even[dir] = *((unsigned int *)field);
+ if (parity & ODD ) a->check_odd[dir] =
+ *((unsigned int *)(field+size*(node.sites-1)));
+#endif
+}
+
+
+/****************************************************************/
+
+
+
+/* version of exit for multinode processes -- kill all nodes */
+void terminate(int status)
+{
+ printf("Termination: node %d, status = %d\n",this_node,status);
+ fflush(stdout);
+ MPI_Abort( MPI_COMM_WORLD, 0);
+ exit(status);
+}
+
+
+
+
+/* clean exit from all nodes */
+void finishrun()
+{
+#ifdef TIMERS
+ report_comm_timers();
+#endif
+
+ if (this_node == 0) {
+ extern int n_gather_done,n_gather_avoided;
+
+ printf(" COMMS from node 0: %d done, %d (%.2g%%) optimized away\n",
+ n_gather_done, n_gather_avoided,
+ 100.0*n_gather_avoided/(n_gather_avoided+n_gather_done));
+ }
+
+ fflush(stdout);
+ fflush(NULL); /* for all open files */
+ MPI_Finalize();
+ exit(0);
+}
+
+
+void report_comm_timers()
+{
+#ifdef TIMERS
+ double tot;
+ if (this_node == 0) {
+ printf(" *************************\n");
+ printf(" MPI communications timers from node 0:\n");
+
+ printf(" start_get: ");
+ timer_report( &start_gather_timer );
+ printf(" waiting send: ");
+ timer_report( &wait_send_timer );
+ printf(" receive: ");
+ timer_report( &wait_receive_timer );
+ printf(" g_sync: ");
+ timer_report( &g_sync_timer );
+ printf(" g_sum: ");
+ timer_report( &g_sum_timer );
+ printf(" broadcast: ");
+ timer_report( &broadcast_timer );
+ printf(" send/receive: ");
+ timer_report( &send_timer );
+ printf(" Total time from gather start -> end (does not count against comm.time)\n");
+ printf(" ");
+ timer_report( &total_gather_timer );
+
+ printf(" Moved data:\n");
+ printf(" * send/receive %g MB, bandwith %g MB/sec\n",
+ total_sent_data*1e-6, total_sent_data*1e-6/send_timer.total);
+
+ printf(" * total pushed/pulled data %g MB\n",total_gather_data*1e-6);
+ printf(" * with optimistic bandwidth %g MB/sec, pessimistic %g MB/s\n",
+ total_gather_data*1e-6/
+ (start_gather_timer.total + wait_send_timer.total +
+ wait_receive_timer.total),
+ total_gather_data*1e-6/total_gather_timer.total);
+
+ tot = start_gather_timer.total + wait_send_timer.total + wait_receive_timer.total +
+ g_sync_timer.total +
+ g_sum_timer.total + broadcast_timer.total + send_timer.total ;
+
+ /* find current time */
+ timer_end( &total_time );
+
+ printf(" Total comm. time %.3g, total time %.3g, comm %.2g%%\n",
+ tot, total_time.total, 100*tot/total_time.total );
+ printf(" ***** \n");
+ }
+#endif
+}
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi_2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..343d49c9570e2a189cd8be8918a3b59a512c04fa
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi_2.c
@@ -0,0 +1,728 @@
+/****************** com_mpi.c *****************************************
+ *
+ * Communications routines, for MPI interface
+ * Modified from the MILC lattice QCD one
+ * KR 2001
+ *
+
+
+
+ g_sync() provides a synchronization point for all nodes.
+ g_floatsum() sums a floating point number over all nodes.
+ g_doublesum() sums a double over all nodes.
+ g_vecdoublesum() sums a vector of doubles over all nodes.
+ g_floatmax() finds maximum of a floating point number over all nodes.
+ g_doublemax() finds maximum of a double over all nodes.
+ broadcast_float() broadcasts a single precision number from
+ node 0 to all nodes.
+ broadcast_double() broadcasts a double precision number
+ send_integer() sends an integer to one other node
+ receive_integer() receives an integer
+ terminate() kills the job on all processors
+
+ start_gather() starts asynchronous sends and receives required
+ to gather neighbors.
+ wait_gather() waits for receives to finish, insuring that the
+ data has actually arrived.
+
+ send_field() sends a field to one other node.
+ receive_field() receives a field from some other node.
+*/
+
+/* load in definitions, variables etc. */
+
+#include "comdefs.h"
+#include "generic.h"
+#include "timers.h" /* includes comm timer calculators */
+
+#ifdef TIMERS
+static timer_type start_gather_timer, wait_send_timer, wait_receive_timer, g_sync_timer,
+ g_sum_timer, broadcast_timer, send_timer, total_time;
+#endif
+static double mpi_sent_data = 0.0;
+
+
+#define MIN_GATHER_INDEX 100
+#define MAX_GATHER_INDEX 7100 /* allows 7000 concurrent gathers */
+#define FIELD_TYPE 11 /* used in send/receive field */
+#define SEND_INTEGER_TYPE 12 /* used in send/receive int */
+
+#define SEND_FLAG 1 /* flags for marking msg_tags */
+#define RECEIVE_FLAG 2
+
+extern comlist_struct *comlist; /* the comlist variables in layout.c */
+
+/************************************************************************/
+
+/* get all msg_tags in a single array, avoid allocating
+ * small bits and pieces of messages
+ */
+
+#define N_MSG_TAG 40
+
+static msg_tag msg_tag_arr[N_MSG_TAG];
+static msg_tag *msg_tag_free, *msg_tag_last;
+
+void init_msg_tags()
+{
+ int i;
+
+ for (i=0; inext;
+ }
+
+ /* p points to the last in the list */
+ msg_tag_free = p->next;
+ p->next = NULL;
+ return(r);
+}
+
+
+void release_msg_tags(msg_tag *tp)
+{
+ msg_tag *p;
+
+ for (p=tp; p->next != NULL; p=p->next) ;
+ p->next = msg_tag_free;
+ msg_tag_free = tp;
+}
+
+/************************************************************************/
+
+
+/* Machine initialization */
+#include
+void initialize_machine() {
+/* MPI_Init(&argc,&argv); */
+
+#ifdef TIMERS
+ timer_reset( &total_time );
+ timer_start( &total_time );
+
+ timer_reset( &start_gather_timer );
+ timer_reset( &wait_send_timer );
+ timer_reset( &wait_receive_timer );
+ timer_reset( &g_sync_timer );
+ timer_reset( &g_sum_timer );
+ timer_reset( &broadcast_timer );
+ timer_reset( &send_timer );
+#endif
+
+ init_msg_tags();
+
+}
+
+
+/************************************************************************/
+
+/* this formats the wait_array, used by forallsites_waitA()
+ * should be made as fast as possible!
+ */
+
+int setup_wait_arr( unsigned char *wait_arr, msg_tag* tag_out[],
+ msg_tag* tag_in[], int ntag )
+{
+ register int i,j;
+ int nt,dir[NA_MAX];
+
+ if (ntag > NA_MAX) halt("Error in forallsites_waitA: too many gathers");
+ for (nt=i=0; idir;
+ nt++;
+ }
+ tag_out[nt] = (msg_tag *)NULL; /* needed for the last loop */
+
+ forallsites(i) {
+ wait_arr[i] = 0; /* basic, no wait */
+ for (j=0; j= node.sites ) wait_arr[i] = j+1;
+ }
+ return( nt );
+}
+
+
+/************************************************************************/
+/* GATHER ROUTINES */
+/* start_gather() returns a pointer to a list of msg_tag's, which
+ be used as input to subsequent wait_gather().
+
+ This list contains msg_tags for all receive buffers, followed by
+ end flag.
+
+ If no messages at all are required, the routine will return NULL.
+ msg_buf==NULL should be a reliable indicator of no message.
+
+ usage: tag = start_gather( source, size, direction, parity )
+*/
+
+
+
+msg_tag* start_gather( field, size, dir, parity )
+ /* arguments */
+ char * field; /* pointer to some latfield */
+ int size; /* size in bytes of the field (eg sizeof(su3_vector))*/
+ int dir; /* direction to gather from. eg XUP - index into
+ neighbor tables */
+ int parity; /* parity of sites whose neighbors we gather.
+ one of EVEN, ODD or EVENODD (EVEN+ODD). */
+{
+ /* local variables */
+ int i,j,k,nodepar; /* scratch */
+ int offset; /* number of sites in this receive or send */
+ int *idx; /* index array pointer */
+ int nsites;
+ char *tpt; /* temp ptr to buffer */
+ msg_tag *mbuf, *mp; /* list of message tags, to be returned */
+ comlist_struct *cp;
+ send_struct *sp;
+ receive_struct *rp;
+ static int index = MIN_GATHER_INDEX; /* index to identify the operation */
+
+
+ if (dir < 0 || dir >= NDIRS) {
+ printf("No such gather %d, node %d\n",dir,mynode());
+ terminate(1212);
+ }
+
+ /* First, get the rolling index for the operation
+ * This MUST BE HERE even if there's nothing to do, because
+ * somebody else might be doing this!
+ */
+ ++index ; if (index > MAX_GATHER_INDEX) index = MIN_GATHER_INDEX;
+
+ cp = &comlist[dir];
+
+ /* Now if there's nothing to do, return - CHECK IF GATHERED */
+ if( ( cp->n_send == 0 && cp->n_receive == 0 )
+ || is_already_gathered( field, size, dir, parity) )
+ return( (msg_tag *) NULL );
+
+ /* mark gathered, if needed */
+ gather_mark_gathered( field, size, dir, parity );
+
+ timer_start( &start_gather_timer );
+
+ /* allocate a buffer for the msg_tags. This is dynamically allocated
+ because there may be an arbitrary number of gathers in progress
+ in any direction. SIZE = n_msgs sent+received, for end flag */
+
+ /* mbuf = (msg_tag *)malloc((cp->n_send + cp->n_receive + 1)*sizeof(msg_tag) );
+ if(mbuf==NULL){
+ printf("No room for mbuf, node %d\n",mynode());
+ terminate(1212);
+ }
+ */
+
+ mbuf = get_msg_tags(cp->n_send + cp->n_receive);
+
+ /* Loop over node parity */
+ forbothparities(nodepar) {
+
+ mp=mbuf;
+ /* HANDLE RECEIVES: loop over nodes which will send here */
+ for (i=0, rp=cp->from_node; in_receive;i++, rp=rp->next, mp=mp->next) {
+
+ if (nodepar == node.parity) {
+ MPI_Status status;
+
+ /* note--neighbors of EVEN sites are always first in the list!
+ * Thus, for ODD sites we must change the offset
+ */
+ switch (parity) {
+ case EVEN: nsites = rp->n_even; offset = rp->offset; break;
+ case ODD: nsites = rp->n_odd; offset = rp->offset + rp->n_even; break;
+ case EVENODD: nsites = rp->n; offset = rp->offset; break;
+ }
+
+ mp->flag = RECEIVE_FLAG; /* flag as normal receive */
+ mp->dir = dir;
+ /* and post receive -- comes right on spot */
+ MPI_Recv( ((char *)field) + offset*size, nsites*size, MPI_BYTE,
+ rp->node, index, MPI_COMM_WORLD, &status );
+ }
+ }
+
+ /* HANDLE SENDS - note: mp automatically correct */
+ for(k=0,sp=cp->to_node; k < cp->n_send; k++,sp = sp->next, mp = mp->next) {
+ if (nodepar != node.parity) {
+ switch (parity) {
+ case EVEN: nsites = sp->n_even; offset = 0; break;
+ case ODD: nsites = sp->n_odd; offset = sp->n_even; break;
+ case EVENODD: nsites = sp->n; offset = 0; break;
+ }
+
+ /* allocate buffer */
+ tpt = (char *)malloc( nsites*size );
+ if(tpt==NULL){printf("NO ROOM for tpt, node %d\n",mynode());exit(1);}
+ mp->flag = SEND_FLAG; /* flag as send */
+ mp->dir = dir;
+ mp->buf = tpt;
+ /* gather data into the buffer */
+
+ idx = sp->sitelist + offset; /* initial offset */
+ for (j=0; jbuf, nsites*size, MPI_BYTE,
+ sp->node, index, MPI_COMM_WORLD );
+
+ free( mp->buf );
+ }
+ }
+
+ }
+
+ timer_end( &start_gather_timer );
+
+ release_msg_tags( mbuf );
+
+ /* return */
+ return((msg_tag*)NULL);
+}
+
+
+msg_tag * wait_gather( msg_tag *mbuf )
+{
+ MPI_Status status;
+ msg_tag *mp;
+
+ if (mbuf == NULL) return((msg_tag *)NULL);
+ timer_start( &wait_receive_timer );
+ /* wait for all receive messages */
+ for(mp=mbuf; mp != NULL && mp->flag == RECEIVE_FLAG; mp=mp->next) {
+ MPI_Wait( &(mp->mpi), &status );
+ }
+ timer_end( &wait_receive_timer );
+ /* wait for all send messages */
+ timer_start( &wait_send_timer );
+#ifndef SEND_TEST
+ for( ; mp != NULL && mp->flag == SEND_FLAG; mp=mp->next) {
+ MPI_Wait( &(mp->mpi), &status );
+ /* release the buffer */
+ free( mp->buf );
+ }
+#endif
+ timer_end( &wait_send_timer );
+ /* and free the mbuf */
+ release_msg_tags( mbuf );
+ return((msg_tag *)NULL);
+}
+
+
+/****************************************************************
+ */
+
+
+/* SEND AND RECEIVE FIELD */
+/* send_field is to be called only by the node doing the sending */
+/* get_field is to be called only by the node to which the field was sent */
+void send_field(buf,size,tonode)
+ void *buf; int size,tonode;
+{
+ timer_start( &send_timer );
+ MPI_Send(buf,size,MPI_BYTE,tonode,FIELD_TYPE,MPI_COMM_WORLD);
+ timer_end( &send_timer );
+ mpi_sent_data += size;
+}
+void receive_field(buf,size)
+ void *buf; int size;
+{
+ MPI_Status status;
+
+ timer_start( &send_timer );
+ MPI_Recv(buf,size,MPI_BYTE,MPI_ANY_SOURCE,FIELD_TYPE,
+ MPI_COMM_WORLD,&status);
+ timer_end( &send_timer );
+ mpi_sent_data += size;
+}
+
+/* BASIC COMMUNICATIONS FUNCTIONS */
+
+/* Tell what kind of machine we are on */
+static char name[]="MPI (portable)";
+char * machine_type(){
+ return(name);
+}
+
+/* Return my node number */
+int mynode()
+{
+ int node;
+ MPI_Comm_rank( MPI_COMM_WORLD, &node );
+ return(node);
+}
+
+/* Return number of nodes */
+int numnodes()
+{
+ int nodes;
+ MPI_Comm_size( MPI_COMM_WORLD, &nodes );
+ return(nodes);
+}
+
+/* Synchronize all nodes */
+void g_sync()
+{
+ timer_start( &g_sync_timer );
+ MPI_Barrier( MPI_COMM_WORLD );
+ timer_end( &g_sync_timer );
+}
+
+/* Sum float over all nodes to node 0 */
+void g_floatsum( float * fpt )
+{
+ float work;
+
+ timer_start( &g_sum_timer );
+ if (this_node == 0) {
+ MPI_Reduce( fpt, &work, 1, MPI_FLOAT, MPI_SUM, 0 , MPI_COMM_WORLD );
+ *fpt = work;
+ } else {
+ MPI_Reduce( fpt, &work, 1, MPI_FLOAT, MPI_SUM, 0 , MPI_COMM_WORLD );
+ }
+ timer_end( &g_sum_timer );
+}
+
+/* Sum double over all nodes */
+void g_doublesum( double * dpt )
+{
+ double work;
+
+ timer_start( &g_sum_timer );
+ if (this_node == 0) {
+ MPI_Reduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, 0 , MPI_COMM_WORLD );
+ *dpt = work;
+ } else {
+ MPI_Reduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, 0 , MPI_COMM_WORLD );
+ }
+ timer_end( &g_sum_timer );
+}
+
+/* Sum double over all nodes, and scatter the result */
+void g_doublesum_scatter( double * dpt )
+{
+ double work;
+
+ timer_start( &g_sum_timer );
+ MPI_Allreduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+ *dpt = work;
+ timer_end( &g_sum_timer );
+}
+
+
+
+/* Sum a vector of doubles over all nodes */
+void g_vecintsum( dpt, n )
+ int *dpt; int n;
+{
+ register int *work;
+ register int i;
+
+ timer_start( &g_sum_timer );
+ work = (int *)malloc(n*sizeof(int));
+ if (this_node == 0) {
+ MPI_Reduce( dpt, work, n, MPI_INT, MPI_SUM, 0 , MPI_COMM_WORLD );
+ for (i=0; istatus == GATHER_FOLLOW && (a->gathered[dir] ^ parity) == 0) {
+#ifdef CHECK_GATHER_FIELDS
+ /* now looks like is gathered, check if the checkup field has changed */
+ if (((parity & EVEN) && (*((unsigned int *)field) != a->check_even[dir] )) ||
+ ((parity & ODD ) && (*((unsigned int *)(field+size*(node.sites-1)))
+ != a->check_odd[dir] ))) {
+ printf(" #### GATHER CHECK: Forgotten mark_changed() somewhere!\n");
+ if (size < sizeof(int))
+ printf(" Because field size %d < sizeof(int), can be spurious\n",size);
+ else {
+ printf(" Field size %d chars, dir %d parity %d\n",size, dir, parity);
+ }
+ halt(" ###### ");
+ }
+#endif
+ n_gather_avoided++;
+ return(1);
+ }
+ n_gather_done++;
+ return(0);
+}
+
+void gather_status_reset( char *field, int size )
+{
+ struct gather_status_arr *a;
+ /* cast the latfield array */
+ a = (struct gather_status_arr *) (field + size*node.latfield_size);
+ a->status = GATHER_NOT_FOLLOW;
+}
+
+void gather_mark_dirty( char *field, int size, int parity )
+{
+ int dir,p;
+ struct gather_status_arr *a;
+
+ /* cast the latfield array */
+ a = (struct gather_status_arr *) (field + size*node.latfield_size);
+ a->status = GATHER_FOLLOW;
+ p = opp_parity(parity);
+ /* mark opposite parity, because will fetch from there! */
+ /* Remember that need to mark opposite directions too! */
+ for(dir=0; dir